<a href="https://colab.research.google.com/github/rostro36/Vernehmlassungen/blob/master/Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict decision time

## Load data and preprocess

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
URL=r'https://raw.githubusercontent.com/rostro36/Vernehmlassungen/master/laws.csv'
df=pd.read_csv(URL)

In [2]:
df.head(100)

Unnamed: 0.1,Unnamed: 0,index,Department,Title,Text,Vernehmlassung_Day,Vernehmlassung_Month,Vernehmlassung_Year,Behoerde,SR_Links,SR_Numbers,Link_count,index.1,Months_until_decision,Decision_day,Decision_month,Decision_year,Months_until_accept,Accept_day,Accept_month,Accept_year
0,0,0,BK,Revision des Bundesgesetzes über die politisch...,,28,2,1993,Bundesrat,,,,0,,,,,,,,
1,1,1,EDA,Beitritt der Schweiz zum UNO-Übereinkommen übe...,,15,12,1992,Bundesrat,,,,1,,,,,,,,
2,2,2,EDI,Verordnung über den Wald (Waldverordnung),,16,3,1992,Bundesrat,,,,2,,,,,,,,
3,3,3,EDI,Beitritt der Schweiz zu drei internationalen B...,,15,6,1992,Bundesrat,,,,3,,,,,,,,
4,4,4,EDI,Bundesbeschluss über befristete Massnahmen geg...,,30,6,1992,Bundesrat,,,,4,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,13,EFD,Verzinsung von Verrechnungssteuerguthaben (Var...,Es stehen zwei Varianten von Gesetzesentwürfen...,15,6,1995,Bundesrat,,,,95,,,,,,,,
96,96,14,EFD,Bundesbeschluss über die Anordnung einer allge...,,30,6,1995,Bundesrat,,,,96,,,,,,,,
97,97,15,EFD,Verordnung über das öffentliche Beschaffungswesen,,18,9,1995,Bundesrat,,,,97,,,,,,,,
98,98,16,EFD,Finanzierung des öffentlichen Verkehrs,"Der Bundesrat sieht vor, die drei Finanzierung...",15,11,1995,Bundesrat,,,,98,,,,,,,,


In [3]:
df =df.dropna(subset=['SR_Links', 'Months_until_accept', 'Months_until_decision'])
no_text = df.drop(columns=['Title','Text', 'Unnamed: 0', 'index', 'index.1', 'SR_Links', 'SR_Numbers', 'Months_until_accept', 'Accept_day', 'Accept_month','Accept_year']).reset_index()
encoder = OneHotEncoder(sparse=False)
encoded=encoder.fit_transform(no_text[['Behoerde', 'Department']])
encoded = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())
no_text=no_text.drop(columns=['Behoerde', 'Department', 'index'])
no_text=pd.concat([no_text, encoded], axis=1)
no_text.tail()

Unnamed: 0,Vernehmlassung_Day,Vernehmlassung_Month,Vernehmlassung_Year,Link_count,Months_until_decision,Decision_day,Decision_month,Decision_year,Behoerde_Behördenkommission,Behoerde_Bundesrat,Behoerde_Bundesversammlung,Behoerde_Departement oder Bundeskanzlei,Behoerde_Einheit der zentralen oder dezentralen Bundesverwaltung,Behoerde_Parlamentarische Kommissionen,Department_BK,Department_EDA,Department_EDI,Department_EFD,Department_EJPD,Department_EVD,Department_Parl.,Department_UVEK,Department_VBS,Department_WBF,Department_other
937,6,7,2015,2.0,15.066667,30.0,9.0,2016.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
938,14,8,2015,1.0,10.266667,17.0,6.0,2016.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
939,18,12,2015,1.0,6.066667,17.0,6.0,2016.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
940,15,3,2016,1.0,3.133333,17.0,6.0,2016.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
941,21,3,2016,1.0,12.033333,17.0,3.0,2017.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Overfitting test
Check if everything works by already giving the decision data, which we want to predict.

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from tensorflow import keras
import tensorflow as tf


targets= no_text['Months_until_decision']
features = no_text.drop(columns=['Months_until_decision'])
features_training, features_test, targets_training, targets_test = train_test_split(features, targets, test_size=0.2, random_state=42)

parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'C':[0.1, 1, 10]}
clf = GridSearchCV(SVR(), parameters, n_jobs=-1, cv=5, verbose=3, scoring='neg_mean_squared_error')
clf.fit(features_training, targets_training)
predicted_test = clf.predict(features_test)
print(mean_squared_error(targets_test, predicted_test))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
0.0025745685565871463


In [5]:
tf.random.set_seed(12)
model=keras.Sequential([keras.layers.Dense(10, activation='ReLU', input_shape=(24,)), keras.layers.Dropout(0.1), keras.layers.Dense(1, activation='ReLU')])
model.build()
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')
model.fit(features_training, targets_training, batch_size=1, epochs=10)
predicted_test=model.predict(features_test)
print(mean_squared_error(targets_test, predicted_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
965.7047648303027


## Not overfitting
Excluding the exact decision date.

### SVM

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

targets= no_text['Months_until_decision']
features = no_text.drop(columns=['Months_until_decision', 'Decision_day', 'Decision_month', 'Decision_year'])
features_training, features_test, targets_training, targets_test = train_test_split(features, targets, test_size=0.2, random_state=42)

parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'C':[0.1, 1, 10]}
clf = GridSearchCV(SVR(), parameters, n_jobs=-1, cv=5, verbose=3)
clf.fit(features_training, targets_training)
print('SVM')
print(clf.best_params_)
predicted_test = clf.predict(features_test)
print(mean_squared_error(predicted_test,targets_test))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
SVM
{'C': 10, 'kernel': 'linear'}
1349.3433860982645


### Decision tree

In [7]:
parameters = {'max_depth':(3, 6, 12, 25, None), 'min_samples_leaf':[1,3,7]}
clf = GridSearchCV(DecisionTreeRegressor(random_state=12), parameters, n_jobs=-1, cv=5, verbose=3)
clf.fit(features_training, targets_training)
print('Tree')
print(clf.best_params_)
predicted_test = clf.predict(features_test)
print(mean_squared_error(predicted_test,targets_test))

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Tree
{'max_depth': 3, 'min_samples_leaf': 7}
1184.0546846776547


### Nearest Neighbour

In [8]:
parameters = {'n_neighbors':(3, 5, 7, 11), 'weights':['uniform', 'distance']}
clf = GridSearchCV(KNeighborsRegressor(), parameters, n_jobs=-1, cv=5, verbose=4)
clf.fit(features_training, targets_training)
print('Nearest Neighbour')
print(clf.best_params_)
predicted_test = clf.predict(features_test)
print(mean_squared_error(predicted_test,targets_test))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Nearest Neighbour
{'n_neighbors': 11, 'weights': 'uniform'}
1223.551544837504
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Random Forest
{'max_depth': 3, 'min_samples_leaf': 3, 'n_estimators': 100}
1117.5959260251711
Fitting 5 folds for each of 180 candidates, totalling 900 fits
Boosting
{'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'n_estimators': 10}
1146.7757638073708


### Random Forest

In [9]:
parameters = {'n_estimators':(10, 50, 100), 'max_depth':(3, 6, 12, 25, None), 'min_samples_leaf':[1,3,7]}
clf = GridSearchCV(RandomForestRegressor(random_state=12), parameters, n_jobs=-1, cv=5, verbose=4)
clf.fit(features_training, targets_training)
print('Random Forest')
print(clf.best_params_)
predicted_test = clf.predict(features_test)
print(mean_squared_error(predicted_test,targets_test))

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Random Forest
{'max_depth': 3, 'min_samples_leaf': 3, 'n_estimators': 100}
1117.5959260251711


### Gradient Boosting

In [10]:
parameters = {'learning_rate':(0.001, 0.01, 0.1, 0.4), 'n_estimators':(10, 50, 100), 'max_depth':(3, 6, 12, 25, None), 'min_samples_leaf':[1,3,7]}
clf = GridSearchCV(GradientBoostingRegressor(random_state=12), parameters, n_jobs=-1, cv=5, verbose=4)
clf.fit(features_training, targets_training)
print('Boosting')
print(clf.best_params_)
predicted_test = clf.predict(features_test)
print(mean_squared_error(predicted_test,targets_test))

Fitting 5 folds for each of 180 candidates, totalling 900 fits
Boosting
{'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'n_estimators': 10}
1146.7757638073708
