## Data Preperation

In [40]:
import pandas as pd
from sklearn.preprocessing import scale
from keras.utils import to_categorical

text_features_df = pd.read_hdf('../data/weebit/weebit.h5', 'text_features_df')
train_features_df = pd.read_hdf('../data/weebit/weebit.h5', 'train_features_df')
test_features_df = pd.read_hdf('../data/weebit/weebit.h5', 'test_features_df')

features_mask = text_features_df.columns.str.startswith('feature_')
y_mask = text_features_df.columns == 'y'
features_y_mask = features_mask | y_mask

X_all = text_features_df.loc[:, features_mask]
y_all = text_features_df['y']
y_all_onehot = to_categorical(y_all)

X_train = train_features_df.loc[:, features_mask]
y_train = train_features_df['y']
y_train_onehot = to_categorical(y_train)

X_test = test_features_df.loc[:, features_mask]
y_test = test_features_df['y']
y_test_onehot = to_categorical(y_test)

X_all = scale(X_all)
X_train = scale(X_train)
X_test = scale(X_test)

In [70]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

## Logistic Regression

In [72]:
from sklearn.linear_model import LogisticRegressionCV

log_reg = LogisticRegressionCV(random_state=42)
log_reg.fit(X_train, y_train)
print('Acc {:0.3f} {:0.3f}'.format(log_reg.score(X_train, y_train) , log_reg.score(X_test, y_test)))

Acc 0.669 0.657


## SVM

In [5]:
from sklearn.model_selection import GridSearchCV
Cs = log_reg.Cs_

In [14]:
from sklearn.svm import LinearSVC

linear_svm = GridSearchCV(LinearSVC(random_state=42), {'C': Cs})
linear_svm.fit(X_train, y_train)
print('C {:0.3f}'.format(linear_svm.best_params_['C']))
print('Acc {:0.3f} {:0.3f}'.format(linear_svm.score(X_train, y_train) , linear_svm.score(X_test, y_test)))

C 0.046
Acc 0.662 0.658


In [19]:
from sklearn.svm import SVC

rbf_svm = GridSearchCV(SVC(random_state=42), {'C': Cs})
rbf_svm.fit(X_train, y_train)
print('C {:0.3f}'.format(rbf_svm.best_params_['C']))
print('Acc {:0.3f} {:0.3f}'.format(rbf_svm.score(X_train, y_train) , rbf_svm.score(X_test, y_test)))

C 2.783
Acc 0.886 0.687


In [20]:
poly_svm = GridSearchCV(SVC(random_state=42), {'C': Cs})
poly_svm.fit(X_train, y_train)
print('C {:0.3f}'.format(poly_svm.best_params_['C']))
print('Acc {:0.3f} {:0.3f}'.format(poly_svm.score(X_train, y_train) , poly_svm.score(X_test, y_test)))

C 2.783
Acc 0.886 0.687


In [21]:
sigmoid_svm = GridSearchCV(SVC(random_state=42), {'C': Cs})
sigmoid_svm.fit(X_train, y_train)
print('C {:0.3f}'.format(sigmoid_svm.best_params_['C']))
print('Acc {:0.3f} {:0.3f}'.format(sigmoid_svm.score(X_train, y_train) , sigmoid_svm.score(X_test, y_test)))

C 2.783
Acc 0.886 0.687


## Trees

In [28]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [31]:
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)
#print('C {:0.3f}'.format(sigmoid_svm.best_params_['C']))
print('Acc {:0.3f} {:0.3f}'.format(random_forest.score(X_train, y_train) , random_forest.score(X_test, y_test)))

Acc 0.989 0.606


In [30]:
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
#print('C {:0.3f}'.format(sigmoid_svm.best_params_['C']))
print('Acc {:0.3f} {:0.3f}'.format(gb.score(X_train, y_train) , gb.score(X_test, y_test)))

Acc 0.882 0.679


## MLP

In [89]:
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers

n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(128, input_shape=(n_features,), activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(5, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01)))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

print(model.summary())

print(model.fit(X_train, y_train_onehot, epochs=30, validation_split=0.1))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 128)               4224      
_________________________________________________________________
dense_36 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_37 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_38 (Dense)             (None, 5)                 645       
Total params: 37,893
Trainable params: 37,893
Non-trainable params: 0
_________________________________________________________________
None
Train on 2617 samples, validate on 291 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30

In [90]:
sum(model.predict_classes(X_test) == y_test) / len(y_test)

0.6456043956043956