In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import datasets
import numpy as np
import pandas as pd

train_set8_y = pd.read_pickle("train_y_features8.pkl")
train_set8_x = pd.read_pickle("train_x_features8.pkl")

# Initializing Classifiers
clf1 = LogisticRegression(random_state=1, class_weight = 'balanced')
clf2 = RandomForestClassifier(random_state=1, class_weight = 'balanced')
clf3 = GaussianNB()
clf4 = SVC(class_weight = 'balanced')

# Loading some example data

train_x_set8 = train_set8_x.loc[:, ['tfidf_cosine', 'tfidf_bigram_cosine', 'tfidf_trigram_cosine', 'count_vec_cosine', 'lda_50topics_cosine', 'lda_100topics_cosine', 'lda_200topics_cosine']].values
train_y_set8 = train_set8_y['label'].values

In [9]:
test_x_set8 = pd.read_pickle("test_x_features8.pkl")
test_y_set8 = pd.read_pickle("test_y_features8.pkl")
test_x_set8_xgboost = test_x_set8.loc[:, ['tfidf_cosine', 'tfidf_bigram_cosine', 'tfidf_trigram_cosine', 'count_vec_cosine', 'lda_50topics_cosine', 'lda_100topics_cosine', 'lda_200topics_cosine']].values
test_y_set8_xgboost = test_y_set8['label'].values

In [10]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [11]:
clf1.fit(train_x_set8, train_y_set8)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [12]:
predictions1 = clf1.predict(test_x_set8)

In [13]:
print('Logistic Regression:')
print('Accuracy:', accuracy_score(test_y_set8, predictions1))
print('F1 score:', f1_score(test_y_set8, predictions1))
print('Recall:', recall_score(test_y_set8, predictions1))
print('Precision:', precision_score(test_y_set8, predictions1))

Logistic Regression:
Accuracy: 0.8156813999683155
F1 score: 0.030510864688161016
Recall: 0.7168674698795181
Precision: 0.015587137337088218


In [14]:
clf2.fit(train_x_set8, train_y_set8)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [16]:
predictions2 = clf2.predict(test_x_set8)

In [17]:
print('Random Forest:')
print('Accuracy:', accuracy_score(test_y_set8, predictions2))
print('F1 score:', f1_score(test_y_set8, predictions2))
print('Recall:', recall_score(test_y_set8, predictions2))
print('Precision:', precision_score(test_y_set8, predictions2))

Random Forest:
Accuracy: 0.9935412325278153
F1 score: 0.09863945578231292
Recall: 0.08734939759036145
Precision: 0.11328125


In [18]:
clf3.fit(train_x_set8, train_y_set8) 

GaussianNB(priors=None)

In [19]:
predictions3 = clf3.predict(test_x_set8)

In [20]:
print('GaussianNB:')
print('Accuracy:', accuracy_score(test_y_set8, predictions3))
print('F1 score:', f1_score(test_y_set8, predictions3))
print('Recall:', recall_score(test_y_set8, predictions3))
print('Precision:', precision_score(test_y_set8, predictions3))

GaussianNB:
Accuracy: 0.9373743282272511
F1 score: 0.05480963766783152
Recall: 0.44879518072289154
Precision: 0.02918707149853085


In [21]:
clf4.fit(train_x_set8, train_y_set8) 

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [22]:
predictions4 = clf4.predict(test_x_set8)

In [23]:
print('SVM:')
print('Accuracy:', accuracy_score(test_y_set8, predictions4))
print('F1 score:', f1_score(test_y_set8, predictions4))
print('Recall:', recall_score(test_y_set8, predictions4))
print('Precision:', precision_score(test_y_set8, predictions4))

SVM:
Accuracy: 0.826612559256145
F1 score: 0.032635300516725585
Recall: 0.7228915662650602
Precision: 0.01669449081803005


In [24]:
from xgboost import XGBClassifier



In [25]:
# fit xgboost model no training data
model = XGBClassifier(class_weight = 'balanced')
model.fit(train_x_set8, train_y_set8)

TypeError: __init__() got an unexpected keyword argument 'class_weight'

In [None]:
predictions5 = model.predict(test_x_set8_xgboost)

In [None]:
print('XGBoost:')
print('Accuracy:', accuracy_score(test_y_set8_xgboost, predictions5))
print('F1 score:', f1_score(test_y_set8_xgboost, predictions5))
print('Recall:', recall_score(test_y_set8_xgboost, predictions5))
print('Precision:', precision_score(test_y_set8_xgboost, predictions5))

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1, class_weight = 'balanced')

In [None]:
mlp.fit(train_x_set8, train_y_set8)

In [None]:
predictions6 = mlp.predict(test_x_set8_xgboost)

In [None]:
print('MLP - multi-layer perceptron:')
print('Accuracy:', accuracy_score(test_y_set8_xgboost, predictions6))
print('F1 score:', f1_score(test_y_set8_xgboost, predictions6))
print('Recall:', recall_score(test_y_set8_xgboost, predictions6))
print('Precision:', precision_score(test_y_set8_xgboost, predictions6))