## Notebook: text_analysis_voting_classifier.ipynb

This notebook is used for building a voting classifier from the best performing version of the other models.

**Uses updated dataset with text analysis.**

In [31]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from warnings import filterwarnings

### Data Preparation

In [32]:
data = pd.read_csv('../datasets/MIB/mib_processed_text_standardized.csv')
X_labels = list(data.columns)
Y_label = 'identification'

# use all except identification for inputs
X = data.drop(columns=[Y_label])
y = data[Y_label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

### Classifier Initialization

**Num estimators = 4**

**Random forest, multilayer perceptron, naive bayes, logistic regression**

In [33]:
rf = RandomForestClassifier(n_estimators=5, max_depth=6, random_state=0)
mlp = MLPClassifier(solver='adam', hidden_layer_sizes=(20, 20), alpha=1e-4, activation='relu', max_iter=300, random_state=1)
gnb = GaussianNB()
lr = LogisticRegression(solver='liblinear', penalty='l1', random_state=0)

### Voting Classifier

**Voting: hard**

**Random forest, multilayer perceptron, naive bayes**

In [34]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_hard = VotingClassifier(voting='hard', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('naive_bayes', gnb)
])

vc_hard_pred = vc_hard.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_hard_pred, y_test)

0.9859108781127129

In [35]:
print(classification_report(vc_hard_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2041
       human       0.98      0.98      0.98      1011

    accuracy                           0.99      3052
   macro avg       0.98      0.98      0.98      3052
weighted avg       0.99      0.99      0.99      3052



### Voting Classifier

**Voting: hard**

**Random forest, multilayer perceptron, logistic regression**

In [36]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_hard = VotingClassifier(voting='hard', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('logistic_regression', lr)
])

vc_hard_pred = vc_hard.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_hard_pred, y_test)

0.9832896461336829

In [37]:
print(classification_report(vc_hard_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2051
       human       0.97      0.98      0.97      1001

    accuracy                           0.98      3052
   macro avg       0.98      0.98      0.98      3052
weighted avg       0.98      0.98      0.98      3052



### Voting Classifier

**Voting: hard**

**Random forest, multilayer perceptron, naive bayes, logistic regression**

In [38]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_hard = VotingClassifier(voting='hard', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('naive_bayes', gnb),
    ('logistic_regression', lr)
])

vc_hard_pred = vc_hard.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_hard_pred, y_test)

0.9819790301441678

In [39]:
print(classification_report(vc_hard_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.98      0.99      2063
       human       0.96      0.98      0.97       989

    accuracy                           0.98      3052
   macro avg       0.98      0.98      0.98      3052
weighted avg       0.98      0.98      0.98      3052



### Voting Classifier

**Voting: soft**

**Random forest, multilayer perceptron, naive bayes**

In [40]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_soft = VotingClassifier(voting='soft', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('naive_bayes', gnb)
])

vc_soft_pred = vc_soft.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_soft_pred, y_test)

0.9875491480996068

In [41]:
print(classification_report(vc_soft_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2038
       human       0.99      0.98      0.98      1014

    accuracy                           0.99      3052
   macro avg       0.99      0.98      0.99      3052
weighted avg       0.99      0.99      0.99      3052



### Voting Classifier

**Voting: soft**

**Random forest, multilayer perceptron, logistic regression**

In [42]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_soft = VotingClassifier(voting='soft', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('logistic_regression', lr)
])

vc_soft_pred = vc_soft.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_soft_pred, y_test)

0.9839449541284404

In [43]:
print(classification_report(vc_soft_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2051
       human       0.97      0.98      0.98      1001

    accuracy                           0.98      3052
   macro avg       0.98      0.98      0.98      3052
weighted avg       0.98      0.98      0.98      3052



### Voting Classifier

**Voting: soft**

**Random forest, multilayer perceptron, naive bayes, logistic regression**

In [44]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_soft = VotingClassifier(voting='soft', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('naive_bayes', gnb),
    ('logistic_regression', lr)
])

vc_soft_pred = vc_soft.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_soft_pred, y_test)

0.981651376146789

In [45]:
print(classification_report(vc_soft_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2042
       human       0.97      0.97      0.97      1010

    accuracy                           0.98      3052
   macro avg       0.98      0.98      0.98      3052
weighted avg       0.98      0.98      0.98      3052

