## Notebook: voting_classifier.ipynb

This notebook is used for building a voting classifier from the best performing version of the other models.

In [53]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from warnings import filterwarnings

### Data Preparation

In [54]:
X_labels = ['geo_enabled','default_profile','default_profile_image','followers_count','friends_count','favourites_count','listed_count','retweet_post_percent','reply_post_percent','avg_hashtags','avg_urls','avg_mentions','avg_retweets_cnt','avg_reply_cnt']
Y_label = 'identification'

data = pd.read_csv('../datasets/MIB/mib_processed.csv')
X = data[X_labels]
y = data[Y_label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

### Classifier Initialization

**Num estimators = 4**

**Random forest, multilayer perceptron, naive bayes, logistic regression**

In [55]:
rf = RandomForestClassifier(n_estimators=5, max_depth=6, random_state=0)
mlp = MLPClassifier(solver='adam', hidden_layer_sizes=(20, 20), alpha=1e-4, activation='relu', max_iter=300, random_state=1)
gnb = GaussianNB()
lr = LogisticRegression(solver='liblinear', penalty='l1', random_state=0)

### Voting Classifier

**Voting: hard**

**Random forest, multilayer perceptron, naive bayes**

In [56]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_hard = VotingClassifier(voting='hard', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('naive_bayes', gnb)
])

vc_hard_pred = vc_hard.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_hard_pred, y_test)

0.9845272206303725

In [57]:
print(classification_report(vc_hard_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2488
       human       0.97      0.97      0.97      1002

    accuracy                           0.98      3490
   macro avg       0.98      0.98      0.98      3490
weighted avg       0.98      0.98      0.98      3490



### Voting Classifier

**Voting: hard**

**Random forest, multilayer perceptron, logistic regression**

In [58]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_hard = VotingClassifier(voting='hard', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('logistic_regression', lr)
])

vc_hard_pred = vc_hard.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_hard_pred, y_test)

0.9865329512893983

In [59]:
print(classification_report(vc_hard_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2487
       human       0.98      0.98      0.98      1003

    accuracy                           0.99      3490
   macro avg       0.98      0.98      0.98      3490
weighted avg       0.99      0.99      0.99      3490



### Voting Classifier

**Voting: hard**

**Random forest, multilayer perceptron, naive bayes, logistic regression**

In [60]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_hard = VotingClassifier(voting='hard', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('naive_bayes', gnb),
    ('logistic_regression', lr)
])

vc_hard_pred = vc_hard.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_hard_pred, y_test)

0.9696275071633238

In [61]:
print(classification_report(vc_hard_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.97      0.98      2556
       human       0.91      0.98      0.95       934

    accuracy                           0.97      3490
   macro avg       0.95      0.97      0.96      3490
weighted avg       0.97      0.97      0.97      3490



### Voting Classifier

**Voting: soft**

**Random forest, multilayer perceptron, naive bayes**

In [62]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_soft = VotingClassifier(voting='soft', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('naive_bayes', gnb)
])

vc_soft_pred = vc_soft.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_soft_pred, y_test)

0.9828080229226361

In [63]:
print(classification_report(vc_soft_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2488
       human       0.97      0.97      0.97      1002

    accuracy                           0.98      3490
   macro avg       0.98      0.98      0.98      3490
weighted avg       0.98      0.98      0.98      3490



### Voting Classifier

**Voting: soft**

**Random forest, multilayer perceptron, logistic regression**

In [64]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_soft = VotingClassifier(voting='soft', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('logistic_regression', lr)
])

vc_soft_pred = vc_soft.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_soft_pred, y_test)

0.9862464183381089

In [65]:
print(classification_report(vc_soft_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2486
       human       0.98      0.98      0.98      1004

    accuracy                           0.99      3490
   macro avg       0.98      0.98      0.98      3490
weighted avg       0.99      0.99      0.99      3490



### Voting Classifier

**Voting: soft**

**Random forest, multilayer perceptron, naive bayes, logistic regression**

In [66]:
# ignores ConvergenceWarning in MLPClassifier.fit()
filterwarnings('ignore')

vc_soft = VotingClassifier(voting='soft', estimators=[
    ('random_forest', rf),
    ('multilayer_perceptron', mlp),
    ('naive_bayes', gnb),
    ('logistic_regression', lr)
])

vc_soft_pred = vc_soft.fit(X_train, y_train).predict(X_test)
accuracy_score(vc_soft_pred, y_test)

0.979083094555874

In [67]:
print(classification_report(vc_soft_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.98      0.99      2497
       human       0.96      0.97      0.96       993

    accuracy                           0.98      3490
   macro avg       0.97      0.98      0.97      3490
weighted avg       0.98      0.98      0.98      3490

