## Notebook: decision_tree.ipynb

This notebook is used for building decision tree and random forest classifiers.

In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Data Preparation

In [9]:
X_labels = ['geo_enabled','default_profile','default_profile_image','followers_count','friends_count','favourites_count','listed_count','retweet_post_percent','reply_post_percent','avg_hashtags','avg_urls','avg_mentions','avg_retweets_cnt','avg_reply_cnt']
Y_label = 'identification'

data = pd.read_csv('../datasets/MIB/mib_processed.csv')
X = data[X_labels]
y = data[Y_label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

### Decision Tree Classifier

**Max depth = 2**

In [10]:
decision_tree_2 = DecisionTreeClassifier(max_depth=2, random_state=0)
dt2_y_pred = decision_tree_2.fit(X_train, y_train).predict(X_test)
accuracy_score(dt2_y_pred, y_test)

0.9796561604584527

In [12]:
print(classification_report(dt2_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.98      0.99      2517
       human       0.95      0.98      0.96       973

    accuracy                           0.98      3490
   macro avg       0.97      0.98      0.97      3490
weighted avg       0.98      0.98      0.98      3490



### Decision Tree Classifier

**Max depth = 6**

In [13]:
decision_tree_6 = DecisionTreeClassifier(max_depth=6, random_state=0)
dt6_y_pred = decision_tree_6.fit(X_train, y_train).predict(X_test)
accuracy_score(dt6_y_pred, y_test)

0.9848137535816619

In [14]:
print(classification_report(dt6_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2491
       human       0.97      0.98      0.97       999

    accuracy                           0.98      3490
   macro avg       0.98      0.98      0.98      3490
weighted avg       0.98      0.98      0.98      3490



### Random Forest Classifier

**Num estimators = 5**\
**Max depth = 4**

In [15]:
random_forest_5_4 = RandomForestClassifier(n_estimators=5, max_depth=4, random_state=0)
rf5_4_y_pred = random_forest_5_4.fit(X_train, y_train).predict(X_test)
accuracy_score(rf5_4_y_pred, y_test)

0.9871060171919771

In [16]:
print(classification_report(rf5_4_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2479
       human       0.98      0.97      0.98      1011

    accuracy                           0.99      3490
   macro avg       0.99      0.98      0.98      3490
weighted avg       0.99      0.99      0.99      3490



### Random Forest Classifier

**Num estimators = 25** \
**Max depth = 4**

In [17]:
random_forest_25_4 = RandomForestClassifier(n_estimators=25, max_depth=4, random_state=0)
rf25_4_y_pred = random_forest_25_4.fit(X_train, y_train).predict(X_test)
accuracy_score(rf25_4_y_pred, y_test)

0.9871060171919771

In [18]:
print(classification_report(rf25_4_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2489
       human       0.98      0.98      0.98      1001

    accuracy                           0.99      3490
   macro avg       0.98      0.98      0.98      3490
weighted avg       0.99      0.99      0.99      3490



### Random Forest Classifier

**Num estimators = 5**\
**Max depth = 6**

In [19]:
random_forest_5_6 = RandomForestClassifier(n_estimators=5, max_depth=6, random_state=0)
rf5_6_y_pred = random_forest_5_6.fit(X_train, y_train).predict(X_test)
accuracy_score(rf5_6_y_pred, y_test)

0.9871060171919771

In [20]:
print(classification_report(rf5_6_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2489
       human       0.98      0.98      0.98      1001

    accuracy                           0.99      3490
   macro avg       0.98      0.98      0.98      3490
weighted avg       0.99      0.99      0.99      3490

