## Notebook: decision_tree.ipynb

This notebook is used for building decision tree and random forest classifiers.

In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Data Preparation

In [3]:
X_labels = ['geo_enabled','default_profile','default_profile_image','followers_count','friends_count','favourites_count','listed_count','retweet_post_percent','reply_post_percent','avg_hashtags','avg_urls','avg_mentions','avg_retweets_cnt','avg_reply_cnt']
Y_label = 'identification'

data = pd.read_csv('../datasets/MIB/mib_processed.csv')
X = data[X_labels]
y = data[Y_label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

### Decision Tree Classifier

**Max depth = 2**

In [4]:
decision_tree_2 = DecisionTreeClassifier(max_depth=2, random_state=0)
dt2_y_pred = decision_tree_2.fit(X_train, y_train).predict(X_test)
accuracy_score(dt2_y_pred, y_test)

0.9451169188445667

In [5]:
print(classification_report(dt2_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.96      0.97      0.97     18689
       human       0.83      0.78      0.80      3121

    accuracy                           0.95     21810
   macro avg       0.90      0.88      0.89     21810
weighted avg       0.94      0.95      0.94     21810



### Decision Tree Classifier

**Max depth = 6**

In [16]:
decision_tree_6 = DecisionTreeClassifier(max_depth=6, random_state=0)
dt6_y_pred = decision_tree_6.fit(X_train, y_train).predict(X_test)
accuracy_score(dt6_y_pred, y_test)

0.9570839064649244

In [17]:
print(classification_report(dt6_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.98      0.97      0.98     19114
       human       0.80      0.87      0.83      2696

    accuracy                           0.96     21810
   macro avg       0.89      0.92      0.90     21810
weighted avg       0.96      0.96      0.96     21810



### Random Forest Classifier

**Num estimators = 5**\
**Max depth = 4**

In [8]:
random_forest_5_4 = RandomForestClassifier(n_estimators=5, max_depth=4, random_state=0)
rf5_4_y_pred = random_forest_5_4.fit(X_train, y_train).predict(X_test)
accuracy_score(rf5_4_y_pred, y_test)

0.953232462173315

In [9]:
print(classification_report(rf5_4_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.97      0.97      0.97     18892
       human       0.82      0.83      0.83      2918

    accuracy                           0.95     21810
   macro avg       0.90      0.90      0.90     21810
weighted avg       0.95      0.95      0.95     21810



### Random Forest Classifier

**Num estimators = 25**\
**Max depth = 4**

In [10]:
random_forest_25_4 = RandomForestClassifier(n_estimators=25, max_depth=4, random_state=0)
rf25_4_y_pred = random_forest_25_4.fit(X_train, y_train).predict(X_test)
accuracy_score(rf25_4_y_pred, y_test)

0.9544245758826226

In [11]:
print(classification_report(rf25_4_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.97      0.97      0.97     18862
       human       0.83      0.83      0.83      2948

    accuracy                           0.95     21810
   macro avg       0.90      0.90      0.90     21810
weighted avg       0.95      0.95      0.95     21810



### Random Forest Classifier

**Num estimators = 5**\
**Max depth = 6**

In [12]:
random_forest_5_6 = RandomForestClassifier(n_estimators=5, max_depth=6, random_state=0)
rf5_6_y_pred = random_forest_5_6.fit(X_train, y_train).predict(X_test)
accuracy_score(rf5_6_y_pred, y_test)

0.9569922054103622

In [13]:
print(classification_report(rf5_6_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.98      0.97      0.98     18980
       human       0.82      0.85      0.84      2830

    accuracy                           0.96     21810
   macro avg       0.90      0.91      0.91     21810
weighted avg       0.96      0.96      0.96     21810

