## Notebook: decision_tree.ipynb

This notebook is used for building decision tree and random forest classifiers.

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Data Preparation

In [2]:
X_labels = ['geo_enabled','default_profile','default_profile_image','followers_count','friends_count','favourites_count','listed_count','retweet_post_percent','reply_post_percent','avg_hashtags','avg_urls','avg_mentions','avg_retweets_cnt','avg_reply_cnt']
Y_label = 'identification'

data = pd.read_csv('../datasets/MIB/mib_processed.csv')
X = data[X_labels]
y = data[Y_label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

### Decision Tree Classifier

**Max depth = 5**

In [3]:
decision_tree = DecisionTreeClassifier(max_depth=5, random_state=0)
dt_y_pred = decision_tree.fit(X_train, y_train).predict(X_test)
accuracy_score(dt_y_pred, y_test)

0.984697386519945

In [4]:
print(classification_report(dt_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      4143
       human       0.97      0.98      0.97      1673

    accuracy                           0.98      5816
   macro avg       0.98      0.98      0.98      5816
weighted avg       0.98      0.98      0.98      5816



### Random Forest Classifier

**Num estimators = 20**\
**Max depth = 4**

In [5]:
random_forest = RandomForestClassifier(n_estimators=20, max_depth=4, random_state=0)
rf_y_pred = random_forest.fit(X_train, y_train).predict(X_test)
accuracy_score(rf_y_pred, y_test)

0.9862448418156808

In [6]:
print(classification_report(rf_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      4158
       human       0.97      0.98      0.98      1658

    accuracy                           0.99      5816
   macro avg       0.98      0.99      0.98      5816
weighted avg       0.99      0.99      0.99      5816

