## Notebook: decision_tree.ipynb

This notebook is used for building decision tree and random forest classifiers.

In [33]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Data Preparation

In [34]:
data = pd.read_csv('../datasets/MIB/mib_processed_text_standardized.csv')
X_labels = list(data.columns)
Y_label = 'identification'

# use all except identification for inputs
X = data.drop(columns=[Y_label])
y = data[Y_label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

### Decision Tree Classifier

**Max depth = 2**

In [35]:
decision_tree_2 = DecisionTreeClassifier(max_depth=2, random_state=0)
dt2_y_pred = decision_tree_2.fit(X_train, y_train).predict(X_test)
accuracy_score(dt2_y_pred, y_test)

0.9744429882044561

In [36]:
print(classification_report(dt2_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.97      0.98      2076
       human       0.95      0.98      0.96       976

    accuracy                           0.97      3052
   macro avg       0.97      0.97      0.97      3052
weighted avg       0.97      0.97      0.97      3052



### Decision Tree Classifier

**Max depth = 6**

In [37]:
decision_tree_6 = DecisionTreeClassifier(max_depth=6, random_state=0)
dt6_y_pred = decision_tree_6.fit(X_train, y_train).predict(X_test)
accuracy_score(dt6_y_pred, y_test)

0.9790301441677588

In [38]:
print(classification_report(dt6_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.98      0.98      2052
       human       0.97      0.97      0.97      1000

    accuracy                           0.98      3052
   macro avg       0.98      0.98      0.98      3052
weighted avg       0.98      0.98      0.98      3052



### Random Forest Classifier

**Num estimators = 5**\
**Max depth = 4**

In [39]:
random_forest_5_4 = RandomForestClassifier(n_estimators=5, max_depth=4, random_state=0)
rf5_4_y_pred = random_forest_5_4.fit(X_train, y_train).predict(X_test)
accuracy_score(rf5_4_y_pred, y_test)

0.9836173001310616

In [40]:
print(classification_report(rf5_4_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2044
       human       0.98      0.97      0.98      1008

    accuracy                           0.98      3052
   macro avg       0.98      0.98      0.98      3052
weighted avg       0.98      0.98      0.98      3052



### Random Forest Classifier

**Num estimators = 25** \
**Max depth = 4**

In [41]:
random_forest_25_4 = RandomForestClassifier(n_estimators=25, max_depth=4, random_state=0)
rf25_4_y_pred = random_forest_25_4.fit(X_train, y_train).predict(X_test)
accuracy_score(rf25_4_y_pred, y_test)

0.9836173001310616

In [42]:
print(classification_report(rf25_4_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2038
       human       0.98      0.97      0.98      1014

    accuracy                           0.98      3052
   macro avg       0.98      0.98      0.98      3052
weighted avg       0.98      0.98      0.98      3052



### Random Forest Classifier

**Num estimators = 5**\
**Max depth = 6**

In [43]:
random_forest_5_6 = RandomForestClassifier(n_estimators=5, max_depth=6, random_state=0)
rf5_6_y_pred = random_forest_5_6.fit(X_train, y_train).predict(X_test)
accuracy_score(rf5_6_y_pred, y_test)

0.984600262123198

In [44]:
print(classification_report(rf5_6_y_pred, y_test))

              precision    recall  f1-score   support

         bot       0.99      0.99      0.99      2049
       human       0.98      0.98      0.98      1003

    accuracy                           0.98      3052
   macro avg       0.98      0.98      0.98      3052
weighted avg       0.98      0.98      0.98      3052

