# DecisionTrees
This notebook will contain my attempts using DecisionTrees on both of eBoss and ManGa datasets.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
from sklearn.metrics import accuracy_score, auc, roc_auc_score, make_scorer
from sklearn.tree import DecisionTreeClassifier

## eBoss Data

Reading in the eboss data

In [3]:
raw_data_eBoss = pd.read_csv("../../../Data/Astronomy.csv")

Taking out the id column

In [4]:
trainable_data = raw_data_eBoss.iloc[:, 1:].copy(deep=True)

Seperating the X, and Y variables, and doing needed preprocessing.

In [5]:
X = trainable_data.iloc[:, :-1].copy(deep=True)
Y = trainable_data.iloc[:, -1:].copy(deep=True)
X = X.apply(pd.to_numeric, args={'errors': 'coerce'})
X = X.fillna('0')
Y = Y.Hits.map({'bad': 0, 'good': 1})

ss = MinMaxScaler()
X_scaled = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
X = X_scaled

In [6]:
# Defining all the scoring metrics that we need to keep track of.
scoring_dict = {
    'precision (no)': make_scorer(precision_score, pos_label=0),
    'precision (yes)': make_scorer(precision_score, pos_label=1),
    'recall (no)': make_scorer(recall_score, pos_label=0),
    'recall (yes)': make_scorer(recall_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(auc),
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score),
}

# Setting up the params.
params_grid = {'max_depth': np.arange(3,11), }

# Running the cross-validation with Decision Tree Classifier with 10 folds.
grid_clf = GridSearchCV(DecisionTreeClassifier(),
                        cv=KFold(n_splits=10, shuffle=True),
                        param_grid=params_grid,
                        scoring=scoring_dict,
                        refit='accuracy', 
                        return_train_score=False).fit(X,Y)

# Extracting relevant metrics. More detailed metrics are available in the cv_results_ object.
relevant_metric = {'mean_test_{}'.format(k): grid_clf.cv_results_['mean_test_{}'.format(k)] for k in scoring_dict.keys()}
relevant_metric['params'] = grid_clf.cv_results_['params']
relevant_metric

{'mean_test_precision (no)': array([0.90455942, 0.79920198, 0.78779868, 0.77067164, 0.84114644,
        0.79457241, 0.8299054 , 0.78938316]),
 'mean_test_precision (yes)': array([0.92147957, 0.9080669 , 0.91608067, 0.91645221, 0.93145723,
        0.92151994, 0.93058923, 0.91519533]),
 'mean_test_recall (no)': array([0.72994495, 0.7500441 , 0.77763786, 0.75996289, 0.80250327,
        0.77035618, 0.80633878, 0.75042735]),
 'mean_test_recall (yes)': array([0.96398555, 0.93840769, 0.93256245, 0.93307821, 0.95106951,
        0.93183089, 0.94121581, 0.91939863]),
 'mean_test_accuracy': array([0.91459075, 0.886121  , 0.88967972, 0.88967972, 0.91459075,
        0.89323843, 0.90747331, 0.88256228]),
 'mean_test_auc': array([0.60142349, 0.60142349, 0.60142349, 0.65124555, 0.60142349,
        0.65124555, 0.60142349, 0.65124555]),
 'mean_test_roc_auc': array([0.84696525, 0.8442259 , 0.85510016, 0.84652055, 0.87678639,
        0.85109353, 0.87377729, 0.83491299]),
 'mean_test_f1': array([0.94086562

In [7]:
for k,v  in relevant_metric.items():
    print(k, ": ", v[4])

mean_test_precision (no) :  0.8411464375877188
mean_test_precision (yes) :  0.9314572338037082
mean_test_recall (no) :  0.8025032697630562
mean_test_recall (yes) :  0.9510695067650043
mean_test_accuracy :  0.9145907473309609
mean_test_auc :  0.6014234875444839
mean_test_roc_auc :  0.8767863882640302
mean_test_f1 :  0.9402871914402844
params :  {'max_depth': 7}


In [8]:
grid_clf.best_score_, grid_clf.best_params_

(0.9145907473309609, {'max_depth': 3})

# Manga Data:

In [9]:
raw_data_Manga = pd.read_csv("../../../Data/Astronomy20000_Original.csv")

Seperating the X, and Y variables, and doing needed preprocessing.

In [19]:
X = raw_data_Manga.iloc[:, 1:-1].copy(deep=True)
Y = raw_data_Manga.iloc[:, [-1]].copy(deep=True)
X = X.fillna('0')
Y = Y.Hits.map({'bad': 0, 'good': 1})

Making model and testing it.

In [21]:
# Defining all the scoring metrics that we need to keep track of.
scoring_dict = {
    'precision (no)': make_scorer(precision_score, pos_label=0),
    'precision (yes)': make_scorer(precision_score, pos_label=1),
    'recall (no)': make_scorer(recall_score, pos_label=0),
    'recall (yes)': make_scorer(recall_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(auc, reorder=True),
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score),
}

# Setting up the params.
params_grid = {'max_depth': np.arange(3,11)}

# Running the cross-validation 10 folds with decision tree classifier.
grid_clf = GridSearchCV(DecisionTreeClassifier(),
                        cv=KFold(n_splits=10, shuffle=True),
                        param_grid=params_grid,
                        scoring=scoring_dict,
                        refit='accuracy', 
                        return_train_score=False)
grid_clf.fit(X,Y)

# Extracting relevant metrics. More detailed metrics are available in the cv_results_ object.
relevant_metric = {'mean_test_{}'.format(k): grid_clf.cv_results_['mean_test_{}'.format(k)] for k in scoring_dict.keys()}
relevant_metric['params'] = grid_clf.cv_results_['params']
relevant_metric

{'mean_test_precision (no)': array([0.83621447, 0.83507406, 0.89611522, 0.89566532, 0.87351041,
        0.90203714, 0.89819943, 0.9007457 ]),
 'mean_test_precision (yes)': array([0.87029168, 0.92518724, 0.71219441, 0.73967092, 0.81737796,
        0.75327824, 0.79081341, 0.75890847]),
 'mean_test_recall (no)': array([0.97899611, 0.98879642, 0.90777904, 0.9203057 , 0.955049  ,
        0.92353713, 0.94012361, 0.92648146]),
 'mean_test_recall (yes)': array([0.42407515, 0.4134694 , 0.68394222, 0.67828809, 0.58316125,
        0.69884722, 0.67994655, 0.69357227]),
 'mean_test_accuracy': array([0.8403713 , 0.84508465, 0.85181801, 0.85970566, 0.8623509 ,
        0.86744902, 0.87514429, 0.86821855]),
 'mean_test_auc': array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]),
 'mean_test_roc_auc': array([0.70153563, 0.70113291, 0.79586063, 0.7992969 , 0.76910512,
        0.81119218, 0.81003508, 0.81002686]),
 'mean_test_f1': array([0.56992943, 0.57111448, 0.69729296, 0.70700758, 0.67826016,
        0.7248