In [1]:
import itertools

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import competition_helpers

In [2]:
# Input/output configuration: where the submission goes and which CSVs are loaded.
submission_file_name = "results/decisiontree_default_submission.csv"

# Feature and label arrays, loaded through the project's CSV helper.
X_train = competition_helpers.read_csv("train_features.csv")
y_train = competition_helpers.read_csv("train_label.csv", remove_header=True)
X_test = competition_helpers.read_csv("test_features.csv")

# First column of the raw test file, flattened to 1-D. The (currently
# commented-out) submission cell writes it out as the "id" column.
submission_col = np.array(
    pd.read_csv("test_features.csv", header=None).iloc[:, 0]
).ravel()

# Quick sanity check of the loaded array shapes.
print(*(loaded.shape for loaded in (X_train, y_train, X_test)))

(418, 100) (418, 1) (378, 100)


In [3]:
# Build the two 5-fold stratified cross-validation splits in one pass:
# first without standardization (flag False), then with it (flag True).
train_test_split, standardized_train_test_split = [
    competition_helpers.kfold_stratified_split(X_train, y_train, 5, standardize)
    for standardize in (False, True)
]

In [4]:
# Grid search over decision-tree hyperparameters, scored with 5-fold
# cross-validation.
#
# For every combination of max_depth, min_samples_split, min_samples_leaf and
# max_features, a DecisionTreeClassifier is fitted on each training fold and
# scored on the matching held-out fold. The per-fold
# [accuracy, precision, recall, f1] rows are then averaged and printed.

# Point this at standardized_train_test_split to evaluate the standardized folds.
cv_folds = train_test_split
scorers = (accuracy_score, precision_score, recall_score, f1_score)

# itertools.product varies the last iterable fastest, so the combinations are
# visited in the same order as four nested loops with max_depth outermost and
# num_features innermost.
for max_depth, split_, leaf_, num_features in itertools.product(
    [None, 2, 4], [2, 15], [1, 10], [None, 90]
):
    results = []
    for (X_train_cv, y_train_cv), (X_test_cv, y_test_cv) in cv_folds:
        # random_state is fixed so repeated runs build the same trees.
        clf = tree.DecisionTreeClassifier(
            random_state=42,
            max_depth=max_depth,
            max_features=num_features,
            min_samples_split=split_,
            min_samples_leaf=leaf_,
        )
        clf.fit(X_train_cv, y_train_cv.ravel())
        prediction = clf.predict(X_test_cv).ravel()

        # Flatten the held-out labels once and reuse them for every metric.
        y_true = y_test_cv.ravel()
        results.append([scorer(y_true, prediction) for scorer in scorers])

    # Column-wise mean over the folds: [accuracy, precision, recall, f1].
    measures = np.mean(np.array(results), axis=0)
    print("Max depth: {} num_features: {} min_samples_split: {} min_samples_leaf: {} measures: {}".format(max_depth, num_features, split_, leaf_, measures))


Max depth: None num_features: None min_samples_split: 2 min_samples_leaf: 1 measures: [0.746314   0.83858283 0.81669227 0.82713389]
Max depth: None num_features: 90 min_samples_split: 2 min_samples_leaf: 1 measures: [0.75110526 0.85064589 0.80701485 0.82802816]
Max depth: None num_features: None min_samples_split: 2 min_samples_leaf: 10 measures: [0.73429449 0.82529751 0.81643625 0.81933722]
Max depth: None num_features: 90 min_samples_split: 2 min_samples_leaf: 10 measures: [0.73435119 0.82728079 0.81326165 0.81815172]
Max depth: None num_features: None min_samples_split: 15 min_samples_leaf: 1 measures: [0.7223877  0.83707447 0.778085   0.80581244]
Max depth: None num_features: 90 min_samples_split: 15 min_samples_leaf: 1 measures: [0.71992137 0.83649481 0.77485919 0.80407092]
Max depth: None num_features: None min_samples_split: 15 min_samples_leaf: 10 measures: [0.73429449 0.82529751 0.81643625 0.81933722]
Max depth: None num_features: 90 min_samples_split: 15 min_samples_leaf: 10 measures: [0.73435119 0.82728079 0.81326165 0.81815172]

## Without standardization:

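### Criterion: gini (the scikit-learn default); grid over max_depth, max_features (`num_features`), min_samples_split and min_samples_leaf

Each `measures` vector is [accuracy, precision, recall, f1], averaged over the 5 cross-validation folds.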


Max depth: None num_features: None min_samples_split: 2 min_samples_leaf: 1 measures: [0.746314   0.83858283 0.81669227 0.82713389]

Max depth: None num_features: 90 min_samples_split: 2 min_samples_leaf: 1 measures: [0.75110526 0.85064589 0.80701485 0.82802816]

Max depth: None num_features: None min_samples_split: 2 min_samples_leaf: 10 measures: [0.73429449 0.82529751 0.81643625 0.81933722]

Max depth: None num_features: 90 min_samples_split: 2 min_samples_leaf: 10 measures: [0.73435119 0.82728079 0.81326165 0.81815172]

Max depth: None num_features: None min_samples_split: 15 min_samples_leaf: 1 measures: [0.7223877  0.83707447 0.778085   0.80581244]

Max depth: None num_features: 90 min_samples_split: 15 min_samples_leaf: 1 measures: [0.71992137 0.83649481 0.77485919 0.80407092]

Max depth: None num_features: None min_samples_split: 15 min_samples_leaf: 10 measures: [0.73429449 0.82529751 0.81643625 0.81933722]

Max depth: None num_features: 90 min_samples_split: 15 min_samples_leaf: 10 measures: [0.73435119 0.82728079 0.81326165 0.81815172]

Max depth: 2 num_features: None min_samples_split: 2 min_samples_leaf: 1 measures: [0.76556107 0.83297441 0.85867896 0.8430515 ]

Max depth: 2 num_features: 90 min_samples_split: 2 min_samples_leaf: 1 measures: [0.76556107 0.83654797 0.85222734 0.84228248]

Max depth: 2 num_features: None min_samples_split: 2 min_samples_leaf: 10 measures: [0.76556107 0.82808009 0.86513057 0.84401924]

Max depth: 2 num_features: 90 min_samples_split: 2 min_samples_leaf: 10 measures: [0.76556107 0.83165364 0.85867896 0.84325022]

Max depth: 2 num_features: None min_samples_split: 15 min_samples_leaf: 1 measures: [0.76556107 0.83297441 0.85867896 0.8430515 ]

Max depth: 2 num_features: 90 min_samples_split: 15 min_samples_leaf: 1 measures: [0.76556107 0.83654797 0.85222734 0.84228248]

Max depth: 2 num_features: None min_samples_split: 15 min_samples_leaf: 10 measures: [0.76556107 0.82808009 0.86513057 0.84401924]

Max depth: 2 num_features: 90 min_samples_split: 15 min_samples_leaf: 10 measures: [0.76556107 0.83165364 0.85867896 0.84325022]

Max depth: 4 num_features: None min_samples_split: 2 min_samples_leaf: 1 measures: [0.72717897 0.82320011 0.80691244 0.81357466]

Max depth: 4 num_features: 90 min_samples_split: 2 min_samples_leaf: 1 measures: [0.73917046 0.83005092 0.81658986 0.82215389]

Max depth: 4 num_features: None min_samples_split: 2 min_samples_leaf: 10 measures: [0.73670413 0.82604434 0.81966206 0.82115539]

Max depth: 4 num_features: 90 min_samples_split: 2 min_samples_leaf: 10 measures: [0.73429449 0.82329447 0.81966206 0.81979164]

Max depth: 4 num_features: None min_samples_split: 15 min_samples_leaf: 1 measures: [0.72471263 0.82427001 0.80046083 0.8106881 ]

Max depth: 4 num_features: 90 min_samples_split: 15 min_samples_leaf: 1 measures: [0.73917046 0.83059627 0.81658986 0.82284239]

Max depth: 4 num_features: None min_samples_split: 15 min_samples_leaf: 10 measures: [0.73670413 0.82604434 0.81966206 0.82115539]

Max depth: 4 num_features: 90 min_samples_split: 15 min_samples_leaf: 10 measures: [0.73429449 0.82329447 0.81966206 0.81979164]

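## With standardization

TODO: no results recorded yet. The cross-validation loop in `In [4]` iterates `train_test_split` only; rerun it with `standardized_train_test_split` to fill in this section.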



In [5]:
# fitting the test dataset
# clf = tree.DecisionTreeClassifier()
# clf.fit(X_train, y_train.ravel())  
# prediction = clf.predict(X_test)

In [6]:
# pd.DataFrame({"id": submission_col, "label": prediction}).to_csv(submission_file_name, encoding='utf-8', index=False)