In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import competition_helpers

In [None]:
# I/O configuration: input files and the output path of the submission.
# `competition_helpers.read_csv` is a project helper; `remove_header=True` is
# passed only for the label file (presumably it has a header row - TODO confirm).
X_train = competition_helpers.read_csv("train_features.csv")
y_train = competition_helpers.read_csv("train_label.csv", remove_header=True)
X_test = competition_helpers.read_csv("test_features.csv")

# The first column of the raw test file is kept separately as a flat array;
# it is used as the "id" column when the submission is written.
first_test_column = pd.read_csv("test_features.csv", header=None).iloc[:, 0]
submission_col = np.array(first_test_column).ravel()
submission_file_name = "results/decisiontree_default_submission.csv"

# Sanity check: show the shapes of the loaded data.
print(*(data.shape for data in (X_train, y_train, X_test)))

In [None]:
# 5-fold stratified cross-validation splits, built twice from the same data:
# first with the last argument False (plain features), then with True
# (per the original comment, this variant applies standardization).
train_test_split, standardized_train_test_split = (
    competition_helpers.kfold_stratified_split(X_train, y_train, 5, standardize)
    for standardize in (False, True)
)

In [None]:
# Grid search over decision-tree hyperparameters, scored on the 5-fold splits.
# For every (max_depth, min_samples_split, min_samples_leaf) combination one
# tree is trained per fold; the per-fold accuracy, precision, recall and F1
# are averaged over the folds and printed.

for max_depth in [None, 2, 4]:
    for split_ in [2, 5, 15]:
        for leaf_ in [1, 10, 30, 60]:
            results = []
            # Iterate over `standardized_train_test_split` instead to evaluate
            # the standardized variant of the folds.
            for (X_train_cv, y_train_cv), (X_test_cv, y_test_cv) in train_test_split:
                # Fixed seed so that repeated runs of the search are comparable.
                clf = tree.DecisionTreeClassifier(
                    random_state=42,
                    max_depth=max_depth,
                    min_samples_split=split_,
                    min_samples_leaf=leaf_,
                )
                clf.fit(X_train_cv, y_train_cv.ravel())
                prediction = clf.predict(X_test_cv)

                # Flatten once and reuse for all four metrics.
                y_true = y_test_cv.ravel()
                y_pred = prediction.ravel()

                accuracy = accuracy_score(y_true, y_pred)
                precision = precision_score(y_true, y_pred)
                recall = recall_score(y_true, y_pred)
                f1 = f1_score(y_true, y_pred)

                results.append([accuracy, precision, recall, f1])

            # Column-wise mean over folds: [accuracy, precision, recall, f1].
            measures = np.mean(np.array(results), axis=0)
            # The former "num_features:" label was dropped: max_features is no
            # longer part of the sweep, so the label was printed without a value.
            print("Max depth: {} min_samples_split: {} min_samples_leaf: {} measures: {}".format(max_depth, split_, leaf_, measures))


In [None]:
# Fit the final model on the full training set and predict the test set.
# `random_state=42` matches the seed used during cross-validation; without a
# fixed seed, ties between equally good splits may be resolved differently on
# each run, which makes the submission non-reproducible.
clf = tree.DecisionTreeClassifier(
    random_state=42, max_depth=None, min_samples_split=60, min_samples_leaf=30
)
clf.fit(X_train, y_train.ravel())
prediction = clf.predict(X_test)

In [None]:
# Write the submission file with an "id" column (`submission_col`) and a
# "label" column (the test-set predictions).
# The output directory is created first: `to_csv` does not create missing
# parent directories and would otherwise fail on a fresh checkout.
Path(submission_file_name).parent.mkdir(parents=True, exist_ok=True)
submission = pd.DataFrame({"id": submission_col, "label": prediction})
submission.to_csv(submission_file_name, encoding='utf-8', index=False)