In [1]:
# do oversampling
import os
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import competition_helpers

In [2]:
# I/O configuration here
# Features and labels are loaded through the project's competition_helpers.read_csv.
X_train = competition_helpers.read_csv("train_features.csv")
y_train = competition_helpers.read_csv("train_label.csv", remove_header=True)
X_test = competition_helpers.read_csv("test_features.csv")
# First column of the raw test file; written out as the "id" column of the submission.
submission_col = np.array(pd.read_csv("test_features.csv", header=None).iloc[: , 0]).ravel()
submission_file_name = "results/voting_default_submission1.csv"

print(X_train.shape, y_train.shape, X_test.shape)
# Class counts before oversampling.
print(sorted(Counter(list(y_train.flatten())).items()))

# Oversample with SMOTE. The seed is pinned so the synthetic samples -- and every score
# computed from them further down -- are the same on each Restart & Run All.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train.ravel())
# Class counts after oversampling.
print(sorted(Counter(list(y_resampled.flatten())).items()))

(418, 100) (418, 1) (378, 100)
[(0, 107), (1, 311)]
[(0, 311), (1, 311)]


In [3]:
# 5-fold stratified cross-validation splits.
# Variant on the original (non-oversampled) data, last argument False -- currently disabled:
# train_test_split = competition_helpers.kfold_stratified_split(X_train, y_train, 5,False)
# Active variant, "with standardization" (presumably what the trailing True selects --
# confirm against competition_helpers). Labels are passed as a single column.
# NOTE(review): the folds are cut from X_resampled / y_resampled, i.e. after SMOTE has
# already run on the whole training set, so synthetic points in a training fold can be
# derived from real points in the matching validation fold. The resulting scores are
# likely optimistic; resampling inside each training fold would avoid this.
standardized_train_test_split = competition_helpers.kfold_stratified_split(
    X_resampled,
    y_resampled.reshape(-1, 1),
    5,
    True,
)

In [4]:
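# Disabled experiment: AdaBoost grid search, 5 fold train test split results.
# To re-enable: import AdaBoostClassifier from sklearn.ensemble, uncomment the
# `train_test_split` line in the cross-validation cell, and reset `results` for each
# (n_estimators, learning rate) pair -- as written the averages accumulate across settings.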
# results = []
# for estimators_ in [50, 100, 150]:
#     for lr in [0.1, 0.5, 1, 5]:
#         for [(X_train_cv, y_train_cv), (X_test_cv, y_test_cv)] in train_test_split:

#             clf = AdaBoostClassifier(random_state=42,
#                                     base_estimator=tree.DecisionTreeClassifier(
#                                     max_depth=None, min_samples_split=60, min_samples_leaf= 30
#                                     ),
#                                      n_estimators=estimators_,
#                                      learning_rate=lr
#                                     )
#             clf.fit(X_train_cv, y_train_cv.ravel())  
#             prediction = clf.predict(X_test_cv)

#             accuracy = accuracy_score(y_test_cv.ravel(), prediction.ravel())
#             precision = precision_score(y_test_cv.ravel(), prediction.ravel())
#             recall = recall_score(y_test_cv.ravel(), prediction.ravel())
#             f1 = f1_score(y_test_cv.ravel(), prediction.ravel())

#             results.append([accuracy, precision, recall, f1])


#         measures = np.sum(np.array(results), axis=0) / len(results) 
#         print("n_estimators: {} learning rate: {} measures: {}".format(estimators_, lr, measures))

In [5]:
# Cross-validate a bagged decision-tree ensemble (100 trees) on the prepared folds.
# Each entry of `results` is [accuracy, precision, recall, f1] for one fold.
results = []
for [(X_train_cv, y_train_cv), (X_test_cv, y_test_cv)] in standardized_train_test_split:

    # The base tree is passed positionally: the keyword was renamed from `base_estimator`
    # to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4, while the
    # first positional slot is the base estimator under both names.
    # random_state pins the bootstrap draws and the trees so the scores are reproducible.
    clf = BaggingClassifier(tree.DecisionTreeClassifier(), n_estimators=100, random_state=42)
    clf.fit(X_train_cv, y_train_cv.ravel())
    prediction = clf.predict(X_test_cv)

    # Flatten once; the labels arrive as a column, the metrics expect 1-D arrays.
    y_true = y_test_cv.ravel()
    y_pred = prediction.ravel()

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    results.append([accuracy, precision, recall, f1])


# Per-metric mean over the folds, in the order [accuracy, precision, recall, f1].
measures = np.mean(np.array(results), axis=0)

In [6]:
# Fold-averaged scores, in the order they were appended: accuracy, precision, recall, f1.
print(measures)

[0.8827701  0.87514141 0.89395801 0.884238  ]


In [7]:
# Fit the final model on the full oversampled training set, then predict the test set.
# NOTE(review): the cross-validation folds were requested "with standardization", whereas
# the raw features are used here. Threshold-based trees should be insensitive to that,
# but revisit it if the base estimator is ever swapped for a scale-sensitive one.

# The base tree is passed positionally because the keyword changed from `base_estimator`
# to `estimator` in scikit-learn 1.2 (old name removed in 1.4); random_state makes the
# submitted predictions reproducible.
clf = BaggingClassifier(tree.DecisionTreeClassifier(), n_estimators=100, random_state=42)

clf.fit(X_resampled, y_resampled.ravel())
prediction = clf.predict(X_test)

In [8]:
# Write the submission file: one row per test sample with its id and predicted label.
# The target directory is created first so a fresh checkout without `results/` does not
# fail at this last step, after all the training has already run.
output_dir = os.path.dirname(submission_file_name)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
pd.DataFrame({"id": submission_col, "label": prediction}).to_csv(submission_file_name, encoding='utf-8', index=False)