In [97]:
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree   import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [125]:
data_path = "./data"
x_train = pd.read_csv(f'{data_path}/x_train.csv', index_col=0)
y_train = pd.read_csv(f'{data_path}/y_train.csv', index_col=0)
x_test = pd.read_csv(f'{data_path}/x_test.csv',index_col=0)
y_test = pd.read_csv(f'{data_path}/y_test_baseline.csv',index_col=0)

In [126]:
print("x_train: ",x_train.shape)
print("y_train: ",y_train.shape)
print("x_test: ",x_test.shape)
print("y_test_baseline: ",y_test.shape)

x_train:  (406708, 54)
y_train:  (406708, 1)
x_test:  (174304, 54)
y_test_baseline:  (174304, 1)


In [155]:
X_train = copy.deepcopy(x_train)
Y_train = copy.deepcopy(y_train)
X_test = copy.deepcopy(x_test)
Y_test = copy.deepcopy(y_test)

Remove index and column names

In [142]:
X_train = X_train.values[0:,:]
Y_train = y_train.values[:,0]
X_test = X_test.values[0:,:]
Y_test = Y_test.values[:,0]

In [145]:
# After performing copy with dtype = int64
print("x_train: ",X_train.shape)
print("y_train: ",Y_train.shape)
print("x_test: ",X_test.shape)
print("y_test_baseline: ",Y_test.shape)

x_train:  (406708, 54)
y_train:  (406708,)
x_test:  (174304, 54)
y_test_baseline:  (174304,)


Verify all have the same format

In [149]:
print(X_train.dtype)
print(Y_train.dtype)
print(X_test.dtype)
print(Y_test.dtype)

int64
int64
int64
int64


#### Models

In [150]:
def train_using_gini(X_train, y_train):
    # Gini index will prefer attributes where entropy is lower
    # Gini Index is a metric to measure how often a randomly
    # chosen element would be incorrectly identified.
    # Classifier obj
    clf_gini = DecisionTreeClassifier(criterion="gini",
        random_state=100, max_depth=3, min_samples_leaf=5)

    #train
    clf_gini.fit(X_train, y_train)
    return clf_gini
def train_using_entropy(X_train, y_train):
    clf_entropy = DecisionTreeClassifier(
        criterion="entropy", random_state=100,
        max_depth=3, min_samples_leaf=5)
    clf_entropy.fit(X_train, y_train)
    return clf_entropy

In [151]:

def prediction(X_test, clf_object):
    y_pred = clf_object.predict(X_test)
    print("Predicted values: ")
    print(y_pred)
    return y_pred
def calculate_Acc(y_test, y_pred):
    print("Confusion matrix: \n")
    confusion_matrix(y_test, y_pred)
    print("Accuracy: ", accuracy_score(y_test,y_pred)*100)
    print("Report ", classification_report(y_test, y_pred))

In [152]:
clf_gini = train_using_gini(X_train, Y_train)
clf_entropy = train_using_entropy(X_train, Y_train)

###  1. Models using all features as default:
Without any type of analysis or feature selection.

In [154]:
# Operational Phase 
print("Results Using Gini Index:") 
# Prediction using gini 
y_pred_gini = prediction(X_test, clf_gini)
calculate_Acc(Y_test, y_pred_gini)
print("Results using Entropy: ")
y_entr = prediction(X_test, clf_entropy)
calculate_Acc(Y_test, y_entr)

Results Using Gini Index:
Predicted values: 
[2 1 2 ... 7 2 2]
Confusion matrix: 

Accuracy:  48.689645676519184
Report                precision    recall  f1-score   support

           1       0.80      0.55      0.66     99239
           2       0.17      0.88      0.28     16354
           3       0.80      0.60      0.69     19330
           4       0.44      0.13      0.21      3080
           5       0.00      0.00      0.00     21728
           6       0.00      0.00      0.00      1250
           7       0.68      0.26      0.38     13323

    accuracy                           0.49    174304
   macro avg       0.41      0.35      0.32    174304
weighted avg       0.62      0.49      0.51    174304

Results using Entropy: 
Predicted values: 
[2 1 2 ... 1 2 2]
Confusion matrix: 

Accuracy:  44.701211676152006
Report                precision    recall  f1-score   support

           1       0.77      0.55      0.64     99239
           2       0.16      0.88      0.27     16354


# 2. Stacking Scikit-Learn API

In [None]:
# Checking  scikit-learn version
import sklearn
print(sklearn.__version__)