In [135]:
# Used in parsing data and creating decision tree
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Used in visualizing the decision tree
from sklearn import tree
from matplotlib import pyplot as plt

# Used in hyperparameter optimization (GridSearch)
from sklearn.model_selection import GridSearchCV

In [136]:
# List of file suffixes
dataFileKey = {1: "c300_d100", 
               2: "c300_d1000",
               3: "c300_d5000",
               4: "c500_d100",
               5: "c500_d1000",
               6: "c500_d5000",
               7: "c1000_d100",
               8: "c1000_d1000",
               9: "c1000_d5000",
               10: "c1500_d100",
               11: "c1500_d1000",
               12: "c1500_d5000",
               13: "c1800_d100",
               14: "c1800_d1000",
               15: "c1800_d5000"}

# File suffix key number
dataFileKeyNum:int = 1

The two section below are responsible for loading the data and splitting it into its approipate variables

In [137]:
# Load data sets, all from the same grouping

# Training data
filePath = "..\project2_data\\train_" + dataFileKey[dataFileKeyNum] + ".csv"
trainData = pd.read_csv(filePath, header = None)

# Test data
filePath = "..\project2_data\\test_" + dataFileKey[dataFileKeyNum] + ".csv"
testData = pd.read_csv(filePath, header = None)

# Validation data
filePath = "..\project2_data\\valid_" + dataFileKey[dataFileKeyNum] + ".csv"
validData = pd.read_csv(filePath, header = None)

# trainData.head(10)
# testData.head(10)


In [138]:
# Split data sets into features and classes

# Training Data Split
trainX = trainData.drop(columns=[trainData.columns[-1]])
trainY = trainData.iloc[:, -1:]

# Testing Data Split
testX = testData.drop(columns=[testData.columns[-1]])
testY = testData.iloc[:, -1:]

# Validation Data Split
validX = validData.drop(columns=[validData.columns[-1]])
validY = validData.iloc[:, -1:]

# Combine train and validation data
combineX = pd.concat([trainX, validX], ignore_index=True)
combineY = pd.concat([trainY, validY], ignore_index=True)

The code below will create, run, and test the basic decision tree classifier

In [139]:
# Create decision tree
decisionTree = DecisionTreeClassifier()


# Train tree on data
decisionTree = decisionTree.fit(trainX, trainY)


# Test decision tree
predictions = decisionTree.predict(validX)


# Print accuracy
print("Validation Data Metrics")
print("Accuracy: ", metrics.accuracy_score(validY, predictions))
print("F1 Score: " + str(metrics.f1_score(validY, predictions)))
print("Precision: " + str(metrics.precision_score(validY, predictions)))

# Visualize tree
# fig = plt.figure(figsize=(50, 50))
# _ = tree.plot_tree(decisionTree, feature_names=trainX.columns, class_names=["0", "1"], filled = True)
# fig.savefig("../decision_tree.png")


The code below will create, run, test, and optimize a decision tree classifier using GridSearchCV

In [140]:
# Create GridSearch hyper-parameters
grid_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(5, 15),
    "random_state": range(5, 10),
    "splitter": ["best", "random"]
}

# Create decision tree
decisionTree = DecisionTreeClassifier()

# Init GridSearch and train model
gridTree = GridSearchCV(decisionTree, param_grid=grid_params, cv=5, verbose=0, n_jobs=-1)
gridTree.fit(trainX, trainY)



# Print report
print("GridSearch Tree")
print("Best params: " + str(gridTree.best_params_))
print("Best score: " + str(gridTree.best_score_))
print("----------------------------------------------------------")



GridSearch Tree
Best params: {'criterion': 'gini', 'max_depth': 6, 'random_state': 7, 'splitter': 'random'}
Best score: 0.615
----------------------------------------------------------


The code below creates a new tree with the best parameters. Then it is fitted to the combination of training and validation data. Testing is performed on the test data set

In [141]:
# Create another tree with the best parameters
bestTree = gridTree.best_estimator_
bestTree.fit(combineX, combineY)

# Print report
print("Re-trained tree")
predictions = bestTree.predict(testX)


print("Accuracy: ", metrics.accuracy_score(testY, predictions))
print("F1 Score: " + str(metrics.f1_score(testY, predictions)))
print("Precision: " + str(metrics.precision_score(testY, predictions)))

# print(metrics.classification_report(validY, predictions))
# print(metrics.confusion_matrix(validY, predictions))

Re-trained tree
Accuracy:  0.66
F1 Score: 0.6761904761904762
Precision: 0.6454545454545455
