In [112]:
# Used in parsing data and creating decision tree
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics

# Used in visualizing the decision tree
from sklearn import tree
from matplotlib import pyplot as plt

# Used in hyperparameter optimization (GridSearch)
from sklearn.model_selection import GridSearchCV

In [113]:
# List of file suffixes
dataFileKey = {1: "c300_d100", 
               2: "c300_d1000",
               3: "c300_d5000",
               4: "c500_d100",
               5: "c500_d1000",
               6: "c500_d5000",
               7: "c1000_d100",
               8: "c1000_d1000",
               9: "c1000_d5000",
               10: "c1500_d100",
               11: "c1500_d1000",
               12: "c1500_d5000",
               13: "c1800_d100",
               14: "c1800_d1000",
               15: "c1800_d5000"}

# File suffix key number
dataFileKeyNum:int = 15

In [114]:
# Load data sets, all from the same grouping

# Training data
filePath = "..\project2_data\\train_" + dataFileKey[dataFileKeyNum] + ".csv"
trainData = pd.read_csv(filePath, header = None)

# Test data
filePath = "..\project2_data\\test_" + dataFileKey[dataFileKeyNum] + ".csv"
testData = pd.read_csv(filePath, header = None)

# Validation data
filePath = "..\project2_data\\valid_" + dataFileKey[dataFileKeyNum] + ".csv"
validData = pd.read_csv(filePath, header = None)

In [118]:
# Split data sets into features and classes

# Training Data Split
trainX = trainData.drop(columns=[trainData.columns[-1]])
trainY = trainData.iloc[:, -1:]

# Testing Data Split
testX = testData.drop(columns=[testData.columns[-1]])
testY = testData.iloc[:, -1:]

# Validation Data Split
validX = validData.drop(columns=[validData.columns[-1]])
validY = validData.iloc[:, -1:]

# Combine train and validation data
combineX = pd.concat([trainX, validX], ignore_index=True)
combineY = pd.concat([trainY, validY], ignore_index=True)



The code below will create, train, and optimize a Bagging Classifier with a Decision Tree as the base estimator. Optimization is performed with GridSearchCV.

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [116]:
# Hyper-parameter options
grid_params = {
    "bootstrap": [True],
    "bootstrap_features": [False],
    "max_samples": range(50, 5000, 100),
}

# Create Bagging Classifier
baggingTree = BaggingClassifier()

gridBagger = GridSearchCV(baggingTree, param_grid=grid_params, cv=5, verbose=0, n_jobs=-1)
gridBagger.fit(trainX, trainY)


# Print report
print("GridSearch Bagging Tree")
print("Best params: " + str(gridBagger.best_params_))
print("Best score: " + str(gridBagger.best_score_))
print("----------------------------------------------------------")



  y = column_or_1d(y, warn=True)


GridSearch Bagging Tree
Best params: {'bootstrap': True, 'bootstrap_features': False, 'max_samples': 4850}
Best score: 0.9938
----------------------------------------------------------


The code below will create a new bagging tree with the best paramters. It will be trained on the combination data set and tested on the test set.

In [117]:
# Create another tree with the best parameters
bestTree = gridBagger.best_estimator_
bestTree.fit(combineX, combineY)

# Print report
print("Re-trained tree")
predictions = bestTree.predict(testX)


print("Accuracy: ", metrics.accuracy_score(testY, predictions))
print("F1 Score: " + str(metrics.f1_score(testY, predictions)))
print("Precision: " + str(metrics.precision_score(testY, predictions)))

  y = column_or_1d(y, warn=True)


Re-trained tree
Accuracy:  0.9926
F1 Score: 0.9925717727363983
Precision: 0.9963724304715841
