In [109]:
# Used in parsing data and creating decision tree
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Used in visualizing the decision tree
from sklearn import tree
from matplotlib import pyplot as plt

# Used in hyperparameter optimization (GridSearch)
from sklearn.model_selection import GridSearchCV

In [110]:
# List of file suffixes
dataFileKey = {1: "c300_d100", 
               2: "c300_d1000",
               3: "c300_d5000",
               4: "c500_d100",
               5: "c500_d1000",
               6: "c500_d5000",
               7: "c1000_d100",
               8: "c1000_d1000",
               9: "c1000_d5000",
               10: "c1500_d100",
               11: "c1500_d1000",
               12: "c1500_d5000",
               13: "c1800_d100",
               14: "c1800_d1000",
               15: "c1800_d5000"}

# File suffix key number
dataFileKeyNum:int = 15

In [111]:
# Load data sets, all from the same grouping

# Training data
filePath = "..\project2_data\\train_" + dataFileKey[dataFileKeyNum] + ".csv"
trainData = pd.read_csv(filePath, header = None)

# Test data
filePath = "..\project2_data\\test_" + dataFileKey[dataFileKeyNum] + ".csv"
testData = pd.read_csv(filePath, header = None)

# Validation data
filePath = "..\project2_data\\valid_" + dataFileKey[dataFileKeyNum] + ".csv"
validData = pd.read_csv(filePath, header = None)

In [112]:
# Split data sets into features and classes

# Training Data Split
trainX = trainData.drop(columns=[trainData.columns[-1]])
trainY = trainData.iloc[:, -1:]

# Testing Data Split
testX = testData.drop(columns=[testData.columns[-1]])
testY = testData.iloc[:, -1:]

# Validation Data Split
validX = validData.drop(columns=[validData.columns[-1]])
validY = validData.iloc[:, -1:]

# Combine train and validation data
combineX = pd.concat([trainX, validX], ignore_index=True)
combineY = pd.concat([trainY, validY], ignore_index=True)

The code below will create, run, test, and optimize a random forest classifier using GridSearchCV

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [113]:
# Create GridSearch hyper-parameters
grid_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(5, 15),
    "max_samples": range(1, 5000, 500)
}

# Create decision tree
randomForest = RandomForestClassifier()

# Init GridSearch and train model
gridForest = GridSearchCV(randomForest, param_grid=grid_params, cv=5, verbose=0, n_jobs=-1)
gridForest.fit(trainX, trainY)



# Print report
print("GridSearch Random Forest")
print("Best params: " + str(gridForest.best_params_))
print("Best score: " + str(gridForest.best_score_))
print("----------------------------------------------------------")



  return fit_method(estimator, *args, **kwargs)


GridSearch Random Forest
Best params: {'criterion': 'gini', 'max_depth': 5, 'max_samples': 501}
Best score: 1.0
----------------------------------------------------------


The code below creates an random forest classifier with the best parameters and is trained on the training and validation data sets. It is then tested on the test data set.

In [114]:
# Create another tree with the best parameters
bestForest = gridForest.best_estimator_
bestForest.fit(combineX, combineY)

# Print report
print("Re-trained tree")
predictions = bestForest.predict(testX)


print("Accuracy: ", metrics.accuracy_score(testY, predictions))
print("F1 Score: " + str(metrics.f1_score(testY, predictions)))
print("Precision: " + str(metrics.precision_score(testY, predictions)))

  return fit_method(estimator, *args, **kwargs)


Re-trained tree
Accuracy:  1.0
F1 Score: 1.0
Precision: 1.0
