In [83]:
# IMPORTS
# Pandas is used for data manipulation
import pandas as pd
# Use numpy to convert to arrays
import numpy as np
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Utility to store and load model from disk
from sklearn.externals import joblib
# write csv files
import csv

In [59]:
# UTIL
def test_model(forest_model, test_features, test_labels):
    # Use the forest's predict method on the test data
    predictions = np.round(forest_model.predict(test_features))

    # Calculate the absolute errors
    errors = abs(predictions - test_labels)

    # Print out the mean absolute error (mae)
    print('Mean Absolute Error:', round(np.mean(errors), 2))

    # Calculate mean absolute percentage error (MAPE)
    mape = 100 * (errors / test_labels)

    # Calculate and display accuracy
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')

    # Pull out one tree from the forest
    tree = rf.estimators_[5]

    print('The depth of this tree is:', tree.tree_.max_depth)

    total_trues = sum(x == 2 for x in test_labels)
    total_predictions = sum(x == 2 for x in predictions)
    total_errors = sum(x == 1 for x in errors)

    false_positive = sum(predict > label for predict, label in zip(predictions, test_labels))
    false_negative = sum(predict < label for predict, label in zip(predictions, test_labels))
    true_positive = total_predictions - false_positive
    precision = true_positive / total_predictions
    recall = true_positive / (true_positive + false_negative)
    print('precision:', precision)
    print('recall:', recall)

    return precision, recall

In [60]:
# Random Forest 1st model
# Read in data as pandas dataframe and display first 5 rows
features = pd.read_csv('synt_cubes_all_vote.csv')

# Remove the irrelevant texts from the features
# axis 1 refers to the columns
features = features.drop('fragmanetAndSide', axis = 1)
features = features.drop('fragment', axis = 1)
features = features.drop('fragmentAndSideTrend', axis = 1)
features = features.drop('fragmentAndSideCubes', axis = 1)
features = features.drop('origCoordinates', axis = 1)

# One-hot encode categorical features
features = pd.get_dummies(features)

# Labels are the values we want to predict
labels = np.array(features['class'])
labels = labels + 1

# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('class', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25)

# Instantiate model 
rf = RandomForestRegressor(n_estimators= 1000, random_state=42)

test_model(rf, test_features, test_labels)

Mean Absolute Error: 0.01
Accuracy: 99.66 %.
The depth of this tree is: 12
precision: 0.947368421053
recall: 0.885245901639


(0.94736842105263153, 0.88524590163934425)

In [61]:
# Random Forest 2nd variation of model - just for reference - not used
rf_new = RandomForestRegressor(n_estimators = 100, criterion = 'mse', max_depth = None, 
                               min_samples_split = 2, min_samples_leaf = 1)
rf_new.fit(train_features, train_labels)

test_model(rf_new, test_features, test_labels)

Mean Absolute Error: 0.01
Accuracy: 99.66 %.
The depth of this tree is: 12
precision: 0.947368421053
recall: 0.885245901639


(0.94736842105263153, 0.88524590163934425)

In [56]:
# Random Forest 3rd model - Limit depth of tree to 2 levels - not used
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3, random_state=42)
rf_small.fit(train_features, train_labels)

test_model(rf_small, test_features, test_labels)

Mean Absolute Error: 0.01
Accuracy: 99.69 %.
The depth of this tree is: 10
precision: 0.968253968254
recall: 0.884057971014


(0.96825396825396826, 0.88405797101449279)

In [63]:
# Finally - use the 1st model and this time train on the entire set 
rf.fit(features, labels);

joblib.dump(rf, 'rndFstBasic.pkl') 

['rndFstBasic.pkl']

In [77]:
# Run on the output of the voting and classify them
# Read in data as pandas dataframe
orig_features = pd.read_csv('cubes_X3_e.csv') #('real_cubes_all_vote.csv')

# Remove the irrelevant texts from the features
# axis 1 refers to the columns
features = orig_features.drop('fragmanetAndSide', axis = 1)
features = features.drop('fragment', axis = 1)
features = features.drop('fragmentAndSideTrend', axis = 1)
features = features.drop('fragmentAndSideCubes', axis = 1)
features = features.drop('origCoordinates', axis = 1)
features = features.drop("fitstFileName", axis = 1)
features = features.drop("firstCroppedWidth", axis = 1)
features = features.drop("firstOffsetX", axis = 1)
features = features.drop("firstOffsetY", axis = 1)
features = features.drop("firstHorizontalFlip", axis = 1)
features = features.drop("secondFileName", axis = 1)
features = features.drop("secondCroppedWidth", axis = 1)
features = features.drop("secondOffsetX", axis = 1)
features = features.drop("secondOffsetY", axis = 1)
features = features.drop("secondHorizontalFlip", axis = 1)

forest_model = joblib.load('rndFstBasic.pkl') 

predictions = np.round(forest_model.predict(features))-1
orig_features["class"] = predictions
filtered = orig_features[orig_features["class"] == 1]
filtered.to_csv('match_X3_e.csv', index=False)