In [14]:
# Pandas is used for data manipulation
import pandas as pd

# Read in data as pandas dataframe and display first 5 rows
features = pd.read_csv('votes_cubes_match_synt.csv')
features.head(5)

Unnamed: 0,fragmanetAndSide,fragment,class,fragmentVote,fragmentAndSideVote,fragmentAndSideTrendVote,fragmentAndSideTrendVoteStrict,fragmentAndSideTrendVoteSync,fragmentAndSideTrend,fragmentAndSideCubes,origCoordinates
0,PX303Fg006_7X4_1X3_1_PX303Fg006_7X4_6X3_1,PX303Fg006_7X4_1X3_PX303Fg006_7X4_6X3,0,6,2,2,0,0,"[[False, 1, 0], [False, 1, 1]]","[[1078, 250, 590, 0], [1078, 250, 590, 250]]",[]
1,PX303Fg006_4X4_0X3_1_PX303Fg006_4X4_3X0_1,PX303Fg006_4X4_0X3_PX303Fg006_4X4_3X0,0,16,4,4,0,0,"[[False, 0, 0], [False, 0, 1], [False, 1, 0], ...","[[794, 0, 1254, 250], [794, 250, 1254, 250], [...",[]
2,PX303Fg006_7X4_1X0_0_PX303Fg006_7X4_6X3_1,PX303Fg006_7X4_1X0_PX303Fg006_7X4_6X3,0,8,4,4,6,3,"[[False, 0, 0], [False, 0, 1], [False, 1, 0], ...","[[50, 250, 590, 250], [50, 250, 590, 0], [50, ...",[]
3,PX303Fg006_8X3_2X1_0_PX303Fg006_8X3_3X1_0,PX303Fg006_8X3_2X1_PX303Fg006_8X3_3X1,0,147,4,4,6,3,"[[False, 0, 0], [False, 0, 1], [False, 1, 0], ...","[[50, 250, 50, 0], [50, 0, 50, 0], [50, 250, 5...",[]
4,PX303Fg006_3X4_0X1_0_PX303Fg006_3X4_1X3_0,PX303Fg006_3X4_0X1_PX303Fg006_3X4_1X3,0,16,4,4,6,3,"[[False, 0, 0], [False, 0, 1], [False, 1, 0], ...","[[50, 0, 50, 0], [50, 250, 50, 250], [50, 0, 5...",[]


In [15]:
print('The shape of our features is:', features.shape)

The shape of our features is: (7622, 11)


In [16]:
# Remove the irrelevant texts from the features
# axis 1 refers to the columns
features = features.drop('fragmanetAndSide', axis = 1)
features = features.drop('fragment', axis = 1)
features = features.drop('fragmentAndSideTrend', axis = 1)
features = features.drop('fragmentAndSideCubes', axis = 1)
features = features.drop('origCoordinates', axis = 1)

# One-hot encode categorical features
features = pd.get_dummies(features)
features.head(5)

Unnamed: 0,class,fragmentVote,fragmentAndSideVote,fragmentAndSideTrendVote,fragmentAndSideTrendVoteStrict,fragmentAndSideTrendVoteSync
0,0,6,2,2,0,0
1,0,16,4,4,0,0
2,0,8,4,4,6,3
3,0,147,4,4,6,3
4,0,16,4,4,6,3


In [17]:
print('Shape of features after one-hot encoding:', features.shape)

Shape of features after one-hot encoding: (7622, 6)


In [18]:
# Use numpy to convert to arrays
import numpy as np


# Labels are the values we want to predict
labels = np.array(features['class'])
labels = labels + 1

# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('class', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

## Training and Testing Sets

In [19]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25,
                                                                           random_state = 42)

In [20]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (5716, 5)
Training Labels Shape: (5716,)
Testing Features Shape: (1906, 5)
Testing Labels Shape: (1906,)


## Establish Baseline

In [21]:
# The baseline predictions are the historical averages
# baseline_preds = test_features[:, feature_list.index('average')]

# # Baseline errors, and display average baseline error
# baseline_errors = abs(baseline_preds - test_labels)
# print('Average baseline error: ', round(np.mean(baseline_errors), 2), 'degrees.')

## Training the Forest

In [22]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model 
rf = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels);

## Make Predictions on Test Data

In [23]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))


Mean Absolute Error: 0.0


In [24]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 99.68 %.


## Visualizing a Single Decision Tree

In [44]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest
tree = rf.estimators_[5]

# # Export the image to a dot file
# export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# # Use dot file to create a graph
# (graph, ) = pydot.graph_from_dot_file('tree.dot')

# # Write graph to a png file
# graph.write_png('tree.png'); 

![Decision Tree](tree.png)

In [45]:
print('The depth of this tree is:', tree.tree_.max_depth)

The depth of this tree is: 9


We can create models with different hyperparameters to try and boost performance. The only way to find the best ones
are to try a few and evaluate them! 

In [46]:
rf_new = RandomForestRegressor(n_estimators = 100, criterion = 'mse', max_depth = None, 
                               min_samples_split = 2, min_samples_leaf = 1)
rf_new.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [60]:
# Use the forest's predict method on the test data
predictions = rf_new.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))


Mean Absolute Error: 0.0


In [48]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 99.67 %.


Smaller tree for visualization.

In [49]:
tree = rf_new.estimators_[5]
print('The depth of this tree is:', tree.tree_.max_depth)

The depth of this tree is: 11


In [51]:
# Limit depth of tree to 2 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3, random_state=42)
rf_small.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [52]:
# Use the forest's predict method on the test data
predictions = rf_small.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))


Mean Absolute Error: 0.01


In [53]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 99.48 %.


In [54]:
predictions2 = np.round(predictions)

# Calculate the absolute errors
errors = abs(predictions2 - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.sum(errors), 2))

Mean Absolute Error: 4.0


In [55]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 99.87 %.


In [61]:
match_indexes = np.nonzero(errors)

In [62]:
falsed = test_labels[match_indexes]
falsed

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [63]:
sum(x == 2 for x in test_labels)

60

In [59]:
# Extract the small tree
tree = rf_new.estimators_[5]
print('The depth of this tree is:', tree.tree_.max_depth)

# # Save the tree as a png image
# export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# (graph, ) = pydot.graph_from_dot_file('small_tree.dot')

# graph.write_png('small_tree.png')

The depth of this tree is: 11


![Small Decision Tree](small_tree.PNG)

### Annotated Version of Tree

![Annotated Decision Tree](small_tree_annotated.PNG)

## Variable Importances

In [None]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

### Two Most Important Features

In [None]:
# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Extract the two most important features
important_indices = [feature_list.index('temp_1'), feature_list.index('average')]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]

# Train the random forest
rf_most_important.fit(train_important, train_labels)

# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)

errors = abs(predictions - test_labels)

# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

mape = np.mean(100 * (errors / test_labels))
accuracy = 100 - mape

print('Accuracy:', round(accuracy, 2), '%.')

## Visualizations

In [None]:
# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt

%matplotlib inline

# Set the style
plt.style.use('fivethirtyeight')

# list of x locations for plotting
x_values = list(range(len(importances)))

# Make a bar chart
plt.bar(x_values, importances, orientation = 'vertical')

# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances'); 

In [None]:
import datetime

# Dates of training values
months = features[:, feature_list.index('month')]
days = features[:, feature_list.index('day')]
years = features[:, feature_list.index('year')]

# List and then convert to datetime object
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]

# Dataframe with true values and dates
true_data = pd.DataFrame(data = {'date': dates, 'actual': labels})

# Dates of predictions
months = test_features[:, feature_list.index('month')]
days = test_features[:, feature_list.index('day')]
years = test_features[:, feature_list.index('year')]

# Column of dates
test_dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]

# Convert to datetime objects
test_dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in test_dates]

# Dataframe with predictions and dates
predictions_data = pd.DataFrame(data = {'date': test_dates, 'prediction': predictions}) 

In [None]:
# Plot the actual values
plt.plot(true_data['date'], true_data['actual'], 'b-', label = 'actual')

# Plot the predicted values
plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label = 'prediction')
plt.xticks(rotation = '60'); 
plt.legend()

# Graph labels
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual and Predicted Values');


In [None]:
# Make the data accessible for plotting
true_data['temp_1'] = features[:, feature_list.index('temp_1')]
true_data['average'] = features[:, feature_list.index('average')]
true_data['friend'] = features[:, feature_list.index('friend')]

# Plot all the data as lines
plt.plot(true_data['date'], true_data['actual'], 'b-', label  = 'actual', alpha = 1.0)
plt.plot(true_data['date'], true_data['temp_1'], 'y-', label  = 'temp_1', alpha = 1.0)
plt.plot(true_data['date'], true_data['average'], 'k-', label = 'average', alpha = 0.8)
plt.plot(true_data['date'], true_data['friend'], 'r-', label = 'friend', alpha = 0.3)

# Formatting plot
plt.legend(); plt.xticks(rotation = '60');

# Lables and title
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual Max Temp and Variables');