# Random Forest Analysis for MDS, TSNE, and PCA Scaling

In [18]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

### Read in the data for the training and testing information 

In [19]:
#Read in MDS scaled data
MDS = pd.read_csv('fruit_data_MDS_3D.csv', encoding='utf-8', header =0)
MDS_test = pd.read_csv('fruit_data_MDS_Test.csv', encoding='utf-8', header =0)

#Read in TSNE scaled data
TSNE = pd.read_csv('fruit_data_TSNE_3D.csv', encoding='utf-8', header =0)
TSNE_test = pd.read_csv('fruit_data_TSNE_Test.csv', encoding='utf-8', header =0)

#Read in PCA scaled data
PCA = pd.read_csv('fruit_data_PCA_3D.csv', encoding='utf-8', header =0)
PCA_test = pd.read_csv('fruit_data_PCA_Test.csv', encoding='utf-8', header =0)

MDS

Unnamed: 0,num,Number,Fruit,x,y,z
0,0,0,Apple Braeburn,-41.043967,-107.693648,-88.554706
1,1,0,Apple Braeburn,-34.765763,57.618670,-22.866320
2,2,0,Apple Braeburn,-36.508676,51.984085,-25.250339
3,3,0,Apple Braeburn,-43.754312,52.575582,-27.394515
4,4,0,Apple Braeburn,-43.819419,45.801030,-23.053853
...,...,...,...,...,...,...
67687,67687,130,Watermelon,98.937003,47.814529,148.544267
67688,67688,130,Watermelon,86.792808,70.545749,146.839740
67689,67689,130,Watermelon,140.491577,122.092144,1.310591
67690,67690,130,Watermelon,77.017471,62.021115,146.639501


### Clean up the data so that it is usable

In [20]:
#pull out the target values
MDS_target = MDS['Number']
MDS_test_target = MDS_test['Number']

TSNE_target = TSNE['Number']
TSNE_test_target = TSNE_test['Number']

PCA_target = PCA['Number']
PCA_test_target = PCA_test['Number']

#pull out the names of the fuit (indexing same for all scallings)
name_of_fruit = MDS['Fruit']

#Drop irrelevent columbs and save them to a new array
MDS_clean = MDS.drop(['num', 'Number', 'Fruit'], axis=1)
MDS_test_clean = MDS_test.drop(['num', 'Number', 'Fruit'], axis=1)

TSNE_clean = TSNE.drop(['num', 'Number', 'Fruit'], axis=1)
TSNE_test_clean = TSNE_test.drop(['num', 'Number', 'Fruit'], axis=1)

PCA_clean = PCA.drop(['num', 'Number', 'Fruit'], axis=1)
PCA_test_clean = PCA_test.drop(['num', 'Number', 'Fruit'], axis=1)

### Create RF models for all three scalings

In [23]:
#Create Random Forest model for MDS
rf_MDS = RandomForestRegressor() # Base Random Forest with no parameter tuning
rf_MDS.fit(MDS_clean, MDS_target.values.ravel()) #.values.ravel() converts MDS_target so it can be used 
rf_MDS_train_pred = rf_MDS.predict(MDS_clean)
rf_MDS_test_pred = rf_MDS.predict(MDS_test_clean)

#print accuracy scores of the trainning and test
training_mse_MDS = mean_squared_error(rf_MDS_train_pred, MDS_target)
test_mse_MDS = mean_squared_error(rf_MDS_test_pred, MDS_test_target)
cross_val_mse = -cross_val_score(rf_MDS, MDS_clean, MDS_target, scoring='neg_mean_squared_error').mean()
print(training_mse_MDS , test_mse_MDS, cross_val_mse)

199.27603989245407 1633.6779478270455 2439.8722288115882


In [24]:
#Create Random Forest model for TSNE
rf_TSNE = RandomForestRegressor() # Base Random Forest with no parameter tuning
rf_TSNE.fit(TSNE_clean, TSNE_target.values.ravel())
rf_TSNE_train_pred = rf_TSNE.predict(TSNE_clean)
rf_TSNE_test_pred = rf_TSNE.predict(TSNE_test_clean)

#print accuracy scores of the trainning and test
training_mse_TSNE = mean_squared_error(rf_TSNE_train_pred, TSNE_target)
test_mse_TSNE = mean_squared_error(rf_TSNE_test_pred, TSNE_test_target)
cross_val_mse = -cross_val_score(rf_TSNE, TSNE_clean, TSNE_target, scoring='neg_mean_squared_error').mean()
print(training_mse_TSNE , test_mse_TSNE, cross_val_mse)

182.71223366128936 1565.4014099964738 2344.8342394826695


In [25]:
#Create Random Forest model for PCA
rf_PCA = RandomForestRegressor() # Base Random Forest with no parameter tuning
rf_PCA.fit(PCA_clean, PCA_target.values.ravel())
rf_PCA_train_pred = rf_PCA.predict(PCA_clean)
rf_PCA_test_pred = rf_PCA.predict(PCA_test_clean)

#print accuracy scores of the trainning and test
training_mse_PCA = mean_squared_error(rf_PCA_train_pred, PCA_target)
test_mse_PCA = mean_squared_error(rf_PCA_test_pred, PCA_test_target)
cross_val_mse = -cross_val_score(rf_PCA, PCA_clean, PCA_target, scoring='neg_mean_squared_error').mean()
print(training_mse_PCA , test_mse_PCA, cross_val_mse)

111.06571839360633 1825.5102548219324 2615.884161947056
