# Logistic Regression Analysis for MDS, TSNE, and PCA Scaling

In [28]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
%config Completer.use_jedi = False

###  Read in data

In [5]:
# All six CSV files are read the same way: UTF-8 text with the column names
# on the first row.
csv_options = {'encoding': 'utf-8', 'header': 0}

# MDS-scaled data: 3D file plus its test file
MDS = pd.read_csv('fruit_data_MDS_3D.csv', **csv_options)
MDS_test = pd.read_csv('fruit_data_MDS_Test.csv', **csv_options)

# TSNE-scaled data: 3D file plus its test file
TSNE = pd.read_csv('fruit_data_TSNE_3D.csv', **csv_options)
TSNE_test = pd.read_csv('fruit_data_TSNE_Test.csv', **csv_options)

# PCA-scaled data: 3D file plus its test file
# (PCA here is a DataFrame, not an estimator class)
PCA = pd.read_csv('fruit_data_PCA_3D.csv', **csv_options)
PCA_test = pd.read_csv('fruit_data_PCA_Test.csv', **csv_options)

# Show the MDS frame to check which columns were loaded
MDS

Unnamed: 0,num,Number,Fruit,x,y,z
0,0,0,Apple Braeburn,-41.043967,-107.693648,-88.554706
1,1,0,Apple Braeburn,-34.765763,57.618670,-22.866320
2,2,0,Apple Braeburn,-36.508676,51.984085,-25.250339
3,3,0,Apple Braeburn,-43.754312,52.575582,-27.394515
4,4,0,Apple Braeburn,-43.819419,45.801030,-23.053853
...,...,...,...,...,...,...
67687,67687,130,Watermelon,98.937003,47.814529,148.544267
67688,67688,130,Watermelon,86.792808,70.545749,146.839740
67689,67689,130,Watermelon,140.491577,122.092144,1.310591
67690,67690,130,Watermelon,77.017471,62.021115,146.639501


### Clean up the data so that it is usable

In [7]:
def drop_non_feature_columns(frame):
    """Return a new frame holding only the columns meant to be model inputs.

    Removes the bookkeeping/label columns 'num', 'Number' and 'Fruit', and
    also removes 'Unnamed: 0' when the frame has it.
    """
    features = frame.drop(['num', 'Number', 'Fruit'], axis=1)
    # In the displayed MDS frame, 'Unnamed: 0' just repeats the row number, and
    # the rows appear to be ordered by fruit, so it is a row counter rather than
    # a scaling coordinate. It was previously left in the feature matrix.
    # errors='ignore' keeps this safe for any file saved without that column.
    return features.drop(columns=['Unnamed: 0'], errors='ignore')


# Pull out the target values (the numeric class label of each fruit)
MDS_target = MDS['Number']
MDS_test_target = MDS_test['Number']

TSNE_target = TSNE['Number']
TSNE_test_target = TSNE_test['Number']

PCA_target = PCA['Number']
PCA_test_target = PCA_test['Number']

# Pull out the names of the fruit (indexing is the same for all scalings)
name_of_fruit = MDS['Fruit']

# Drop the irrelevant columns and keep the result in new frames, so the
# original frames stay untouched
MDS_clean = drop_non_feature_columns(MDS)
MDS_test_clean = drop_non_feature_columns(MDS_test)

TSNE_clean = drop_non_feature_columns(TSNE)
TSNE_test_clean = drop_non_feature_columns(TSNE_test)

PCA_clean = drop_non_feature_columns(PCA)
PCA_test_clean = drop_non_feature_columns(PCA_test)

### Create logistic regression models for all three scalings

In [19]:
# Fit a logistic regression on the MDS features.
# max_iter raises the iteration limit from 100 to 100,000.
lg_MDS = LogisticRegression(max_iter=100000).fit(MDS_clean, MDS_target)

# Predict on the data the model was fitted on and on the test set
lg_MDS_train_pred = lg_MDS.predict(MDS_clean)
lg_MDS_test_pred = lg_MDS.predict(MDS_test_clean)

# Accuracy on the training and test data (true labels first, predictions second)
training_acc_MDS = accuracy_score(MDS_target, lg_MDS_train_pred)
test_acc_MDS = accuracy_score(MDS_test_target, lg_MDS_test_pred)
print(training_acc_MDS, test_acc_MDS)

0.01453642971104414 0.014456981664315938


In [31]:
# Fit a logistic regression on the TSNE features, with the iteration limit
# set to 100,000.
lg_TSNE = LogisticRegression(max_iter=100000).fit(TSNE_clean, TSNE_target)

# Predict on the data the model was fitted on and on the test set
lg_TSNE_train_pred = lg_TSNE.predict(TSNE_clean)
lg_TSNE_test_pred = lg_TSNE.predict(TSNE_test_clean)

# Accuracy on the training and test data (true labels first, predictions second)
training_acc_TSNE = accuracy_score(TSNE_target, lg_TSNE_train_pred)
test_acc_TSNE = accuracy_score(TSNE_test_target, lg_TSNE_test_pred)
print(training_acc_TSNE, test_acc_TSNE)

0.01458074809430952 0.014501057827926657


In [21]:
# Fit a logistic regression on the PCA features, with the iteration limit
# set to 100,000.
lg_PCA = LogisticRegression(max_iter=100000).fit(PCA_clean, PCA_target)

# Predict on the data the model was fitted on and on the test set
lg_PCA_train_pred = lg_PCA.predict(PCA_clean)
lg_PCA_test_pred = lg_PCA.predict(PCA_test_clean)

# Accuracy on the training and test data (true labels first, predictions second)
training_acc_PCA = accuracy_score(PCA_target, lg_PCA_train_pred)
test_acc_PCA = accuracy_score(PCA_test_target, lg_PCA_test_pred)
print(training_acc_PCA, test_acc_PCA)

0.01453642971104414 0.014456981664315938


In [43]:
# Take the best-scoring scaling (TSNE) and tweak it: re-fit with the iteration
# limit raised to 1,000,000. This rebinds lg_TSNE, its predictions and its
# accuracy variables from the earlier TSNE cell.
# NOTE: the recorded output below is identical to the 100,000-iteration run.
lg_TSNE = LogisticRegression(max_iter=1000000).fit(TSNE_clean, TSNE_target)

# Predict on the data the model was fitted on and on the test set
lg_TSNE_train_pred = lg_TSNE.predict(TSNE_clean)
lg_TSNE_test_pred = lg_TSNE.predict(TSNE_test_clean)

# Accuracy on the training and test data (true labels first, predictions second)
training_acc_TSNE = accuracy_score(TSNE_target, lg_TSNE_train_pred)
test_acc_TSNE = accuracy_score(TSNE_test_target, lg_TSNE_test_pred)
print(training_acc_TSNE, test_acc_TSNE)

0.01458074809430952 0.014501057827926657
