# Principal component analysis (PCA) 

Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD.

It uses the LAPACK implementation of the full SVD or a randomized truncated SVD by the method of Halko et al. 2009, depending on the shape of the input data and the number of components to extract.

It can also use the scipy.sparse.linalg ARPACK implementation of the truncated SVD.

Notice that this class does not support sparse input. See TruncatedSVD for an alternative with sparse data.



https://scikit-learn.org/stable/auto_examples/neighbors/plot_nca_dim_reduction.html?highlight=principal%20component%20analysis



In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
import graphviz 
import math
from sklearn.decomposition import PCA


In [None]:
# Download epaMpg.csv from GitHub and parse it into a DataFrame.
epa = pd.read_csv('https://raw.githubusercontent.com/sqlshep/SQLShepBlog/master/data/epaMpg.csv')
# Bare expression: shows the loaded frame as the cell output.
epa

In [None]:

# Drop the row number (the first column of the loaded frame).
epa = epa.drop(epa.columns[[0]], axis=1)

# Replace the "." in the column names with "_".
# regex=False forces a literal match: read as a regular expression, "." matches
# every character, and the default value of `regex` differs between pandas
# versions, so it is spelled out here.
epa.columns = epa.columns.str.replace('.', '_', regex=False)

# Drop useless columns. The positions in the second call refer to the frame
# that remains after the first call, so the two drops must stay in this order.
epa = epa.drop(epa.columns[[0, 1, 2]], axis=1)
epa = epa.drop(epa.columns[[3, 9, 11]], axis=1)

# Cast the two code columns to the category dtype so that pd.get_dummies
# one-hot encodes them (by default it only converts object, string and
# category columns).
epa['Tested_Transmission_Type_Code'] = epa['Tested_Transmission_Type_Code'].astype('category')
epa['Drive_System_Code'] = epa['Drive_System_Code'].astype('category')

# One hot encode categories
epa = pd.get_dummies(epa)

In [None]:
# Bare expression: displays the current contents of epa as the cell output.
epa


In [None]:
# Features: every column except the target 'FuelEcon'.
# (To experiment with a single predictor, select only the 'Weight' column.)
epa_X = epa.loc[:, epa.columns != 'FuelEcon']
# Target: the 'FuelEcon' column, selected with a boolean mask so the result
# stays a DataFrame rather than a Series.
epa_y = epa.loc[:, epa.columns == 'FuelEcon']

In [None]:
# Configure a PCA estimator that keeps 8 components; nothing is fitted yet.
pca = PCA(n_components=8)

In [None]:
# Fit the PCA on the feature frame and project it onto the retained components.
# NOTE(review): the fit uses every row of epa_X and happens before the
# train/test split further down, so the rows that later form the test set
# also influence the projection.
principalComponents = pca.fit_transform(epa_X)

In [None]:
# Wrap the projected array in a DataFrame with one column per component,
# named V1, V2, ... The names are derived from the array width so they always
# match the number of components actually produced.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['V%d' % i for i in range(1, principalComponents.shape[1] + 1)],
)

In [None]:
# Bare expression: displays the principal-component frame as the cell output.
principalDf

In [None]:
# Draw a grid of pairwise scatter plots for the component columns on a
# 15 x 15 figure; the returned axes are kept in `plot`.
plot = pd.plotting.scatter_matrix(frame=principalDf, figsize=(15, 15))

In [None]:
# Set the seaborn figure size to 12 x 8.
sns.set(rc={'figure.figsize': (12, 8)})
# Annotated heatmap of the correlation matrix of the component columns.
sns.heatmap(data=principalDf.corr(), annot=True)

Now that the data has been run through PCA, split it into training and test sets and fit a RandomForestRegressor.


In [None]:
# Split the training and test set: 20% of the rows are held out for testing.
# A fixed random_state makes the shuffle reproducible, so the split does not
# change from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    principalDf, epa_y, test_size=0.20, random_state=42
)

In [None]:
# Random forest with default hyper-parameters. A fixed random_state removes the
# forest's own run-to-run randomness (bootstrap samples and feature choices).
epa_forest = RandomForestRegressor(random_state=42)

In [None]:
# Fit the forest on the training components.
# y_train is a DataFrame (the target was selected with a column mask), i.e. a
# column-shaped y; flatten it to 1-D so scikit-learn does not emit a
# DataConversionWarning about a column-vector target.
epa_forest.fit(X_train, y_train.values.ravel())

In [None]:
# Predict the target for the held-out test rows with the fitted forest.
epa_y_pred = epa_forest.predict(X_test)

In [None]:
# Compute the error once and reuse it for both the MSE and RMSE lines.
mse = mean_squared_error(y_test, epa_y_pred)

# The mean squared error
print('Mean squared error: %.2f' % mse)

# The root mean squared error
print('Root Mean squared error: %.2f' % math.sqrt(mse))

# The coefficient of determination: 1 is perfect prediction
r2 = r2_score(y_test, epa_y_pred)
print('Coefficient of determination: %.2f' % r2)