In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
state_sightings = pd.read_pickle("state_sightings.pkl")
state_sightings.columns

Index(['State', 'sightings', 'rel0_2007', 'rel1_2007', 'rel2_2007',
       'rel3_2007', 'rel4_2007', 'rel5_2007', 'par0_2007', 'par1_2007',
       'par2_2007', 'par3_2007', 'rel0_2014', 'rel1_2014', 'rel2_2014',
       'rel3_2014', 'rel4_2014', 'rel5_2014', 'par0_2014', 'par1_2014',
       'par2_2014', 'par3_2014'],
      dtype='object')

In [3]:
X = state_sightings[['rel0_2007', 'rel1_2007', 'rel2_2007', 'rel3_2007', 'rel4_2007',
               'rel5_2007', 'par0_2007', 'par1_2007', 'par2_2007', 'par3_2007',
               'rel0_2014', 'rel1_2014', 'rel2_2014', 'rel3_2014', 'rel4_2014', 
               'rel5_2014','par0_2014', 'par1_2014', 'par2_2014', 'par3_2014']]

y = state_sightings.sightings

In [4]:
from sklearn.decomposition import PCA

for n in range(1, len(X.columns)+1):
    target_number_of_components = n

    pca = PCA(n_components=target_number_of_components)
    pca.fit(X)
    print(f"Total variance accounted using {n} components: {round(pca.explained_variance_ratio_.sum()*100,4)}%")

print(f"Total variance accounted using all components: {round(PCA(n_components=None).fit(X).explained_variance_ratio_.sum(),4)*100}%")

Total variance accounted using 1 components: 30.2436%
Total variance accounted using 2 components: 55.7693%
Total variance accounted using 3 components: 69.5186%
Total variance accounted using 4 components: 78.9674%
Total variance accounted using 5 components: 84.3377%
Total variance accounted using 6 components: 88.2992%
Total variance accounted using 7 components: 91.8248%
Total variance accounted using 8 components: 94.5325%
Total variance accounted using 9 components: 96.2299%
Total variance accounted using 10 components: 97.427%
Total variance accounted using 11 components: 98.3779%
Total variance accounted using 12 components: 98.9388%
Total variance accounted using 13 components: 99.3486%
Total variance accounted using 14 components: 99.7365%
Total variance accounted using 15 components: 99.8981%
Total variance accounted using 16 components: 100.0%
Total variance accounted using 17 components: 100.0%
Total variance accounted using 18 components: 100.0%
Total variance accounted u

In [5]:
n_pcs = pca.components_.shape[0]

most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]

initial_feature_names = X.columns

most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]

dic = {'PC{}'.format(i): most_important_names[i] for i in range(n_pcs)}
df = pd.DataFrame(dic.items())
df.head(len(X.columns))

Unnamed: 0,0,1
0,PC0,rel1_2007
1,PC1,rel1_2014
2,PC2,rel2_2007
3,PC3,rel2_2014
4,PC4,par1_2007
5,PC5,par2_2007
6,PC6,par1_2014
7,PC7,par2_2014
8,PC8,rel0_2007
9,PC9,rel0_2014


In [6]:
target_number_of_components = 9

pca = PCA(n_components=target_number_of_components)
pca.fit(X)
pca.explained_variance_ratio_

array([0.30243562, 0.25525776, 0.13749224, 0.09448885, 0.05370226,
       0.03961529, 0.03525582, 0.02707757, 0.01697394])

In [7]:
pca_features = []
for p in range(1, target_number_of_components+1):
    pca_features.append("PC" + str(p))
pca_features

pca_transformed = pca.transform(X)

X_princ = pd.DataFrame(data=pca_transformed, columns = pca_features)
X_princ.head(3)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
0,0.406885,0.39133,-0.151143,0.059905,-0.015092,0.077574,-0.049307,0.039965,0.170257
1,0.024529,-0.081362,-0.068575,-0.174133,0.217762,-0.064547,-0.101341,0.24351,0.052672
2,0.038831,-0.153763,0.026344,0.22078,0.13395,-0.098967,-0.113982,0.005078,0.113823


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X_princ, y, test_size=.2, random_state=42)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((39, 20), (10, 20), (39,), (10,))

In [10]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors._regression import KNeighborsRegressor
from sklearn.tree._classes import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

results_df = pd.DataFrame(columns = ["Model", "Training RMSE", "Training R2", "Test RMSE", "Test R2"])

reg_list = [SGDRegressor(max_iter=500000), 
            LinearRegression(), 
            RandomForestRegressor(n_estimators=200), 
            KNeighborsRegressor(n_neighbors=2), 
            DecisionTreeRegressor(), 
            SVR(kernel='rbf')]

for mod in reg_list:
    reg_model = mod
    reg_model.fit(X_train, y_train)

    reg_model_pred_train = reg_model.predict(X_train)
    reg_model_pred_test = reg_model.predict(X_test)
    
    mse_train = mean_squared_error(y_train, reg_model_pred_train)
    mse_test = mean_squared_error(y_test, reg_model_pred_test)

    r2_train = r2_score(y_train, reg_model_pred_train)
    r2_test = r2_score(y_test, reg_model_pred_test)
    
    temp = pd.DataFrame({"Model": [mod],
                         "Training RMSE": [mse_train], 
                         "Training R2": [r2_train],
                         "Test RMSE": [mse_test],
                         "Test R2": [r2_test]})
    results_df = results_df.append(temp, ignore_index=True)

In [11]:
results_df.sort_values(by="Test RMSE")

Unnamed: 0,Model,Training RMSE,Training R2,Test RMSE,Test R2
5,SVR(),1825237.0,-0.091063,240007.338558,-0.308056
2,"(DecisionTreeRegressor(max_features='auto', ra...",211715.5,0.873444,502609.702893,-1.739256
3,KNeighborsRegressor(n_neighbors=2),455487.2,0.727726,550494.075,-2.000229
4,DecisionTreeRegressor(),0.0,1.0,580824.1,-2.16553
0,SGDRegressor(max_iter=500000),291245.4,0.825904,690305.407539,-2.76221
1,LinearRegression(),234091.1,0.860068,690922.0,-2.765571
