# Final Resource Analysis

### By: Shannon Gatta, Andrea Jorge, Chen Manija, Julia Zaratan

In [10]:
import numpy as np
import pandas as pd
import seaborn as sns # for visualiation
import matplotlib.pyplot as plt # plotting
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
from sklearn.model_selection import train_test_split

# scalers used 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer

# for validation
from sklearn.metrics import mean_absolute_error

# for feature selection 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_regression
from sklearn.datasets import make_regression

# models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

#for best model parameter selection
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline 

## Suppress warnings thrown by any of the models
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import ConvergenceWarning
# Import the standard-library module directly: `from sklearn import warnings`
# depends on an incidental attribute of the sklearn package, not a public API.
import warnings
# Silence the noisy categories emitted while fitting / converting data.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Load the dataset from disk (the file name suggests the categorical columns
# are already dummy-encoded). CSV columns 0, 1 and 2 together form the row index.
dems = pd.read_csv(
    filepath_or_buffer='./data/dem_with_dummies.csv',
    index_col=(0, 1, 2),
)

In [11]:
# Column names of the loaded frame. `dems` is the only frame loaded above;
# this is not the cell's last expression, so nothing is displayed.
list(dems)

# The first N_SUBSTRAIN_ROWS rows form the development subset; every remaining
# row goes to the second subset. Positional slicing gives an empty remainder
# when the frame has N_SUBSTRAIN_ROWS rows or fewer, whereas tail() with a
# negative count would return rows that overlap the first subset.
N_SUBSTRAIN_ROWS = 600
dems_substrain = dems.iloc[:N_SUBSTRAIN_ROWS]
dem_subtest = dems.iloc[N_SUBSTRAIN_ROWS:]

In [12]:
# Separate the outcome column from the predictors of the development subset,
# then split both into training and hold-out pieces.
substrain_features = dems_substrain.drop(columns='Primary.Status_Advanced')
substrain_outcome = dems_substrain["Primary.Status_Advanced"]

tr_feat, test_feat, tr_out, test_out = train_test_split(
    substrain_features,
    substrain_outcome,
    random_state=11,  # fixed seed so the split is repeatable
    test_size=0.30,   # fraction of rows held out for testing
)

## Machine Learning


### KNN Regressor

In [18]:
# Candidate hyper-parameters for the KNN step. Each key is prefixed with the
# step name that make_pipeline assigns to the estimator ('kneighborsregressor').
knn_param_grid = dict(
    kneighborsregressor__n_neighbors=range(1, 50),
    kneighborsregressor__weights=["uniform", "distance"],
    kneighborsregressor__algorithm=['auto'],
    kneighborsregressor__leaf_size=[20, 30, 50, 100],
)

# Pipeline steps, in order: min-max scaling -> univariate feature selection
# scored with f_regression (SelectKBest's default k) -> KNN regression.
min_max_scaler = MinMaxScaler()
select_k_best = SelectKBest(f_regression)
knn_reg = KNeighborsRegressor()
knn_pipe = make_pipeline(min_max_scaler, select_k_best, knn_reg)

# Exhaustive search over the grid using GridSearchCV's default cv and scoring.
knn_grid_skb = GridSearchCV(estimator=knn_pipe, param_grid=knn_param_grid)
knn_grid_skb.fit(tr_feat, tr_out)

# Score of the fitted search on the held-out split (displayed as cell output).
knn_grid_skb.score(test_feat, test_out)

0.4184930411134221

In [19]:
# Show the hyper-parameter combination selected by the KNN grid search.
knn_grid_skb.best_params_

{'kneighborsregressor__algorithm': 'auto',
 'kneighborsregressor__leaf_size': 20,
 'kneighborsregressor__n_neighbors': 6,
 'kneighborsregressor__weights': 'uniform'}

In [21]:
# Standalone KNN regressor configured with hard-coded hyper-parameters.
# NOTE(review): the values match the grid-search output shown above, but this
# estimator is created without the scaling / feature-selection steps that the
# searched pipeline used - confirm that is intended.
knn_tuned = KNeighborsRegressor(
    n_neighbors=6,
    weights='uniform',
    algorithm='auto',
    leaf_size=20,
)

In [23]:
# Fit the standalone KNN regressor on the training split; the returned
# estimator is the cell's displayed output.
knn_tuned.fit(X=tr_feat, y=tr_out)

KNeighborsRegressor(algorithm='auto', leaf_size=20, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=6, p=2,
          weights='uniform')

In [24]:
# Evaluate the standalone KNN regressor on the held-out split using the
# estimator's own score method (displayed as cell output).
knn_tuned.score(X=test_feat, y=test_out)

0.5296752519596866

In [None]:
# Summary table: one row per tuned model with its hold-out score.
accuracy = pd.DataFrame(columns=['Model Type', 'Score'])

# rows to add
# NOTE(review): `add_a_row` and the RFR / MLP / SVR searches are not defined in
# the visible part of the notebook - confirm they exist before this cell runs.
# NOTE(review): each score is multiplied by -1, which presumably assumes a
# negated-error scorer; the KNN search is built without an explicit `scoring`
# argument, so verify the sign convention for that row.
for model_label, fitted_search in (
    ('KNN', knn_grid_skb),
    ('RFR', for_grid_skb),
    ('MLP', mlp_grid_skb),
    ('SVR', svr_grid_skb),
):
    add_a_row([model_label, fitted_search.score(test_feat, test_out)*(-1)], accuracy)