# Clustering and counties
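- Here, we explore using county as a grouping to find patterns in the data (note: the models fitted below are supervised classifiers - KNN, logistic regression, SVM - rather than clustering algorithms in the strict sense)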

In [1]:
#Imports
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

from sklearn.svm import SVC

In [2]:
# Load the COVID case records and the gas-station locations from the local data folder.
covid_df, gas_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/covid_with_alpha.csv', './data/gas_stations.csv')
)

In [3]:
covid_df.head(2)

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,log_cases,category,ordinal_cat,alpha
0,2020-10-23,Sonoma,97.0,Bodega,1,10 or fewer,-122.973889,38.345278,0.0,0,0,84.244604
1,2020-07-02,Plumas,63.0,Indian Valley,1,,-120.885787,40.081003,0.0,0,0,44.516162


In [4]:
gas_df.head(2)

Unnamed: 0,name,longitude,latitude
0,Circle K & Minimart,-117.190777,34.057365
1,,-117.225609,34.048575


In [5]:
covid_df['county'].nunique()

48

# Is it even worth it to use counties?
- only if I can use it as a predictor for gas stations
- this means that I have to see if I can predict the counties based on the latitude and longitude
    - if I can, I will do this for gas_df and create a 'county' column
    - if I can't, there is no use.

In [6]:
# Encode the county names as integer class labels so they can serve as a
# classification target. Reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
le = LabelEncoder()

# Fit the encoder and produce the encoded column in a single step; `le` stays
# fitted so the integer labels can be mapped back to county names later.
covid_df['county_encoded'] = le.fit_transform(covid_df['county'])

In [7]:
# Features: the two coordinate columns. Target: the integer-encoded county.
X, y = covid_df[['x', 'y']], covid_df['county_encoded']

# Hold out a test split (default proportions) with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=22,
)

In [8]:
# Standardise the coordinates, then classify the county with k-nearest neighbours.
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Hyper-parameters to search, keyed by the pipeline's auto-generated step name.
params = dict(
    kneighborsclassifier__n_neighbors=[1, 2, 3],
    kneighborsclassifier__weights=['uniform', 'distance'],
)

# 5-fold cross-validated search on all available cores; fit() returns the
# fitted search object, so it can be assigned directly.
grid = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs=-1).fit(X_train, y_train)

# Accuracy of the refit best estimator on the training and held-out splits.
print('train score: ', grid.score(X_train, y_train))
print('test score: ', grid.score(X_test, y_test))



train score:  0.9579180509413068
test score:  0.9139072847682119


In [9]:
grid.best_params_

{'kneighborsclassifier__n_neighbors': 3,
 'kneighborsclassifier__weights': 'uniform'}

## Yes, absolutely!
- counties are very predictable from latitude and longitude (test accuracy of about 0.91 with KNN)

### Make gas_df 'county' column based on above

In [61]:
# The county classifier (`grid`) was fitted on a frame whose columns are named
# 'x' and 'y' (longitude and latitude, in that order). Recent scikit-learn
# versions validate feature names at predict time and reject a frame whose
# column names differ from those seen during fit, so rename the gas-station
# coordinates to match while keeping the same column order.
X_gas = gas_df[['longitude', 'latitude']].rename(
    columns={'longitude': 'x', 'latitude': 'y'}
)
grid.predict(X_gas)

array([29, 29, 29, ..., 30, 24, 23])

In [63]:
# Predict the encoded county for every gas station, then translate the integer
# labels back into county names with the fitted LabelEncoder.
county_codes = grid.predict(X_gas)
gas_df['county'] = list(le.inverse_transform(county_codes))

In [64]:
gas_df.head()

Unnamed: 0,name,longitude,latitude,county
0,Circle K & Minimart,-117.190777,34.057365,San Bernardino
1,,-117.225609,34.048575,San Bernardino
2,,-117.156299,34.041388,San Bernardino
3,Arco,-122.420268,37.768468,San Francisco
4,Valley Oil,-122.070127,37.381165,Santa Clara


### Clean up covid_df a bit

In [69]:
# Drop the columns that are not needed for modeling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
covid_df = covid_df.drop(
    columns=['fips', 'place', 'note', 'category'],
    errors='ignore',
)

In [70]:
covid_df.head(1)

Unnamed: 0,date,county,confirmed_cases,x,y,log_cases,ordinal_cat,alpha
0,2020-10-23,Sonoma,1,-122.973889,38.345278,0.0,0,84.244604


# Modeling
# NOTE: Look at Shiff's notebook for real modeling results

### Train/test split

In [74]:
# 'alpha' is deliberately left out of the features: including it overfit the
# model to the training data.
X, y = covid_df[['x', 'y', 'county']], covid_df['ordinal_cat']

# Same fixed seed as the county split so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=22,
)

### Categorical vs Numeric
- I need a pipeline that handles numeric and categorical features separately:
    - categorical: impute missing values with a constant, then one-hot encode the `county` column
    - numeric: impute missing values with the median, then apply a standard scaler

In [75]:
# Numeric branch: median-impute any missing coordinates, then standardise them.
numeric_features = ['x', 'y']
numeric_transformer = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Categorical branch: replace missing counties with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_features = ['county']
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Route each column group through its own branch (categorical first, then
# numeric); any column not listed is passed through untouched.
column_transformer = make_column_transformer(
    (categorical_transformer, categorical_features),
    (numeric_transformer, numeric_features),
    remainder='passthrough',
)
    

# Gridsearch

## Logistic Regression

In [88]:
# Preprocess with the shared column transformer, then fit a logistic regression.
# max_iter is raised to 1000.
pipe = make_pipeline(column_transformer, LogisticRegression(max_iter=1000))

# Sweep the inverse regularisation strength over several orders of magnitude.
params = dict(logisticregression__C=[.01, .1, 1, 10, 100, 1000])

# Cross-validated search with GridSearchCV defaults; fit() returns the fitted
# search object, which later cells use for prediction.
log_gs = GridSearchCV(pipe, params).fit(X_train, y_train)

print('train score: ', log_gs.score(X_train, y_train))
print('test score: ', log_gs.score(X_test, y_test))

train score:  0.4440753045404208
test score:  0.3741721854304636


In [89]:
log_gs.best_params_

{'logisticregression__C': 10}

## KNN

In [78]:
# Same preprocessing as the logistic model, followed by k-nearest neighbours.
pipe = make_pipeline(column_transformer, KNeighborsClassifier())

# Small neighbourhood sizes, with and without distance weighting.
params = dict(
    kneighborsclassifier__n_neighbors=[1, 2, 3],
    kneighborsclassifier__weights=['uniform', 'distance'],
)

# Cross-validated search with GridSearchCV defaults.
gs = GridSearchCV(pipe, params).fit(X_train, y_train)

print('train score: ', gs.score(X_train, y_train))
print('test score: ', gs.score(X_test, y_test))

train score:  0.6821705426356589
test score:  0.3443708609271523


In [79]:
gs.best_params_

{'kneighborsclassifier__n_neighbors': 2,
 'kneighborsclassifier__weights': 'uniform'}

## SVM

In [82]:
# Bare SVC with default settings. NOTE(review): `clf` is not referenced by any
# later cell shown in this notebook (the grid search builds its own SVC()).
clf = SVC()

In [85]:
# Preprocess with the shared column transformer, then fit a support vector classifier.
pipe = make_pipeline(column_transformer, SVC())

# Candidate values shared by the kernel-specific grids below.
c_values = np.logspace(-3, 2, 10)
gamma_values = np.logspace(-5, 2, 10)

# `gamma` is only a kernel coefficient for 'rbf', 'poly' and 'sigmoid'; the
# linear kernel ignores it. Sweeping gamma for the linear kernel therefore
# refits the same model ten times per C value, so the grid is split by kernel
# to remove those redundant candidates (400 -> 310).
params = [
    {
        'svc__kernel': ['linear'],
        'svc__C': c_values,
    },
    {
        'svc__kernel': ['rbf', 'sigmoid', 'poly'],
        'svc__C': c_values,
        'svc__gamma': gamma_values,
    },
]

# Run the (still large) search in parallel on all cores, matching the county
# grid search earlier in the notebook, so it can finish in reasonable time.
gs = GridSearchCV(pipe, params, n_jobs = -1)

gs.fit(X_train, y_train)
print('train score: ', gs.score(X_train, y_train))
print('test score: ', gs.score(X_test, y_test))

KeyboardInterrupt: 

# Using Logistic regression

In [97]:
# Summary of the gas-station frame before renaming.
gas_df.info()

# Rename the coordinate columns to the 'x'/'y' names used by the COVID feature
# columns; rename() returns a new frame, so gas_df itself is left unchanged.
gas_df_copy = gas_df.rename(columns=dict(longitude='x', latitude='y'))

# Summary after renaming, to confirm only the column labels changed.
gas_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2927 entries, 0 to 2926
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       2169 non-null   object 
 1   longitude  2927 non-null   float64
 2   latitude   2927 non-null   float64
 3   county     2927 non-null   object 
dtypes: float64(2), object(2)
memory usage: 91.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2927 entries, 0 to 2926
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    2169 non-null   object 
 1   x       2927 non-null   float64
 2   y       2927 non-null   float64
 3   county  2927 non-null   object 
dtypes: float64(2), object(2)
memory usage: 91.6+ KB


In [98]:
# Select the same three feature columns, in the same order, that the
# logistic-regression search was trained on.
X_gas = gas_df_copy.loc[:, ['x', 'y', 'county']]

# Preview the predicted category for each gas station.
log_gs.predict(X_gas)

array([1, 3, 1, ..., 3, 3, 0])

In [99]:
# Store the logistic-regression grid search's predicted category for every gas station.
# NOTE(review): this re-runs the prediction already previewed in the cell above.
gas_df_copy['ordinal_cat'] = log_gs.predict(X_gas)

In [100]:
gas_df_copy

Unnamed: 0,name,x,y,county,ordinal_cat
0,Circle K & Minimart,-117.190777,34.057365,San Bernardino,1
1,,-117.225609,34.048575,San Bernardino,3
2,,-117.156299,34.041388,San Bernardino,1
3,Arco,-122.420268,37.768468,San Francisco,2
4,Valley Oil,-122.070127,37.381165,Santa Clara,3
...,...,...,...,...,...
2922,Pilot,-117.087724,34.854655,San Bernardino,1
2923,,-117.850397,33.823337,Orange,3
2924,Shell,-117.034808,32.594424,San Diego,3
2925,Mobil,-117.738206,33.861640,Orange,3


In [101]:
# Persist the gas stations with their predicted category; index=False keeps the
# row index out of the file.
gas_df_copy.to_csv('./data/gas_stations_with_cat.csv', index = False)

In [102]:
# Read back the file written in the previous cell into a fresh frame.
gas = pd.read_csv('./data/gas_stations_with_cat.csv')

In [103]:
gas.head()

Unnamed: 0,name,x,y,county,ordinal_cat
0,Circle K & Minimart,-117.190777,34.057365,San Bernardino,1
1,,-117.225609,34.048575,San Bernardino,3
2,,-117.156299,34.041388,San Bernardino,1
3,Arco,-122.420268,37.768468,San Francisco,2
4,Valley Oil,-122.070127,37.381165,Santa Clara,3


In [105]:
gas.isna().sum()

name           758
x                0
y                0
county           0
ordinal_cat      0
dtype: int64

In [108]:
# Only the `name` column has missing values that should read 'Unknown Name'.
# Filling that column explicitly (rather than calling fillna on the whole frame)
# guarantees a string placeholder can never be written into the numeric
# coordinate or category columns if they ever contain NaN.
gas['name'] = gas['name'].fillna('Unknown Name')

In [109]:
gas.head()

Unnamed: 0,name,x,y,county,ordinal_cat
0,Circle K & Minimart,-117.190777,34.057365,San Bernardino,1
1,Unknown Name,-117.225609,34.048575,San Bernardino,3
2,Unknown Name,-117.156299,34.041388,San Bernardino,1
3,Arco,-122.420268,37.768468,San Francisco,2
4,Valley Oil,-122.070127,37.381165,Santa Clara,3


In [110]:
# Save the frame with the placeholder station names filled in (row index omitted).
gas.to_csv('./data/gas_stations_filled_names_with_cat.csv', index = False)

In [121]:
# Reload the filled-names file written in the previous cell.
gas2 = pd.read_csv('./data/gas_stations_filled_names_with_cat.csv')

In [122]:
gas2.head()

Unnamed: 0,name,x,y,county,ordinal_cat
0,Circle K & Minimart,-117.190777,34.057365,San Bernardino,1
1,Unknown Name,-117.225609,34.048575,San Bernardino,3
2,Unknown Name,-117.156299,34.041388,San Bernardino,1
3,Arco,-122.420268,37.768468,San Francisco,2
4,Valley Oil,-122.070127,37.381165,Santa Clara,3


In [123]:
# Translate the numeric category into a human-readable risk label.
risk_labels = {
    1: 'Low Risk',
    2: 'Mild Risk',
    3: 'Medium Risk',
    4: 'High Risk',
    5: 'Very High Risk',
}
gas2['COVID-19 danger'] = gas2['ordinal_cat'].map(risk_labels)

# Series.map leaves NaN for any code that is not a key of the mapping. Category 0
# occurs in covid_df['ordinal_cat'] and in the model's predictions but has no
# label here, so report such codes instead of letting them become NaN silently.
# NOTE(review): confirm whether 0 needs its own label or the labels should start at 0.
unlabelled_codes = sorted(set(gas2['ordinal_cat'].unique()) - set(risk_labels))
if unlabelled_codes:
    print(
        "WARNING: ordinal_cat values with no risk label "
        f"(left as NaN in 'COVID-19 danger'): {unlabelled_codes}"
    )

In [124]:
gas2.head()

Unnamed: 0,name,x,y,county,ordinal_cat,COVID-19 danger
0,Circle K & Minimart,-117.190777,34.057365,San Bernardino,1,Low Risk
1,Unknown Name,-117.225609,34.048575,San Bernardino,3,Medium Risk
2,Unknown Name,-117.156299,34.041388,San Bernardino,1,Low Risk
3,Arco,-122.420268,37.768468,San Francisco,2,Mild Risk
4,Valley Oil,-122.070127,37.381165,Santa Clara,3,Medium Risk


In [126]:
# Write the final table, including the 'COVID-19 danger' label column, without the row index.
gas2.to_csv('./data/gas_stations_covid_danger.csv', index = False)