In [1]:
import pandas as pd
import numpy as np
import scipy as sc
from IPython.display import display
import folium
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import SMOTE,RandomOverSampler
from sklearn.metrics import recall_score,precision_score


In [2]:
# Amazon facility list; thousands="," lets comma-grouped numbers parse as numeric.
amz = pd.read_csv(
    "Data/CSV//amazon_labelled_locations.csv",
    thousands=",",
)

# Moody's / ACS table; it is filtered on its "Years" column further down.
moodys = pd.read_csv(
    "Data/CSV/MoodysData-ACS.csv",
    thousands=",",
)

# County reference file: column names are supplied explicitly, so the first
# line of the file is read as data. dtype=str loads every column as text
# (presumably to keep zero-padded FIPS codes intact -- confirm against the file).
us_county = pd.read_csv(
    "Data/national_county.txt",
    dtype=str,
    names=["state", "state_code", "county_code", "county", "FIPS_class_code"],
)

In [3]:
# Full state name -> two-letter postal abbreviation for the 50 states.
# Used to turn the Amazon table's "State" column into the "<county>, <ST>"
# join key. Territories and the District of Columbia are not included.
us_state_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

In [4]:
# Peek at the first row of each raw table to check the column layout.
display(amz.head(1), moodys.head(1), us_county.head(1))

Unnamed: 0,State,Code,Location,Square Feet,Year Opened,Description of Operation,Type,County,City,Year
0,Arizona,DPX1,"500 S. 48th Street, Phoenix, Arizona, USA, 85034",,May 2015,Delivery Station for Phoenix West Valley Co-lo...,Delivery Station,Maricopa County,Phoenix,2015.0


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Code,Include in Amazon Dataset,Years,Current Senior Most Rating*,Tax Backed Rating Description,State,Population (ACS Data),Per Capita Income (ACS Data),Median Family Income (ACS Data),Median Home Value (ACS Data),Median Gross Rent (ACS Data),Occupied Housing Units (ACS Data),Seasonal Homes (ACS Data),No. Persons/Household (ACS Data),Median Age (ACS Data),% Below Poverty Level (ACS Data)
0,Median,Median,,,2017.0,,,,,,,,,,,,,


Unnamed: 0,state,state_code,county_code,county,FIPS_class_code
0,AL,1,1,Autauga County,H1


In [None]:
# Build `final`: one row per 2012 Moody's county record, labelled with whether
# an Amazon facility is located there and tagged with the county's geoid.

# --- County reference table: derive the two join keys ----------------------
# geoid = "0500000US" + state code + county code; the map cells match it
# against the GeoJSON "AFFGEOID" property.
us_county["geoid"] = "0500000US" + us_county["state_code"] + us_county["county_code"]
# "<county>, <ST>" is the key shared with the Moody's and Amazon tables.
us_county["county_state"] = us_county["county"] + ", " + us_county["state"]
us_county_trimmed = us_county.drop(
    ["state", "state_code", "county_code", "county", "FIPS_class_code"], axis=1
)

# --- Moody's / ACS table: keep 2012 only ------------------------------------
# "Unnamed: 1" holds the "<county>, <ST>" label, so it becomes the join key;
# "Unnamed: 0" is dropped.
moodys_2012 = (
    moodys[moodys["Years"] == 2012]
    .rename(columns={"Unnamed: 1": "county_state"})
    .drop(["Unnamed: 0"], axis=1)
)

# --- Amazon facilities: build the same key and flag every row ---------------
# Raises KeyError if a "State" value is missing from us_state_abbrev.
amz["county_state"] = amz.apply(
    axis=1,
    func=lambda row: str(row["County"]) + ", " + us_state_abbrev[row["State"]],
)
amz_trimmed = amz[["county_state"]].assign(warehouse=1)

# --- Label the counties ------------------------------------------------------
# NOTE(review): this is a left join against a facility-level table, so a county
# whose key appears several times in amz_trimmed is repeated once per facility.
# TODO: don't remove the dupes; collapse them into a per-county count column.
t1 = pd.merge(moodys_2012, amz_trimmed, how="left", on="county_state")

# Counties with no matching facility come out of the join as NaN -> label 0.
# Assign the result back: calling fillna(..., inplace=True) on a column
# selection warns on pandas 2.x and does not modify t1 under Copy-on-Write.
t1["warehouse"] = t1["warehouse"].fillna(0)

# --- Attach geoid and keep only rows that resolved to a known county --------
t2 = pd.merge(t1, us_county_trimmed, how="left", on="county_state")
final = t2[t2["geoid"].notnull()]

In [None]:
# County-boundary GeoJSON, reused by the later map cells.
# Forward slashes resolve on every OS; the previous backslash-separated
# literal only worked on Windows.
state_geo = "Data/cb_2013_us_county_5m.geojson"

# Choropleth of the 0/1 "warehouse" label per county.
m1 = folium.Map(location=[40, -100], zoom_start=4.5)

# Map.choropleth() was deprecated and then removed from folium; the Choropleth
# class accepts the same keyword arguments and is attached with add_to().
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=final,
    columns=['geoid', 'warehouse'],       # key column, value column
    key_on='feature.properties.AFFGEOID',  # GeoJSON property matched against geoid
    fill_color='RdBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Existing Amazon warehouses",
).add_to(m1)

folium.LayerControl().add_to(m1)
m1.save("map1.html")
m1

In [None]:
# Train logistic-regression classifiers for the "warehouse" label under several
# class-rebalancing strategies, then compare them on a held-out test split.
import warnings
# NOTE(review): this silences every warning, including deprecation notices.
warnings.simplefilter("ignore")

# Drop identifier / descriptive columns; what remains is the feature columns
# followed by the label.
data_1 = final.drop(
    [
        "geoid",
        "county_state",
        "State",
        "Code",
        "Include in Amazon Dataset",
        "Years",
        "Current Senior Most Rating*",
        "Tax Backed Rating Description",
        "No. Persons/Household (ACS Data)",
    ],
    axis=1,
)
# Rows with any missing value are discarded.
data = data_1.dropna()

# Positional split: every column except the last is a feature, the last one is
# the label (expected to be "warehouse" given the merge order upstream).
# DataFrame.ix was removed in pandas 1.0; .iloc is the positional equivalent.
x = data.iloc[:, 0:-1]
y = data.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Hyper-parameter grid shared by every search (keys follow make_pipeline's
# "<lowercased class name>__<param>" naming).
grid_grid = {
    "logisticregression__solver": ["lbfgs", "liblinear", "newton-cg"],
    "logisticregression__C": np.logspace(-4, 4, 10),
}


def fit_logistic_grid(features, labels, cv):
    """Grid-search a StandardScaler + LogisticRegression pipeline, scored by F1.

    Parameters
    ----------
    features, labels : training inputs and targets passed to ``fit``.
    cv : number of cross-validation folds.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    pipeline = make_pipeline(StandardScaler(), LogisticRegression())
    search = GridSearchCV(pipeline, grid_grid, cv=cv, scoring="f1")
    search.fit(features, labels)
    return search


models = {}

# Baseline: no resampling, 3-fold CV.
model_1 = fit_logistic_grid(x_train, y_train, cv=3)
models["basic_logistic"] = model_1

# NOTE(review): the three variants below resample the training set *before*
# cross-validation, so validation folds contain resampled points. Moving the
# sampler into an imblearn Pipeline would keep each fold clean.
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.

# Random undersampling.
x_train_under, y_train_under = RandomUnderSampler(random_state=42).fit_resample(x_train, y_train)
model_2 = fit_logistic_grid(x_train_under, y_train_under, cv=5)
models["random_undersampled"] = model_2

# SMOTE.
x_train_smote, y_train_smote = SMOTE(random_state=42).fit_resample(x_train, y_train)
model_3 = fit_logistic_grid(x_train_smote, y_train_smote, cv=5)
models["SMOTE"] = model_3

# Random oversampling.
x_train_over, y_train_over = RandomOverSampler(random_state=42).fit_resample(x_train, y_train)
model_4 = fit_logistic_grid(x_train_over, y_train_over, cv=5)
models["random_oversampled"] = model_4

#TODO: mutual information

# Evaluate every model on the untouched test split. GridSearchCV.score uses the
# search's own scoring ("f1"), hence the F1_score column.
score_rows = []
for model_name, fitted in models.items():
    y_pred = fitted.predict(x_test)
    score_rows.append(
        [
            model_name,
            fitted.score(x_test, y_test),
            recall_score(y_test, y_pred),
            precision_score(y_test, y_pred),
        ]
    )
m_frame = pd.DataFrame(score_rows, columns=["model", "F1_score", "recall", "precision"])
display(m_frame)

In [None]:
# Score every usable county with the chosen model and attach the results to
# the full county table (which still carries geoid for mapping).
model = model_1

probabilities = model.predict_proba(x)[:, 1]  # column 1 of predict_proba
predicted = model.predict(x)

# Select exactly the rows the model was given. `x` was produced by dropping
# rows with ANY missing value, so re-filtering `final` with
# dropna(how="all", ...) kept rows with partial NaNs and made the assignments
# below fail with a length mismatch. Indexing by x.index keeps the two frames
# aligned row for row.
data_plus_proba = final.loc[x.index].copy()

data_plus_proba["proba"] = probabilities
data_plus_proba["predicted"] = predicted

In [None]:
def my_color_function(feature):
    """Pick a fill colour for one GeoJSON county feature.

    The feature's ``properties.AFFGEOID`` is looked up in the ``geoid`` column
    of ``data_plus_proba``; the first matching row decides the colour:

    * green  ``#006600`` - warehouse present and predicted
    * blue   ``#000099`` - warehouse present but not predicted
    * red    ``#CC0000`` - predicted but no warehouse
    * white  ``#FFFFFF`` - any other combination
    * black  ``#000000`` - county has boundary data but no row in our dataset
    """
    county_id = feature["properties"]["AFFGEOID"]
    matches = data_plus_proba[data_plus_proba["geoid"] == county_id]

    # Boundary exists in the GeoJSON but the county is absent from our data.
    if len(matches) == 0:
        return "#000000"

    record = matches.iloc[0]
    has_warehouse = record["warehouse"] == 1
    no_warehouse = record["warehouse"] == 0
    predicted_yes = record["predicted"] == 1
    predicted_no = record["predicted"] == 0

    if has_warehouse and predicted_yes:
        return "#006600"
    if has_warehouse and predicted_no:
        return "#000099"
    if no_warehouse and predicted_yes:
        return "#CC0000"
    return "#FFFFFF"

In [None]:
# Predicted results: every county polygon is filled with the colour that
# my_color_function assigns to it; outlines are thin, faint black lines.
m2 = folium.Map(zoom_start=4.5, location=[40, -100])

g1 = folium.features.GeoJson(
    state_geo,
    style_function=lambda feature: dict(
        fillColor=my_color_function(feature),
        color='black',
        weight=1,
        opacity=0.2,
    ),
)
g1.add_to(m2)

m2.save("map2.html")
m2

In [None]:
# Choropleth of the model's predicted probability ("proba") for each county.
m3 = folium.Map(location=[40, -100], zoom_start=4.5)

# Map.choropleth() was deprecated and then removed from folium; the Choropleth
# class accepts the same keyword arguments and is attached with add_to().
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=data_plus_proba,
    columns=['geoid', 'proba'],            # key column, value column
    key_on='feature.properties.AFFGEOID',  # GeoJSON property matched against geoid
    fill_color='RdBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Probability of receiving Amazon warehouse",  # typo "receving" fixed
).add_to(m3)

folium.LayerControl().add_to(m3)
m3.save("map3.html")
m3

