# Modeling

Import packages

In [134]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

## Business Data

### Business Establishment Counts

In [135]:
# Load the establishment counts from the interim CSV.
biz = pd.read_csv("data/interim/establishments.csv", encoding="latin-1")
# Render the numeric geo_id as a zero-padded, 5-character string.
biz["geo_id"] = biz["geo_id"].map(lambda code: "%05d" % code)

### Employee Counts

In [136]:
# Load the employee counts from the interim CSV.
emp = pd.read_csv("data/interim/employees.csv", encoding="latin-1")
# Render the numeric geo_id as a zero-padded, 5-character string.
emp["geo_id"] = emp["geo_id"].map(lambda code: "%05d" % code)

## Immigration Data

### Tax Exemptions Claimed

In [137]:
# Read the exemption inflow (immigration) and outflow (emigration) tables.
exemptions_in = pd.read_csv("data/interim/exemptions_inflow.csv", encoding="latin-1")
exemptions_out = pd.read_csv("data/interim/exemptions_outflow.csv", encoding="latin-1")

# Both directions get the same summary columns; only the name of the net column differs.
for flows, net_col in ((exemptions_in, "immigrants"), (exemptions_out, "emigrants")):
    # Every column other than the year/geo_id keys is treated as a value column.
    counts = flows.drop(["year", "geo_id"], axis=1)
    flows["sum"] = counts.sum(axis=1)    # row total over the value columns
    flows["home"] = counts.max(axis=1)   # largest single value in the row
    flows[net_col] = flows["sum"] - flows["home"]
    # Zero-pad the numeric geo_id to a 5-character string.
    flows["geo_id"] = flows["geo_id"].map(lambda code: "%05d" % code)

#### Separating by in and out of state

In [138]:
# Aggregate the exemption inflow columns by 2-character label.
#
# Column labels are cut to their first two characters, so "geo_id" -> "ge",
# "year" -> "ye", "sum" -> "su", "home" -> "ho", "immigrants" -> "im"; the
# remaining labels presumably collapse to state codes (TODO confirm against the CSV).
# NOTE: state_ins starts as an alias of exemptions_in, so the relabelling below
# also changes exemptions_in's columns in place.

state_ins = exemptions_in
state_ins.columns = state_ins.columns.str[0:2]
# Transpose so labels become rows, sum rows that share a label, transpose back.
state_ins = state_ins.set_index(["ge", "ye"]).transpose()
state_ins = state_ins.reset_index().groupby("index").sum()
state_ins = state_ins.transpose().drop("su", axis=1)
# Row-wise maximum of the aggregated columns minus the "ho" column.
state_ins["state_immigrants"] = state_ins.max(axis=1) - state_ins["ho"]

# Aggregate the exemption outflow columns the same way
# ("emigrants" -> "em"; exemptions_out is relabelled in place as well).

state_outs = exemptions_out
state_outs.columns = state_outs.columns.str[0:2]
state_outs = state_outs.set_index(["ge", "ye"]).transpose()
state_outs = state_outs.reset_index().groupby("index").sum()
state_outs = state_outs.transpose().drop("su", axis=1)
# Subtract this frame's own "ho" column. The previous version subtracted
# state_ins["ho"], mixing the inflow table into the outflow figure (and relying
# on index alignment between two different frames).
state_outs["state_emigrants"] = state_outs.max(axis=1) - state_outs["ho"]

  return self.obj.drop(self.exclusions, axis=1)


### Adjusted Gross Income

In [139]:
# Read the adjusted-gross-income inflow (immigration) and outflow (emigration) tables.
agi_in = pd.read_csv("data/interim/agi_inflow.csv", encoding="latin-1")
agi_out = pd.read_csv("data/interim/agi_outflow.csv", encoding="latin-1")

# Both directions get the same summary columns; only the name of the net column differs.
for flows, net_col in ((agi_in, "immigrants"), (agi_out, "emigrants")):
    # Every column other than the year/geo_id keys is treated as a value column.
    amounts = flows.drop(["year", "geo_id"], axis=1)
    flows["sum"] = amounts.sum(axis=1)    # row total over the value columns
    flows["home"] = amounts.max(axis=1)   # largest single value in the row
    flows[net_col] = flows["sum"] - flows["home"]
    # Zero-pad the numeric geo_id to a 5-character string.
    flows["geo_id"] = flows["geo_id"].map(lambda code: "%05d" % code)

#### Separating by in and out of state

In [191]:
# AGI inflow aggregated by 2-character column label.
# NOTE: the relabelling is applied to agi_in itself (labels cut to two characters:
# "geo_id" -> "ge", "year" -> "ye", "sum" -> "su", "home" -> "ho", "immigrants" -> "im").
agi_in.columns = agi_in.columns.str[:2]
state_agi_ins = (
    agi_in.set_index(["ge", "ye"])
    .T                      # labels become rows
    .reset_index()
    .groupby("index")
    .sum()                  # add up rows sharing a label
    .T                      # back to one row per (ge, ye)
    .drop("su", axis=1)
)
# Row-wise maximum of the aggregated columns minus the "ho" column.
state_agi_ins["state_immigrants"] = state_agi_ins.max(axis=1).sub(state_agi_ins["ho"])

# AGI outflow, same procedure ("emigrants" -> "em"; agi_out is relabelled in place).
agi_out.columns = agi_out.columns.str[:2]
state_agi_outs = (
    agi_out.set_index(["ge", "ye"])
    .T
    .reset_index()
    .groupby("index")
    .sum()
    .T
    .drop("su", axis=1)
)
state_agi_outs["state_emigrants"] = state_agi_outs.max(axis=1).sub(state_agi_outs["ho"])

  return self.obj.drop(self.exclusions, axis=1)


### Tax Filings

In [141]:
# Read the tax-return inflow (immigration) and outflow (emigration) tables.
returns_in = pd.read_csv("data/interim/returns_inflow.csv", encoding="latin-1")
returns_out = pd.read_csv("data/interim/returns_outflow.csv", encoding="latin-1")

# Both directions get the same summary columns; only the name of the net column differs.
for flows, net_col in ((returns_in, "immigrants"), (returns_out, "emigrants")):
    # Every column other than the year/geo_id keys is treated as a value column.
    filings = flows.drop(["year", "geo_id"], axis=1)
    flows["sum"] = filings.sum(axis=1)    # row total over the value columns
    flows["home"] = filings.max(axis=1)   # largest single value in the row
    flows[net_col] = flows["sum"] - flows["home"]
    # Zero-pad the numeric geo_id to a 5-character string.
    flows["geo_id"] = flows["geo_id"].map(lambda code: "%05d" % code)

#### Tax filings by in and out of state

In [142]:
# Tax-return inflow aggregated by 2-character column label.
# NOTE: the relabelling is applied to returns_in itself (labels cut to two characters:
# "geo_id" -> "ge", "year" -> "ye", "sum" -> "su", "home" -> "ho", "immigrants" -> "im").
returns_in.columns = returns_in.columns.str[:2]
state_returns_ins = (
    returns_in.set_index(["ge", "ye"])
    .T                      # labels become rows
    .reset_index()
    .groupby("index")
    .sum()                  # add up rows sharing a label
    .T                      # back to one row per (ge, ye)
    .drop("su", axis=1)
)
# Row-wise maximum of the aggregated columns minus the "ho" column.
state_returns_ins["state_immigrants"] = state_returns_ins.max(axis=1).sub(state_returns_ins["ho"])

# Tax-return outflow, same procedure ("emigrants" -> "em"; returns_out is relabelled in place).
returns_out.columns = returns_out.columns.str[:2]
state_returns_outs = (
    returns_out.set_index(["ge", "ye"])
    .T
    .reset_index()
    .groupby("index")
    .sum()
    .T
    .drop("su", axis=1)
)
state_returns_outs["state_emigrants"] = state_returns_outs.max(axis=1).sub(state_returns_outs["ho"])

  return self.obj.drop(self.exclusions, axis=1)


## Merging Data Together

#### Business and employees

In [156]:
# Keep the join keys plus the real-estate establishment count.
# NOTE: this rebinds biz, so its other industry columns are no longer available below.
keep_cols = ["geo_id", "year", "Real estate and rental and leasing"]
biz = biz[keep_cols]
biz.head(n=5)

Unnamed: 0,geo_id,year,Real estate and rental and leasing
0,1001,2005,35
1,1001,2006,34
2,1001,2007,30
3,1001,2008,38
4,1001,2009,37


In [157]:
# Keep the join keys plus the real-estate employee count.
# NOTE: this rebinds emp, so its other industry columns are no longer available below.
keep_cols = ["geo_id", "year", "Real estate and rental and leasing"]
emp = emp[keep_cols]
emp.head(n=5)

Unnamed: 0,geo_id,year,Real estate and rental and leasing
0,1001,2005,157
1,1001,2006,175
2,1001,2007,109
3,1001,2008,116
4,1001,2009,0


In [162]:
# Left-join the employee counts onto the establishment counts by geo_id and year.
join_keys = ["geo_id", "year"]
retail = biz.merge(emp, how="left", left_on=join_keys, right_on=join_keys)
# Positional rename: the two same-named industry columns become establishments / employees.
retail.columns = ["geo_id", "year", "establishments", "employees"]
retail.head(n=5)

Unnamed: 0,geo_id,year,establishments,employees
0,1001,2005,35,157
1,1001,2006,34,175
2,1001,2007,30,109
3,1001,2008,38,116
4,1001,2009,37,0


#### Tax Exemptions Claimed

In [179]:
# Keys plus the "ho", "im" and in-state inflow columns from the aggregated inflow table.
exemptions_in = state_ins.reset_index()[["ge", "ye", "ho", "im", "state_immigrants"]]
# Out-of-state inflow = total inflow ("im") - in-state inflow.
exemptions_in["OOS_immigrants"] = exemptions_in["im"].sub(exemptions_in["state_immigrants"])
# Positional rename to descriptive labels.
exemptions_in.columns = [
    "geo_id",
    "year",
    "exemptions_stay",
    "exemptions_in_total",
    "exemptions_in_is",
    "exemptions_in_oos",
]
# The total is dropped once it has been split into in-state / out-of-state parts.
exemptions_in = exemptions_in.drop(["exemptions_in_total"], axis=1)
exemptions_in.head(n=5)

Unnamed: 0,geo_id,year,exemptions_stay,exemptions_in_is,exemptions_in_oos
0,1001,2005,35901.0,2185.0,4184.0
1,1001,2006,36850.0,2519.0,4209.0
2,1001,2007,37767.0,2394.0,4168.0
3,1001,2008,39518.0,2548.0,3863.0
4,1001,2009,40719.0,2230.0,3873.0


In [180]:
# Keys plus the "ho", "em" and in-state outflow columns from the aggregated outflow table.
exemptions_out = state_outs.reset_index()[["ge", "ye", "ho", "em", "state_emigrants"]]
# Out-of-state outflow = total outflow ("em") - in-state outflow.
# The temporary label says "emigrants" (it used to say "immigrants", a copy-paste
# slip); it is replaced by the positional rename on the next line, so results are unchanged.
exemptions_out["OOS_emigrants"] = exemptions_out["em"] - exemptions_out["state_emigrants"]
exemptions_out.columns = ["geo_id", "year", "exemptions_stay", "exemptions_out_total", "exemptions_out_is", "exemptions_out_oos"]
# The total is dropped once it has been split into in-state / out-of-state parts.
exemptions_out = exemptions_out.drop("exemptions_out_total", axis=1)
exemptions_out.head()

Unnamed: 0,geo_id,year,exemptions_stay,exemptions_out_is,exemptions_out_oos
0,1001,2005,35901.0,1883.0,3338.0
1,1001,2006,36850.0,2102.0,3545.0
2,1001,2007,37767.0,2127.0,3443.0
3,1001,2008,39518.0,2047.0,3601.0
4,1001,2009,40719.0,2118.0,3660.0


In [181]:
# Left-join outflow onto inflow; exemptions_stay is part of the key, so it appears once.
exemption_keys = ["geo_id", "year", "exemptions_stay"]
exemptions = exemptions_in.merge(exemptions_out, how="left", left_on=exemption_keys, right_on=exemption_keys)
exemptions.head(n=5)

Unnamed: 0,geo_id,year,exemptions_stay_x,exemptions_in_is,exemptions_in_oos,exemptions_stay_y,exemptions_out_is,exemptions_out_oos
0,1001,2005,35901.0,2185.0,4184.0,35901.0,1883.0,3338.0
1,1001,2006,36850.0,2519.0,4209.0,36850.0,2102.0,3545.0
2,1001,2007,37767.0,2394.0,4168.0,37767.0,2127.0,3443.0
3,1001,2008,39518.0,2548.0,3863.0,39518.0,2047.0,3601.0
4,1001,2009,40719.0,2230.0,3873.0,40719.0,2118.0,3660.0


#### Tax Filings

In [188]:
# Keys plus the "ho", "im" and in-state inflow columns from the aggregated returns inflow table.
returns_in = state_returns_ins.reset_index()[["ge", "ye", "ho", "im", "state_immigrants"]]
# Out-of-state inflow = total inflow ("im") - in-state inflow.
returns_in["OOS_immigrants"] = returns_in["im"].sub(returns_in["state_immigrants"])
# Positional rename to descriptive labels.
returns_in.columns = [
    "geo_id",
    "year",
    "returns_stay",
    "returns_in_total",
    "returns_in_is",
    "returns_in_oos",
]
# The total is dropped once it has been split into in-state / out-of-state parts.
returns_in = returns_in.drop(["returns_in_total"], axis=1)
returns_in.head(n=5)

Unnamed: 0,geo_id,year,returns_stay,returns_in_is,returns_in_oos
0,1001,2005,15062.0,951.0,1410.0
1,1001,2006,15473.0,1139.0,1551.0
2,1001,2007,15944.0,1072.0,1447.0
3,1001,2008,16791.0,1185.0,1458.0
4,1001,2009,17385.0,1038.0,1462.0


In [189]:
# Keys plus the "ho", "em" and in-state outflow columns from the aggregated returns outflow table.
returns_out = state_returns_outs.reset_index()[["ge", "ye", "ho", "em", "state_emigrants"]]
# Out-of-state outflow = total outflow ("em") - in-state outflow.
# The temporary label says "emigrants" (it used to say "immigrants", a copy-paste
# slip); it is replaced by the positional rename on the next line, so results are unchanged.
returns_out["OOS_emigrants"] = returns_out["em"] - returns_out["state_emigrants"]
returns_out.columns = ["geo_id", "year", "returns_stay", "returns_out_total", "returns_out_is", "returns_out_oos"]
# The total is dropped once it has been split into in-state / out-of-state parts.
returns_out = returns_out.drop("returns_out_total", axis=1)
returns_out.head()

Unnamed: 0,geo_id,year,returns_stay,returns_out_is,returns_out_oos
0,1001,2005,15062.0,853.0,1245.0
1,1001,2006,15473.0,971.0,1307.0
2,1001,2007,15944.0,1025.0,1284.0
3,1001,2008,16791.0,989.0,1398.0
4,1001,2009,17385.0,1018.0,1375.0


In [190]:
# Left-join outflow onto inflow; returns_stay is part of the key, so it appears once.
return_keys = ["geo_id", "year", "returns_stay"]
returns = returns_in.merge(returns_out, how="left", left_on=return_keys, right_on=return_keys)
returns.head(n=5)

Unnamed: 0,geo_id,year,returns_stay,returns_in_is,returns_in_oos,returns_out_is,returns_out_oos
0,1001,2005,15062.0,951.0,1410.0,853.0,1245.0
1,1001,2006,15473.0,1139.0,1551.0,971.0,1307.0
2,1001,2007,15944.0,1072.0,1447.0,1025.0,1284.0
3,1001,2008,16791.0,1185.0,1458.0,989.0,1398.0
4,1001,2009,17385.0,1038.0,1462.0,1018.0,1375.0


#### Adjusted Gross Income

In [195]:
# Keys plus the "ho", "im" and in-state inflow columns from the aggregated AGI inflow table.
agi_in = state_agi_ins.reset_index()[["ge", "ye", "ho", "im", "state_immigrants"]]
# Out-of-state inflow = total inflow ("im") - in-state inflow.
agi_in["OOS_immigrants"] = agi_in["im"].sub(agi_in["state_immigrants"])
# Positional rename to descriptive labels.
agi_in.columns = [
    "geo_id",
    "year",
    "agi_stay",
    "agi_in_total",
    "agi_in_is",
    "agi_in_oos",
]
# The total is dropped once it has been split into in-state / out-of-state parts.
agi_in = agi_in.drop(["agi_in_total"], axis=1)
agi_in.head(n=5)

Unnamed: 0,geo_id,year,agi_stay,agi_in_is,agi_in_oos
0,1001,2005,714261.0,32399.0,73510.0
1,1001,2006,756692.0,38883.0,82027.0
2,1001,2007,827611.0,37153.0,79737.0
3,1001,2008,901200.0,43332.0,83366.0
4,1001,2009,936888.0,36050.0,79697.0


In [197]:
# Keys plus the "ho", "em" and in-state outflow columns from the aggregated AGI outflow table.
agi_out = state_agi_outs.reset_index()[["ge", "ye", "ho", "em", "state_emigrants"]]
# Out-of-state outflow = total outflow ("em") - in-state outflow.
agi_out["OOS_emigrants"] = agi_out["em"].sub(agi_out["state_emigrants"])
# Positional rename to descriptive labels.
agi_out.columns = [
    "geo_id",
    "year",
    "agi_stay",
    "agi_out_total",
    "agi_out_is",
    "agi_out_oos",
]
# The total is dropped once it has been split into in-state / out-of-state parts.
agi_out = agi_out.drop(["agi_out_total"], axis=1)
agi_out.head(n=5)

Unnamed: 0,geo_id,year,agi_stay,agi_out_is,agi_out_oos
0,1001,2005,714261.0,26576.0,59705.0
1,1001,2006,756692.0,31657.0,70658.0
2,1001,2007,827611.0,36112.0,67092.0
3,1001,2008,901200.0,34063.0,69089.0
4,1001,2009,936888.0,34539.0,75861.0


In [198]:
# Left-join outflow onto inflow; agi_stay is part of the key, so it appears once.
agi_keys = ["geo_id", "year", "agi_stay"]
agi = agi_in.merge(agi_out, how="left", left_on=agi_keys, right_on=agi_keys)
agi.head(n=5)

Unnamed: 0,geo_id,year,agi_stay,agi_in_is,agi_in_oos,agi_out_is,agi_out_oos
0,1001,2005,714261.0,32399.0,73510.0,26576.0,59705.0
1,1001,2006,756692.0,38883.0,82027.0,31657.0,70658.0
2,1001,2007,827611.0,37153.0,79737.0,36112.0,67092.0
3,1001,2008,901200.0,43332.0,83366.0,34063.0,69089.0
4,1001,2009,936888.0,36050.0,79697.0,34539.0,75861.0


## Merge Business data with each migration data set

In [200]:
# Preview the merged establishment/employee table before joining migration data.
retail.head(n=5)

Unnamed: 0,geo_id,year,establishments,employees
0,1001,2005,35,157
1,1001,2006,34,175
2,1001,2007,30,109
3,1001,2008,38,116
4,1001,2009,37,0


In [None]:
# (scratch cell: the stray token "aaa" was removed; it raised NameError and halted Run All)

In [None]:
# Industry column of interest: "Real estate and rental and leasing"
# (kept as a comment: the bare text was not valid Python and stopped Run All)

In [None]:
# create a samples dataframe
#
# Built from state_ins alone. By this point `exemptions_in` has been rebuilt with
# descriptive column names (geo_id / year / exemptions_*), so it no longer has the
# "ge" / "ye" / "su" labels this cell used to select, and a top-to-bottom run
# raised KeyError. The row total is recoverable because the load cell defines
# immigrants = sum - home, i.e. sum = "ho" + "im".

samples = state_ins.reset_index()[["ge", "ye", "ho", "im", "state_immigrants"]].copy()
# Place the reconstructed total right after the two key columns.
samples.insert(2, "population", samples["ho"] + samples["im"])
samples.columns = ["geo_id", "year", "population", "stayed_home", "total_immigrants", "in_state_immigrants"]


In [6]:
# update samples dataframe with the outflow totals

samples = pd.merge(samples, state_outs.reset_index()[["ge", "ye", "em", "state_emigrants"]], how="left", left_on=["geo_id", "year"], right_on=["ge", "ye"])
# Positional rename; the right-hand key columns ("ge", "ye") are dropped right after.
samples.columns = ["geo_id", "year", "population", "stayed_home", "total_immigrants", "in_state_immigrants", "ge", "ye", "total_emigrants", "in_state_emigrants"]
samples = samples.drop(["ge", "ye"], axis=1)


# add business data
# `biz` was narrowed to the real-estate column in an earlier cell, so it no longer
# has "Retail trade" and a top-to-bottom run raised KeyError here. The column is
# read again from the interim file, with geo_id formatted the same way as before.
establishments = pd.read_csv("data/interim/establishments.csv", encoding="latin-1")
establishments["geo_id"] = establishments["geo_id"].apply(lambda x: "%05d" % (x,))
samples = pd.merge(establishments[["geo_id", "year", "Retail trade"]], samples, how="left", on=["geo_id", "year"])

In [7]:
# Tag each row with the year before it, so it can be matched to the preceding row.
samples["previous_year"] = samples["year"] - 1

# Self-join: the left row is year N ("_y1"); the right row is the one whose
# previous_year equals N, i.e. year N + 1 ("_y2").
# NOTE: this cell rebinds samples and is not re-runnable without rebuilding it first.
samples = samples.merge(
    samples,
    how="left",
    left_on=["geo_id", "year"],
    right_on=["geo_id", "previous_year"],
    suffixes=("_y1", "_y2"),
)

# Missing values become 0, the helper columns go away, and every column is cast to int
# (this also turns the zero-padded geo_id string into an integer).
samples = (
    samples.fillna(0)
    .drop(["previous_year_y1", "previous_year_y2"], axis=1)
    .astype(int)
)

# Keep only rows that found a following year: unmatched rows have year_y2 filled with 0.
samples = samples[samples["year_y2"].ne(0)]

In [8]:
# Inspect the paired year-1 / year-2 rows.
samples.head(n=15)

Unnamed: 0,geo_id,year_y1,Retail trade_y1,population_y1,stayed_home_y1,total_immigrants_y1,in_state_immigrants_y1,total_emigrants_y1,in_state_emigrants_y1,year_y2,Retail trade_y2,population_y2,stayed_home_y2,total_immigrants_y2,in_state_immigrants_y2,total_emigrants_y2,in_state_emigrants_y2
0,1001,2005,176,42270,35901,6369,2185,5221,1883,2006,180,43578,36850,6728,2519,5647,2102
1,1001,2006,180,43578,36850,6728,2519,5647,2102,2007,176,44329,37767,6562,2394,5570,2127
2,1001,2007,176,44329,37767,6562,2394,5570,2127,2008,176,45929,39518,6411,2548,5648,2047
3,1001,2008,176,45929,39518,6411,2548,5648,2047,2009,175,46822,40719,6103,2230,5778,2118
4,1001,2009,175,46822,40719,6103,2230,5778,2118,2010,173,45868,40099,5769,2225,4943,1925
5,1001,2010,173,45868,40099,5769,2225,4943,1925,2011,168,46890,40643,6247,2331,5559,2080
6,1001,2011,168,46890,40643,6247,2331,5559,2080,2012,163,47808,41757,6051,2682,6185,2652
7,1001,2012,163,47808,41757,6051,2682,6185,2652,2013,164,47654,41552,6102,2520,5695,2216
8,1001,2013,164,47654,41552,6102,2520,5695,2216,2014,165,47191,41198,5993,2148,5974,2137
9,1001,2014,165,47191,41198,5993,2148,5974,2137,2015,169,45971,42354,3617,1735,3329,1542


In [9]:
# TODO: should I add columns for business growth and population growth, or would ordinal values be better?

In [10]:
# Binary target: True when the year-2 retail count is strictly above the year-1 count.
samples["growth"] = samples["Retail trade_y2"] > samples["Retail trade_y1"]

In [11]:
# Inspect the rows again, now with the growth label attached.
samples.head(n=15)

Unnamed: 0,geo_id,year_y1,Retail trade_y1,population_y1,stayed_home_y1,total_immigrants_y1,in_state_immigrants_y1,total_emigrants_y1,in_state_emigrants_y1,year_y2,Retail trade_y2,population_y2,stayed_home_y2,total_immigrants_y2,in_state_immigrants_y2,total_emigrants_y2,in_state_emigrants_y2,growth
0,1001,2005,176,42270,35901,6369,2185,5221,1883,2006,180,43578,36850,6728,2519,5647,2102,True
1,1001,2006,180,43578,36850,6728,2519,5647,2102,2007,176,44329,37767,6562,2394,5570,2127,False
2,1001,2007,176,44329,37767,6562,2394,5570,2127,2008,176,45929,39518,6411,2548,5648,2047,False
3,1001,2008,176,45929,39518,6411,2548,5648,2047,2009,175,46822,40719,6103,2230,5778,2118,False
4,1001,2009,175,46822,40719,6103,2230,5778,2118,2010,173,45868,40099,5769,2225,4943,1925,False
5,1001,2010,173,45868,40099,5769,2225,4943,1925,2011,168,46890,40643,6247,2331,5559,2080,False
6,1001,2011,168,46890,40643,6247,2331,5559,2080,2012,163,47808,41757,6051,2682,6185,2652,False
7,1001,2012,163,47808,41757,6051,2682,6185,2652,2013,164,47654,41552,6102,2520,5695,2216,True
8,1001,2013,164,47654,41552,6102,2520,5695,2216,2014,165,47191,41198,5993,2148,5974,2137,True
9,1001,2014,165,47191,41198,5993,2148,5974,2137,2015,169,45971,42354,3617,1735,3329,1542,True


In [118]:
# Modelling table: drop identifiers, population and all year-2 migration columns.
# "Retail trade_y2" stays for now and is removed when the feature matrix is built.
samples2 = samples.drop(
    [
        "geo_id",
        "year_y1",
        "year_y2",
        "population_y1",
        "population_y2",
        "stayed_home_y2",
        "total_immigrants_y2",
        "in_state_immigrants_y2",
        "total_emigrants_y2",
        "in_state_emigrants_y2",
    ],
    axis=1,
)
samples2.head(n=5)

Unnamed: 0,Retail trade_y1,stayed_home_y1,total_immigrants_y1,in_state_immigrants_y1,total_emigrants_y1,in_state_emigrants_y1,Retail trade_y2,growth
0,176,35901,6369,2185,5221,1883,180,True
1,180,36850,6728,2519,5647,2102,176,False
2,176,37767,6562,2394,5570,2127,176,False
3,176,39518,6411,2548,5648,2047,175,False
4,175,40719,6103,2230,5778,2118,173,False


In [120]:
# Features are every column except the label and the year-2 count it is derived from.
X = samples2.drop(["growth", "Retail trade_y2"], axis=1)
# Target: the boolean growth label.
y = samples2["growth"]
# 75/25 split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=36
)

In [None]:
# Models to look at 

# logistic regression, 
# gaussian naive bayes, 
# random forest, 
# SVM


In [121]:
# Showing value of pca

# pca.explained_variance_ratio_

Model 1: Logistic Regression

In [122]:
# Pipeline stages: standardise -> PCA -> logistic regression.
steps = [
    ("scaler", preprocessing.StandardScaler()),
    ("pca", PCA()),
    ("model", LogisticRegression()),
]
# Grid: number of principal components to keep, 1 through 6.
parameters = dict(pca__n_components=np.arange(1, 7))

In [123]:
# Grid-search the pipeline on the training split, then predict the held-out split.
pipeline = Pipeline(steps=steps)
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [124]:
# Score of the fitted search on the held-out split.
cv.score(X=X_test, y=y_test)

0.66670910248249526

In [125]:
# Per-class precision / recall / f1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

      False       0.67      0.99      0.80      5174
       True       0.66      0.05      0.09      2681

avg / total       0.66      0.67      0.56      7855



Model 2: Nearest Neighbors

In [111]:
# Pipeline stages: standardise -> PCA -> k-nearest-neighbours classifier.
steps = [
    ("scaler", preprocessing.StandardScaler()),
    ("pca", PCA()),
    ("model", KNeighborsClassifier()),
]
# Grid: 1-6 principal components crossed with 1-9 neighbours.
parameters = dict(
    pca__n_components=np.arange(1, 7),
    model__n_neighbors=np.arange(1, 10),
)

In [112]:
# Grid-search the KNN pipeline on the training split, then predict the held-out split.
# NOTE: pipeline, cv and y_pred are rebound here, replacing the logistic-regression ones.
pipeline = Pipeline(steps=steps)
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [113]:
# Score of the fitted KNN search on the held-out split.
cv.score(X=X_test, y=y_test)

0.65117759388924257

In [114]:
# Per-class precision / recall / f1 for the KNN predictions on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

      False       0.67      0.91      0.78      5174
       True       0.46      0.14      0.22      2681

avg / total       0.60      0.65      0.59      7855



In [117]:
# Parameter combination selected by the most recently fitted grid search (`cv`).
cv.best_params_

{'model__n_neighbors': 8, 'pca__n_components': 6}