# Modeling

Import packages

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the three interim tables and derive per-row migration summaries.
#
# The inflow and outflow files receive identical treatment (row total, largest
# single column, and the difference between the two), so that logic lives in one
# helper instead of two copy-pasted stanzas that can drift apart.


def _zero_pad_geo_id(frame):
    """Rewrite ``frame["geo_id"]`` in place as 5-character zero-padded strings.

    Returns the same frame so the call can be used inline.
    """
    frame["geo_id"] = frame["geo_id"].apply(lambda code: "%05d" % (code,))
    return frame


def _read_exemptions(path, movers_column):
    """Read one exemptions CSV and append ``sum``, ``home`` and ``movers_column``.

    ``sum`` is the row total over every column except ``year`` and ``geo_id``.
    ``home`` is the largest single value among those same columns
    (presumably the count for the row's own county -- TODO confirm).
    ``movers_column`` is ``sum - home``.
    """
    flows = pd.read_csv(path, encoding="latin-1")
    # Snapshot the count columns before any summary column is appended, so that
    # neither "sum" nor "home" feeds back into the statistics.
    counts = flows.drop(["year", "geo_id"], axis=1)
    flows["sum"] = counts.sum(axis=1)
    flows["home"] = counts.max(axis=1)
    flows[movers_column] = flows["sum"] - flows["home"]
    return _zero_pad_geo_id(flows)


# read establishment data
biz = _zero_pad_geo_id(pd.read_csv("data/interim/establishments.csv", encoding="latin-1"))

# read immigration data and compute summary statistics
exemptions_in = _read_exemptions("data/interim/exemptions_inflow.csv", "immigrants")

# read emigration data and compute summary statistics
exemptions_out = _read_exemptions("data/interim/exemptions_outflow.csv", "emigrants")

In [3]:
# Roll the inflow columns up to one column per 2-character label prefix
# (the original note describes this as aggregating immigration info by state).

# NOTE: this binds a second name to the same DataFrame rather than copying it,
# so the truncated labels below also apply to `exemptions_in`
# ("geo_id" -> "ge", "year" -> "ye", "sum" -> "su", "home" -> "ho",
# "immigrants" -> "im").
state_ins = exemptions_in
state_ins.columns = state_ins.columns.str[:2]

# Transpose so the truncated labels become rows, add up the rows that share a
# label, then transpose back to one row per (ge, ye) pair and discard "su".
state_ins = (
    state_ins.set_index(["ge", "ye"])
    .transpose()
    .reset_index()
    .groupby("index")
    .sum()
    .transpose()
    .drop("su", axis=1)
)

# NOTE(review): the row max is taken over every remaining column, "ho" and "im"
# included; this presumably relies on the home-state total being the largest
# value in the row -- confirm.
row_max = state_ins.max(axis=1)
state_ins["state_immigrants"] = row_max - state_ins["ho"]

  return self.obj.drop(self.exclusions, axis=1)


In [4]:
# Start the samples table: per-row totals joined to the aggregated inflow summary.

# `exemptions_in` carries the 2-character labels here because the aggregation
# cell renamed its columns through the shared `state_ins` reference.
row_totals = exemptions_in[["ge", "ye", "su"]]
inflow_summary = state_ins.reset_index()[["ge", "ye", "ho", "im", "state_immigrants"]]

samples = row_totals.merge(inflow_summary, how="left", on=["ge", "ye"])

# Swap the truncated labels for descriptive names (positional assignment).
samples.columns = [
    "geo_id",
    "year",
    "population",
    "stayed_home",
    "total_immigrants",
    "in_state_immigrants",
]


In [5]:
# Roll the outflow columns up to one column per 2-character label prefix
# (aggregating emigration info by state).

# NOTE: `state_outs` initially refers to the same DataFrame as `exemptions_out`,
# so the truncated labels below also apply to `exemptions_out`
# ("geo_id" -> "ge", "year" -> "ye", "sum" -> "su", "home" -> "ho",
# "emigrants" -> "em").
state_outs = exemptions_out
state_outs.columns = state_outs.columns.str[0:2]

# Transpose so the truncated labels become rows, add up the rows that share a
# label, then transpose back to one row per (ge, ye) pair and discard "su".
state_outs = state_outs.set_index(["ge", "ye"]).transpose()
state_outs = state_outs.reset_index().groupby("index").sum()
state_outs = state_outs.transpose().drop("su", axis=1)

# Subtract this table's own "ho" column. Reaching into `state_ins` here would
# make the result depend on index alignment with the inflow table and produce
# NaN for any (ge, ye) pair that is missing from it.
# NOTE(review): the row max also scans the "ho" and "em" columns; this presumably
# relies on the home-state total being the largest value in the row -- confirm.
state_outs["state_emigrants"] = state_outs.max(axis=1) - state_outs["ho"]

  return self.obj.drop(self.exclusions, axis=1)


In [6]:
# Extend the samples table with the outflow summary, then attach it to the business data.

# Give the outflow columns their final names up front so the join can use the
# shared key names directly and no temporary key columns need dropping afterwards.
outflow_summary = state_outs.reset_index()[["ge", "ye", "em", "state_emigrants"]].rename(
    columns={
        "ge": "geo_id",
        "ye": "year",
        "em": "total_emigrants",
        "state_emigrants": "in_state_emigrants",
    }
)
samples = samples.merge(outflow_summary, how="left", on=["geo_id", "year"])

# add business data: the retail establishment counts form the left side of the
# join, so every (geo_id, year) row present in `biz` is kept.
retail_counts = biz[["geo_id", "year", "Retail trade"]]
samples = retail_counts.merge(samples, how="left", on=["geo_id", "year"])

In [7]:
# Pair every county-year row with the same county's figures for the following year.

# Helper key: the year that precedes each row's own year.
samples["previous_year"] = samples["year"] - 1

# Self-join: a left row for year N matches the right row whose previous_year is N,
# i.e. the row for year N + 1. Left-hand columns get "_y1", right-hand "_y2".
paired = samples.merge(
    samples,
    how="left",
    left_on=["geo_id", "year"],
    right_on=["geo_id", "previous_year"],
    suffixes=("_y1", "_y2"),
)

# Missing values (unmatched rows as well as gaps left by the earlier left joins)
# become 0, the helper keys are discarded, and every column is cast to int.
# The cast includes geo_id, so its zero padding is not preserved.
paired = paired.fillna(0)
paired = paired.drop(["previous_year_y1", "previous_year_y2"], axis=1)
paired = paired.astype(int)

# A year_y2 of 0 marks a row that found no following-year match; keep only the
# rows that were paired.
samples = paired[paired["year_y2"] != 0]

In [8]:
# Preview the paired table: `_y1` columns describe one year, `_y2` the following year.
samples.head(15)

Unnamed: 0,geo_id,year_y1,Retail trade_y1,population_y1,stayed_home_y1,total_immigrants_y1,in_state_immigrants_y1,total_emigrants_y1,in_state_emigrants_y1,year_y2,Retail trade_y2,population_y2,stayed_home_y2,total_immigrants_y2,in_state_immigrants_y2,total_emigrants_y2,in_state_emigrants_y2
0,1001,2005,176,42270,35901,6369,2185,5221,1883,2006,180,43578,36850,6728,2519,5647,2102
1,1001,2006,180,43578,36850,6728,2519,5647,2102,2007,176,44329,37767,6562,2394,5570,2127
2,1001,2007,176,44329,37767,6562,2394,5570,2127,2008,176,45929,39518,6411,2548,5648,2047
3,1001,2008,176,45929,39518,6411,2548,5648,2047,2009,175,46822,40719,6103,2230,5778,2118
4,1001,2009,175,46822,40719,6103,2230,5778,2118,2010,173,45868,40099,5769,2225,4943,1925
5,1001,2010,173,45868,40099,5769,2225,4943,1925,2011,168,46890,40643,6247,2331,5559,2080
6,1001,2011,168,46890,40643,6247,2331,5559,2080,2012,163,47808,41757,6051,2682,6185,2652
7,1001,2012,163,47808,41757,6051,2682,6185,2652,2013,164,47654,41552,6102,2520,5695,2216
8,1001,2013,164,47654,41552,6102,2520,5695,2216,2014,165,47191,41198,5993,2148,5974,2137
9,1001,2014,165,47191,41198,5993,2148,5974,2137,2015,169,45971,42354,3617,1735,3329,1542


In [9]:
# TODO: should I add columns for business growth and population growth, or would ordinal values be better?

In [10]:
# Label each row: True when the retail count is higher in year 2 than in year 1
# (an unchanged count is labelled False).
samples["growth"] = samples["Retail trade_y2"].gt(samples["Retail trade_y1"])

In [11]:
# Preview the table again to check the boolean `growth` column.
samples.head(15)

Unnamed: 0,geo_id,year_y1,Retail trade_y1,population_y1,stayed_home_y1,total_immigrants_y1,in_state_immigrants_y1,total_emigrants_y1,in_state_emigrants_y1,year_y2,Retail trade_y2,population_y2,stayed_home_y2,total_immigrants_y2,in_state_immigrants_y2,total_emigrants_y2,in_state_emigrants_y2,growth
0,1001,2005,176,42270,35901,6369,2185,5221,1883,2006,180,43578,36850,6728,2519,5647,2102,True
1,1001,2006,180,43578,36850,6728,2519,5647,2102,2007,176,44329,37767,6562,2394,5570,2127,False
2,1001,2007,176,44329,37767,6562,2394,5570,2127,2008,176,45929,39518,6411,2548,5648,2047,False
3,1001,2008,176,45929,39518,6411,2548,5648,2047,2009,175,46822,40719,6103,2230,5778,2118,False
4,1001,2009,175,46822,40719,6103,2230,5778,2118,2010,173,45868,40099,5769,2225,4943,1925,False
5,1001,2010,173,45868,40099,5769,2225,4943,1925,2011,168,46890,40643,6247,2331,5559,2080,False
6,1001,2011,168,46890,40643,6247,2331,5559,2080,2012,163,47808,41757,6051,2682,6185,2652,False
7,1001,2012,163,47808,41757,6051,2682,6185,2652,2013,164,47654,41552,6102,2520,5695,2216,True
8,1001,2013,164,47654,41552,6102,2520,5695,2216,2014,165,47191,41198,5993,2148,5974,2137,True
9,1001,2014,165,47191,41198,5993,2148,5974,2137,2015,169,45971,42354,3617,1735,3329,1542,True


In [12]:
from sklearn.decomposition import PCA

In [13]:
# Count the columns of `samples` (identifier, year, label and measurement columns alike).
len(samples.columns)

18

In [14]:
# Fit a PCA that keeps as many components as `samples` has columns. The count is
# read from the frame instead of being hard-coded, so it stays correct if the
# column set changes.
# NOTE(review): `samples` still contains geo_id, both year columns and the boolean
# `growth` label at this point -- confirm they are meant to be part of the fit.
pca = PCA(n_components=samples.shape[1])
pca.fit(samples)

PCA(copy=True, iterated_power='auto', n_components=18, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [15]:
# Build the PCA feature table: discard the identifier, both calendar years, the
# `growth` label, both population totals and the year-2 retail count.
non_feature_columns = [
    "geo_id",
    "year_y1",
    "year_y2",
    "growth",
    "population_y1",
    "population_y2",
    "Retail trade_y2",
]
samples2 = samples.drop(non_feature_columns, axis=1)
samples2.head()

Unnamed: 0,Retail trade_y1,stayed_home_y1,total_immigrants_y1,in_state_immigrants_y1,total_emigrants_y1,in_state_emigrants_y1,stayed_home_y2,total_immigrants_y2,in_state_immigrants_y2,total_emigrants_y2,in_state_emigrants_y2
0,176,35901,6369,2185,5221,1883,36850,6728,2519,5647,2102
1,180,36850,6728,2519,5647,2102,37767,6562,2394,5570,2127
2,176,37767,6562,2394,5570,2127,39518,6411,2548,5648,2047
3,176,39518,6411,2548,5648,2047,40719,6103,2230,5778,2118
4,175,40719,6103,2230,5778,2118,40099,5769,2225,4943,1925


In [16]:
# (rows, columns) of the reduced feature table.
samples2.shape

(31420, 11)

In [17]:
# Refit PCA on the reduced feature table, keeping one component per column.
# The count is taken from the frame itself rather than a hand-copied literal.
# `pca` is rebound here, replacing the earlier model fitted on `samples`.
pca = PCA(n_components=samples2.shape[1])
pca.fit(samples2)

PCA(copy=True, iterated_power='auto', n_components=11, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [18]:
# Fraction of total variance explained by each principal component.
# NOTE(review): the features were not rescaled before this fit, so the columns
# with the largest raw magnitudes presumably dominate the leading component --
# interpret these ratios with care.
pca.explained_variance_ratio_

array([  9.98526275e-01,   1.11228002e-03,   1.62234518e-04,
         1.12323293e-04,   4.45099475e-05,   2.32069519e-05,
         9.35111522e-06,   7.90765526e-06,   9.63302166e-07,
         6.29189594e-07,   3.18739198e-07])

In [19]:
from sklearn import preprocessing

In [20]:
# Rescale `samples2` with the L2 norm before re-running PCA.
# NOTE(review): `normalize` presumably scales each row (sample) to unit length
# rather than standardising each column -- confirm that is the intended
# preprocessing for PCA.
samples3 = preprocessing.normalize(samples2, norm='l2')

In [21]:
# Check the dimensions of the rescaled data.
samples3.shape

(31420, 11)

In [22]:
# Refit PCA on the rescaled data, keeping one component per column. The count
# is read from the array's second dimension instead of a hand-copied literal.
# `pca` is rebound here, replacing the model fitted on the unscaled table.
pca = PCA(n_components=samples3.shape[1])
pca.fit(samples3)

PCA(copy=True, iterated_power='auto', n_components=11, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [23]:
# Explained-variance ratios of the PCA fitted on the L2-normalised data.
pca.explained_variance_ratio_

array([ 0.36107474,  0.31056064,  0.15857888,  0.05201319,  0.04362037,
        0.02688549,  0.0222333 ,  0.01342051,  0.00512984,  0.0048416 ,
        0.00164144])

In [24]:
# NOTE(review): bare literal that only echoes itself; it does not match any of
# the variance ratios displayed in this notebook -- presumably pasted from an
# earlier run. Consider removing this scratch cell.
3.95594535e-02

0.0395594535

In [25]:
# NOTE(review): bare literal that only echoes itself; it does not match any of
# the variance ratios displayed in this notebook -- presumably pasted from an
# earlier run. Consider removing this scratch cell.
1.16036119e-02

0.0116036119

In [30]:
# Keep only the year-1 columns of `samples2` by discarding every year-2 migration column.
# NOTE: `samples3` is rebound here to a new table derived from `samples2`.
year2_columns = [
    "stayed_home_y2",
    "total_immigrants_y2",
    "in_state_immigrants_y2",
    "total_emigrants_y2",
    "in_state_emigrants_y2",
]
samples3 = samples2.drop(year2_columns, axis=1)

In [31]:
# (rows, columns) after removing the year-2 columns.
samples3.shape

(31420, 6)

In [33]:
# Fit PCA on the year-1-only table, keeping one component per column. The count
# is read from the table instead of a hand-copied literal.
pca2 = PCA(n_components=samples3.shape[1])
pca2.fit(samples3)

PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [34]:
# Explained-variance ratios for the year-1-only table before any rescaling.
pca2.explained_variance_ratio_

array([  9.98668694e-01,   1.13916899e-03,   1.17847341e-04,
         6.36087902e-05,   1.00077921e-05,   6.73110232e-07])

In [35]:
# Replace `samples3` with its L2-normalised version.
# NOTE(review): this rebinds the same name to the function's return value, so the
# un-normalised table is no longer reachable as `samples3` afterwards.
samples3 = preprocessing.normalize(samples3, norm='l2')

In [36]:
# Check the dimensions after normalisation.
samples3.shape

(31420, 6)

In [42]:
# Fit PCA on the normalised year-1-only data, keeping one component per column.
# The count is read from the data's second dimension instead of a hand-copied literal.
pca4 = PCA(n_components=samples3.shape[1])
pca4.fit(samples3)

PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [43]:
# Explained-variance ratios for the normalised year-1-only data.
pca4.explained_variance_ratio_

array([ 0.5226109 ,  0.33703312,  0.05086285,  0.04712254,  0.03506683,
        0.00730375])