# Modeling

Import packages

In [1]:
import numpy as np
import pandas as pd

## Business Data

### Business Establishment Counts

In [2]:
# Load the establishment counts from the interim extract.
biz = pd.read_csv("data/interim/establishments.csv", encoding="latin-1")
# Re-encode geo_id as a zero-padded string of width 5 (e.g. 1001 -> "01001").
biz["geo_id"] = biz["geo_id"].map(lambda code: "%05d" % (code,))

### Employee Counts

In [3]:
# Load the employee counts from the interim extract.
emp = pd.read_csv("data/interim/employees.csv", encoding="latin-1")
# Re-encode geo_id as a zero-padded string of width 5, matching the establishment table.
emp["geo_id"] = emp["geo_id"].map(lambda code: "%05d" % (code,))

## Immigration Data

### Tax Exemptions Claimed

In [4]:
# --- inflow: exemptions claimed by arriving filers ---
exemptions_in = pd.read_csv("data/interim/exemptions_inflow.csv", encoding="latin-1")

# Row-wise summaries over every column except the identifiers:
#   sum        - total across all value columns
#   home       - largest single value column (presumably the non-mover count; confirm against the data)
#   immigrants - the total minus that largest column
exemptions_in["sum"] = exemptions_in.drop(["year", "geo_id"], axis="columns").sum(axis=1)
exemptions_in["home"] = exemptions_in.drop(["year", "sum", "geo_id"], axis="columns").max(axis=1)
exemptions_in["immigrants"] = exemptions_in["sum"].sub(exemptions_in["home"])
# geo_id becomes a zero-padded string of width 5.
exemptions_in["geo_id"] = exemptions_in["geo_id"].map(lambda code: "%05d" % (code,))

# --- outflow: exemptions claimed by departing filers ---
exemptions_out = pd.read_csv("data/interim/exemptions_outflow.csv", encoding="latin-1")

# Same three summaries as for the inflow table, with the remainder named "emigrants".
exemptions_out["sum"] = exemptions_out.drop(["year", "geo_id"], axis="columns").sum(axis=1)
exemptions_out["home"] = exemptions_out.drop(["year", "sum", "geo_id"], axis="columns").max(axis=1)
exemptions_out["emigrants"] = exemptions_out["sum"].sub(exemptions_out["home"])
exemptions_out["geo_id"] = exemptions_out["geo_id"].map(lambda code: "%05d" % (code,))

#### Separating by in and out of state

In [5]:
# Collapse the exemption tables so that columns sharing the same two-character
# prefix are summed into one column, then derive the in-state mover counts.
#
# NOTE: `state_ins = exemptions_in` (and likewise for the outflow frame) binds the
# same object, so the column renaming below is also visible through the original name.

# aggregating immigration info by state

state_ins = exemptions_in
# Truncate every column label to two characters: "geo_id" -> "ge", "year" -> "ye",
# "sum" -> "su", "home" -> "ho", "immigrants" -> "im". All other columns collapse
# onto their two-character prefix (presumably a state code; verify against the CSV).
state_ins.columns = state_ins.columns.str[0:2]
# Transpose so the now-duplicated labels become rows, then sum rows sharing a label.
state_ins = state_ins.set_index(["ge", "ye"]).transpose()
state_ins = state_ins.reset_index().groupby("index").sum()
# Back to one row per (ge, ye); dropping "su" keeps the grand total out of the row max below.
state_ins = state_ins.transpose().drop("su", axis=1)
# Largest remaining column minus the home count.
state_ins["state_immigrants"] = state_ins.max(axis=1) - state_ins["ho"]

# aggregating emigration info by state

state_outs = exemptions_out
state_outs.columns = state_outs.columns.str[0:2]
state_outs = state_outs.set_index(["ge", "ye"]).transpose()
state_outs = state_outs.reset_index().groupby("index").sum()
state_outs = state_outs.transpose().drop("su", axis=1)
# Fixed: subtract the outflow table's own home count. The original subtracted
# state_ins["ho"], mixing the inflow frame into the emigration figure.
state_outs["state_emigrants"] = state_outs.max(axis=1) - state_outs["ho"]

  return self.obj.drop(self.exclusions, axis=1)


### Adjusted Gross Income

In [6]:
# --- inflow: adjusted gross income of arriving filers ---
agi_in = pd.read_csv("data/interim/agi_inflow.csv", encoding="latin-1")

# Row-wise summaries over every column except the identifiers:
#   sum        - total across all value columns
#   home       - largest single value column (presumably the non-mover amount; confirm against the data)
#   immigrants - the total minus that largest column
agi_in["sum"] = agi_in.drop(["year", "geo_id"], axis="columns").sum(axis=1)
agi_in["home"] = agi_in.drop(["year", "sum", "geo_id"], axis="columns").max(axis=1)
agi_in["immigrants"] = agi_in["sum"].sub(agi_in["home"])
# geo_id becomes a zero-padded string of width 5.
agi_in["geo_id"] = agi_in["geo_id"].map(lambda code: "%05d" % (code,))

# --- outflow: adjusted gross income of departing filers ---
agi_out = pd.read_csv("data/interim/agi_outflow.csv", encoding="latin-1")

# Same three summaries as for the inflow table, with the remainder named "emigrants".
agi_out["sum"] = agi_out.drop(["year", "geo_id"], axis="columns").sum(axis=1)
agi_out["home"] = agi_out.drop(["year", "sum", "geo_id"], axis="columns").max(axis=1)
agi_out["emigrants"] = agi_out["sum"].sub(agi_out["home"])
agi_out["geo_id"] = agi_out["geo_id"].map(lambda code: "%05d" % (code,))

#### Separating by in and out of state

In [7]:
# AGI inflow, aggregated by two-character column prefix.
# The labels are truncated on agi_in itself, so agi_in keeps the short names afterwards.
agi_in.columns = agi_in.columns.str[:2]
state_agi_ins = (
    agi_in
    .set_index(["ge", "ye"])
    .T                      # labels become rows so duplicates can be grouped
    .reset_index()
    .groupby("index")
    .sum()
    .T                      # back to one row per (ge, ye)
    .drop("su", axis="columns")
)
# Largest remaining column minus the home amount.
state_agi_ins["state_immigrants"] = state_agi_ins.max(axis=1).sub(state_agi_ins["ho"])

# AGI outflow, aggregated the same way.
agi_out.columns = agi_out.columns.str[:2]
state_agi_outs = (
    agi_out
    .set_index(["ge", "ye"])
    .T
    .reset_index()
    .groupby("index")
    .sum()
    .T
    .drop("su", axis="columns")
)
state_agi_outs["state_emigrants"] = state_agi_outs.max(axis=1).sub(state_agi_outs["ho"])

  return self.obj.drop(self.exclusions, axis=1)


### Tax Filings

In [8]:
# --- inflow: tax returns filed by arriving filers ---
returns_in = pd.read_csv("data/interim/returns_inflow.csv", encoding="latin-1")

# Row-wise summaries over every column except the identifiers:
#   sum        - total across all value columns
#   home       - largest single value column (presumably the non-mover count; confirm against the data)
#   immigrants - the total minus that largest column
returns_in["sum"] = returns_in.drop(["year", "geo_id"], axis="columns").sum(axis=1)
returns_in["home"] = returns_in.drop(["year", "sum", "geo_id"], axis="columns").max(axis=1)
returns_in["immigrants"] = returns_in["sum"].sub(returns_in["home"])
# geo_id becomes a zero-padded string of width 5.
returns_in["geo_id"] = returns_in["geo_id"].map(lambda code: "%05d" % (code,))

# --- outflow: tax returns filed by departing filers ---
returns_out = pd.read_csv("data/interim/returns_outflow.csv", encoding="latin-1")

# Same three summaries as for the inflow table, with the remainder named "emigrants".
returns_out["sum"] = returns_out.drop(["year", "geo_id"], axis="columns").sum(axis=1)
returns_out["home"] = returns_out.drop(["year", "sum", "geo_id"], axis="columns").max(axis=1)
returns_out["emigrants"] = returns_out["sum"].sub(returns_out["home"])
returns_out["geo_id"] = returns_out["geo_id"].map(lambda code: "%05d" % (code,))

#### Tax filings by in and out of state

In [9]:
# Returns inflow, aggregated by two-character column prefix.
# The labels are truncated on returns_in itself, so returns_in keeps the short names afterwards.
returns_in.columns = returns_in.columns.str[:2]
state_returns_ins = (
    returns_in
    .set_index(["ge", "ye"])
    .T                      # labels become rows so duplicates can be grouped
    .reset_index()
    .groupby("index")
    .sum()
    .T                      # back to one row per (ge, ye)
    .drop("su", axis="columns")
)
# Largest remaining column minus the home count.
state_returns_ins["state_immigrants"] = state_returns_ins.max(axis=1).sub(state_returns_ins["ho"])

# Returns outflow, aggregated the same way.
returns_out.columns = returns_out.columns.str[:2]
state_returns_outs = (
    returns_out
    .set_index(["ge", "ye"])
    .T
    .reset_index()
    .groupby("index")
    .sum()
    .T
    .drop("su", axis="columns")
)
state_returns_outs["state_emigrants"] = state_returns_outs.max(axis=1).sub(state_returns_outs["ho"])

  return self.obj.drop(self.exclusions, axis=1)


## Merging Data Together

#### Business and employees

In [10]:
# Keep only the join keys and the single industry column that is merged into `retail` later.
biz = biz.loc[:, ["geo_id", "year", "Real estate and rental and leasing"]]
biz.head(5)

Unnamed: 0,geo_id,year,Real estate and rental and leasing
0,1001,2005,35
1,1001,2006,34
2,1001,2007,30
3,1001,2008,38
4,1001,2009,37


In [11]:
# Keep only the join keys and the single industry column that is merged into `retail` later.
emp = emp.loc[:, ["geo_id", "year", "Real estate and rental and leasing"]]
emp.head(5)

Unnamed: 0,geo_id,year,Real estate and rental and leasing
0,1001,2005,157
1,1001,2006,175
2,1001,2007,109
3,1001,2008,116
4,1001,2009,0


In [32]:
# Attach employee counts to establishment counts; rows of `biz` without a match keep NaN.
retail = biz.merge(emp, how="left", on=["geo_id", "year"])
# Both inputs share the same industry column name, so relabel the merged columns positionally.
retail.columns = ["geo_id", "year", "establishments", "employees"]
retail.head(5)

Unnamed: 0,geo_id,year,establishments,employees
0,1001,2005,35,157
1,1001,2006,34,175
2,1001,2007,30,109
3,1001,2008,38,116
4,1001,2009,37,0


#### Tax Exemptions Claimed

In [13]:
# Flatten the aggregated inflow table and keep the keys, home count, total immigrants and in-state immigrants.
exemptions_in = state_ins.reset_index().loc[:, ["ge", "ye", "ho", "im", "state_immigrants"]]
# Out-of-state immigrants: all immigrants minus the in-state ones.
exemptions_in["OOS_immigrants"] = exemptions_in["im"].sub(exemptions_in["state_immigrants"])
# Positional relabel; the order must match the six columns built above.
exemptions_in.columns = ["geo_id", "year", "exemptions_stay", "exemptions_in_total", "exemptions_in_is", "exemptions_in_oos"]
# The total equals in-state + out-of-state by construction, so it is dropped.
exemptions_in = exemptions_in.drop("exemptions_in_total", axis="columns")
exemptions_in.head(5)

Unnamed: 0,geo_id,year,exemptions_stay,exemptions_in_is,exemptions_in_oos
0,1001,2005,35901.0,2185.0,4184.0
1,1001,2006,36850.0,2519.0,4209.0
2,1001,2007,37767.0,2394.0,4168.0
3,1001,2008,39518.0,2548.0,3863.0
4,1001,2009,40719.0,2230.0,3873.0


In [14]:
# Flatten the aggregated outflow table and keep the keys, home count, total emigrants and in-state emigrants.
exemptions_out = state_outs.reset_index()[["ge", "ye", "ho", "em", "state_emigrants"]]
# Out-of-state emigrants: all emigrants minus the in-state ones.
# The temporary label now says "emigrants", matching the AGI outflow cell. It was
# previously "OOS_immigrants"; the final frame is unaffected because the columns
# are relabelled positionally on the next line.
exemptions_out["OOS_emigrants"] = exemptions_out["em"] - exemptions_out["state_emigrants"]
# Positional relabel; the order must match the six columns built above.
exemptions_out.columns = ["geo_id", "year", "exemptions_stay", "exemptions_out_total", "exemptions_out_is", "exemptions_out_oos"]
# The total equals in-state + out-of-state by construction, so it is dropped.
exemptions_out = exemptions_out.drop("exemptions_out_total", axis=1)
exemptions_out.head()

Unnamed: 0,geo_id,year,exemptions_stay,exemptions_out_is,exemptions_out_oos
0,1001,2005,35901.0,1883.0,3338.0
1,1001,2006,36850.0,2102.0,3545.0
2,1001,2007,37767.0,2127.0,3443.0
3,1001,2008,39518.0,2047.0,3601.0
4,1001,2009,40719.0,2118.0,3660.0


In [15]:
# Combine inflow and outflow exemption features.
# "exemptions_stay" is part of the join key, so outflow values attach only where it agrees on both sides.
exemptions = exemptions_in.merge(exemptions_out, how="left", on=["geo_id", "year", "exemptions_stay"])
exemptions.head(5)

Unnamed: 0,geo_id,year,exemptions_stay,exemptions_in_is,exemptions_in_oos,exemptions_out_is,exemptions_out_oos
0,1001,2005,35901.0,2185.0,4184.0,1883.0,3338.0
1,1001,2006,36850.0,2519.0,4209.0,2102.0,3545.0
2,1001,2007,37767.0,2394.0,4168.0,2127.0,3443.0
3,1001,2008,39518.0,2548.0,3863.0,2047.0,3601.0
4,1001,2009,40719.0,2230.0,3873.0,2118.0,3660.0


#### Tax Filings

In [16]:
# Flatten the aggregated inflow table and keep the keys, home count, total immigrants and in-state immigrants.
returns_in = state_returns_ins.reset_index().loc[:, ["ge", "ye", "ho", "im", "state_immigrants"]]
# Out-of-state immigrants: all immigrants minus the in-state ones.
returns_in["OOS_immigrants"] = returns_in["im"].sub(returns_in["state_immigrants"])
# Positional relabel; the order must match the six columns built above.
returns_in.columns = ["geo_id", "year", "returns_stay", "returns_in_total", "returns_in_is", "returns_in_oos"]
# The total equals in-state + out-of-state by construction, so it is dropped.
returns_in = returns_in.drop("returns_in_total", axis="columns")
returns_in.head(5)

Unnamed: 0,geo_id,year,returns_stay,returns_in_is,returns_in_oos
0,1001,2005,15062.0,951.0,1410.0
1,1001,2006,15473.0,1139.0,1551.0
2,1001,2007,15944.0,1072.0,1447.0
3,1001,2008,16791.0,1185.0,1458.0
4,1001,2009,17385.0,1038.0,1462.0


In [17]:
# Flatten the aggregated outflow table and keep the keys, home count, total emigrants and in-state emigrants.
returns_out = state_returns_outs.reset_index()[["ge", "ye", "ho", "em", "state_emigrants"]]
# Out-of-state emigrants: all emigrants minus the in-state ones.
# The temporary label now says "emigrants", matching the AGI outflow cell. It was
# previously "OOS_immigrants"; the final frame is unaffected because the columns
# are relabelled positionally on the next line.
returns_out["OOS_emigrants"] = returns_out["em"] - returns_out["state_emigrants"]
# Positional relabel; the order must match the six columns built above.
returns_out.columns = ["geo_id", "year", "returns_stay", "returns_out_total", "returns_out_is", "returns_out_oos"]
# The total equals in-state + out-of-state by construction, so it is dropped.
returns_out = returns_out.drop("returns_out_total", axis=1)
returns_out.head()

Unnamed: 0,geo_id,year,returns_stay,returns_out_is,returns_out_oos
0,1001,2005,15062.0,853.0,1245.0
1,1001,2006,15473.0,971.0,1307.0
2,1001,2007,15944.0,1025.0,1284.0
3,1001,2008,16791.0,989.0,1398.0
4,1001,2009,17385.0,1018.0,1375.0


In [18]:
# Combine inflow and outflow return features.
# "returns_stay" is part of the join key, so outflow values attach only where it agrees on both sides.
returns = returns_in.merge(returns_out, how="left", on=["geo_id", "year", "returns_stay"])
returns.head(5)

Unnamed: 0,geo_id,year,returns_stay,returns_in_is,returns_in_oos,returns_out_is,returns_out_oos
0,1001,2005,15062.0,951.0,1410.0,853.0,1245.0
1,1001,2006,15473.0,1139.0,1551.0,971.0,1307.0
2,1001,2007,15944.0,1072.0,1447.0,1025.0,1284.0
3,1001,2008,16791.0,1185.0,1458.0,989.0,1398.0
4,1001,2009,17385.0,1038.0,1462.0,1018.0,1375.0


#### Adjusted Gross Income

In [19]:
# Flatten the aggregated inflow table and keep the keys, home amount, total immigrant AGI and in-state immigrant AGI.
agi_in = state_agi_ins.reset_index().loc[:, ["ge", "ye", "ho", "im", "state_immigrants"]]
# Out-of-state immigrant AGI: all immigrant AGI minus the in-state part.
agi_in["OOS_immigrants"] = agi_in["im"].sub(agi_in["state_immigrants"])
# Positional relabel; the order must match the six columns built above.
agi_in.columns = ["geo_id", "year", "agi_stay", "agi_in_total", "agi_in_is", "agi_in_oos"]
# The total equals in-state + out-of-state by construction, so it is dropped.
agi_in = agi_in.drop("agi_in_total", axis="columns")
agi_in.head(5)

Unnamed: 0,geo_id,year,agi_stay,agi_in_is,agi_in_oos
0,1001,2005,714261.0,32399.0,73510.0
1,1001,2006,756692.0,38883.0,82027.0
2,1001,2007,827611.0,37153.0,79737.0
3,1001,2008,901200.0,43332.0,83366.0
4,1001,2009,936888.0,36050.0,79697.0


In [20]:
# Flatten the aggregated outflow table and keep the keys, home amount, total emigrant AGI and in-state emigrant AGI.
agi_out = state_agi_outs.reset_index().loc[:, ["ge", "ye", "ho", "em", "state_emigrants"]]
# Out-of-state emigrant AGI: all emigrant AGI minus the in-state part.
agi_out["OOS_emigrants"] = agi_out["em"].sub(agi_out["state_emigrants"])
# Positional relabel; the order must match the six columns built above.
agi_out.columns = ["geo_id", "year", "agi_stay", "agi_out_total", "agi_out_is", "agi_out_oos"]
# The total equals in-state + out-of-state by construction, so it is dropped.
agi_out = agi_out.drop("agi_out_total", axis="columns")
agi_out.head(5)

Unnamed: 0,geo_id,year,agi_stay,agi_out_is,agi_out_oos
0,1001,2005,714261.0,26576.0,59705.0
1,1001,2006,756692.0,31657.0,70658.0
2,1001,2007,827611.0,36112.0,67092.0
3,1001,2008,901200.0,34063.0,69089.0
4,1001,2009,936888.0,34539.0,75861.0


In [21]:
# Combine inflow and outflow AGI features.
# "agi_stay" is part of the join key, so outflow values attach only where it agrees on both sides.
agi = agi_in.merge(agi_out, how="left", on=["geo_id", "year", "agi_stay"])
agi.head(5)

Unnamed: 0,geo_id,year,agi_stay,agi_in_is,agi_in_oos,agi_out_is,agi_out_oos
0,1001,2005,714261.0,32399.0,73510.0,26576.0,59705.0
1,1001,2006,756692.0,38883.0,82027.0,31657.0,70658.0
2,1001,2007,827611.0,37153.0,79737.0,36112.0,67092.0
3,1001,2008,901200.0,43332.0,83366.0,34063.0,69089.0
4,1001,2009,936888.0,36050.0,79697.0,34539.0,75861.0


## Merge Business data with each migration data set

Join `retail` with itself so that each row also carries the following year's values, which serve as the target variables.

In [34]:
# Tag every row with the preceding year; on the right-hand side this is the join key,
# so a row for year Y is paired with the row for year Y + 1.
retail["year_1"] = retail["year"] - 1
samples = retail.merge(
    retail,
    how="left",
    left_on=["geo_id", "year"],
    right_on=["geo_id", "year_1"],
    suffixes=("_y1", "_y2"),
)
# Rows with no following year get 0 everywhere on the right-hand side.
samples = samples.fillna(0)
samples = samples.drop(["year_1_y1", "year_1_y2"], axis="columns")
# geo_id is a string, so park it in the index while the remaining columns are cast to int.
samples = samples.set_index("geo_id").astype(int).reset_index()
# Target: did the establishment count increase from this year to the next?
samples["growth"] = samples["establishments_y1"].lt(samples["establishments_y2"])
samples.columns = ["geo_id", "year", "establishments", "employees", "year_2", "establishments_2", "employees_2", "growth"]
# year_2 == 0 marks the rows that had no following year (filled above); discard them.
samples = samples.loc[samples["year_2"].ne(0)]
samples.head(5)

Unnamed: 0,geo_id,year,establishments,employees,year_2,establishments_2,employees_2,growth
0,1001,2005,35,157,2006,34,175,False
1,1001,2006,34,175,2007,30,109,False
2,1001,2007,30,109,2008,38,116,True
3,1001,2008,38,116,2009,37,0,False
4,1001,2009,37,0,2010,37,125,False


Merge samples with migration data from each dataframe

In [35]:
# Attach the three migration feature sets by (geo_id, year); unmatched rows are filled with 0.
samples = (
    samples
    .merge(exemptions, how="left", on=["geo_id", "year"])
    .merge(returns, how="left", on=["geo_id", "year"])
    .merge(agi, how="left", on=["geo_id", "year"])
    .fillna(0)
)
samples.head(5)

Unnamed: 0,geo_id,year,establishments,employees,year_2,establishments_2,employees_2,growth,exemptions_stay,exemptions_in_is,...,returns_stay,returns_in_is,returns_in_oos,returns_out_is,returns_out_oos,agi_stay,agi_in_is,agi_in_oos,agi_out_is,agi_out_oos
0,1001,2005,35,157,2006,34,175,False,35901.0,2185.0,...,15062.0,951.0,1410.0,853.0,1245.0,714261.0,32399.0,73510.0,26576.0,59705.0
1,1001,2006,34,175,2007,30,109,False,36850.0,2519.0,...,15473.0,1139.0,1551.0,971.0,1307.0,756692.0,38883.0,82027.0,31657.0,70658.0
2,1001,2007,30,109,2008,38,116,True,37767.0,2394.0,...,15944.0,1072.0,1447.0,1025.0,1284.0,827611.0,37153.0,79737.0,36112.0,67092.0
3,1001,2008,38,116,2009,37,0,False,39518.0,2548.0,...,16791.0,1185.0,1458.0,989.0,1398.0,901200.0,43332.0,83366.0,34063.0,69089.0
4,1001,2009,37,0,2010,37,125,False,40719.0,2230.0,...,17385.0,1038.0,1462.0,1018.0,1375.0,936888.0,36050.0,79697.0,34539.0,75861.0


In [36]:
# Print the dtype and non-null count of every column in the merged sample table.
samples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31420 entries, 0 to 31419
Data columns (total 23 columns):
geo_id                31420 non-null object
year                  31420 non-null int64
establishments        31420 non-null int32
employees             31420 non-null int32
year_2                31420 non-null int32
establishments_2      31420 non-null int32
employees_2           31420 non-null int32
growth                31420 non-null bool
exemptions_stay       31420 non-null float64
exemptions_in_is      31420 non-null float64
exemptions_in_oos     31420 non-null float64
exemptions_out_is     31420 non-null float64
exemptions_out_oos    31420 non-null float64
returns_stay          31420 non-null float64
returns_in_is         31420 non-null float64
returns_in_oos        31420 non-null float64
returns_out_is        31420 non-null float64
returns_out_oos       31420 non-null float64
agi_stay              31420 non-null float64
agi_in_is             31420 non-null float64
agi_in_

In [37]:
# Remove the identifiers and the next-year columns, leaving the current-year features and `growth`.
samples = samples.drop(
    ["geo_id", "year", "year_2", "establishments_2", "employees_2"],
    axis="columns",
)
# Persist the modelling table; the DataFrame index is written as the first CSV column.
samples.to_csv("data/processed/samples.csv")