# IRS `inflow` migration at the state level

#### Import Python tools

In [1]:
# Load the `lab_black` IPython extension for this notebook session
# (presumably to auto-format cells with Black — confirm it is installed).
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_stiles as altstiles
import numpy as np
import us

In [3]:
# Register the theme object shipped by `altair_stiles` under the name
# "stiles", then switch Altair over to it.
stiles_theme = altstiles.theme
alt.themes.register("stiles", stiles_theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('grid')

In [4]:
# Let pandas display up to 1000 columns and 1000 rows before truncating.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 1000)

# Lift Altair's row cap on the data passed to charts.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Read [migration data](https://www.irs.gov/statistics/soi-tax-stats-data-by-geographic-area) from IRS

#### States metadata

In [5]:
# State metadata lookup; `fips` is read as text rather than parsed as a number.
states_metadata_path = "../usa/data/processed/states_metadata_lookup.csv"
states = pd.read_csv(states_metadata_path, dtype={"fips": str})

In [6]:
# State population table; `fips` is read as text rather than parsed as a number.
states_population_path = "../usa/data/processed/states_population.csv"
states_pop = pd.read_csv(states_population_path, dtype={"fips": str})

In [7]:
# Lookup table: state FIPS code -> state name, built from the metadata frame.
fips_to_name = dict(zip(states["fips"], states["name"]))

In [8]:
# Lookup table: value of the `state` column -> `pop_acs_2020_5tr` population.
population_by_state = pd.Series(
    data=states_pop["pop_acs_2020_5tr"].values,
    index=states_pop["state"],
)
state_to_pop = population_by_state.to_dict()

#### Get the 2019-2020 tax year `INFLOW` file

In [9]:
# Download the state inflow file straight from the IRS site. The FIPS columns
# are requested as strings so they are not parsed as numbers.
fips_columns = (
    "y1_statefips",
    "y1_countyfips",
    "y2_statefips",
    "y2_countyfips",
)
src = pd.read_csv(
    "https://www.irs.gov/pub/irs-soi/stateinflow1920.csv",
    dtype={column: "str" for column in fips_columns},
)

#### Process FIPS codes so they're useful

In [10]:
# Left-pad both state FIPS columns with zeros to a width of two characters.
for fips_column in ("y1_statefips", "y2_statefips"):
    src[fips_column] = src[fips_column].str.zfill(2)

In [11]:
# Preview the first ten rows of the raw inflow table.
src.head(n=10)

Unnamed: 0,y2_statefips,y1_statefips,y1_state,y1_state_name,n1,n2,AGI
0,1,96,AL,AL Total Migration-US and Foreign,52265,107231,3341679
1,1,97,AL,AL Total Migration-US,51586,105521,3297568
2,1,98,AL,AL Total Migration-Foreign,679,1710,44112
3,1,97,AL,AL Total Migration-Same State,63266,129762,3419262
4,1,1,AL,AL Non-migrants,1590633,3498183,111701780
5,1,13,GA,Georgia,8322,17282,488504
6,1,12,FL,Florida,7585,14806,415647
7,1,47,TN,Tennessee,4338,8690,258108
8,1,48,TX,Texas,3544,7514,272788
9,1,28,MS,Mississippi,3082,6229,184781


In [12]:
# Origin FIPS codes that mark summary rows rather than a single origin state.
aggregates = [
    # Not visible in the preview rows; presumably foreign/other aggregate
    # codes — TODO confirm against the IRS file documentation.
    "57",
    "58",
    "59",
    # "Total Migration-US and Foreign"
    "96",
    # "Total Migration-US" and "Total Migration-Same State"
    "97",
    # "Total Migration-Foreign"
    "98",
]

In [13]:
# Keep only rows that are neither an aggregate origin code nor a
# "Non-migrants" row; work on an independent copy of the result.
is_aggregate_row = src["y1_statefips"].isin(aggregates)
is_non_migrant_row = src["y1_state_name"].str.contains("Non-migrants")
src_df = src.loc[~(is_aggregate_row | is_non_migrant_row)].copy()

In [14]:
# Replace the IRS column codes with descriptive names.
readable_column_names = {
    "y1_statefips": "origin_fips",
    "y1_state": "origin_abbr",
    "y1_state_name": "origin_state",
    "y2_statefips": "destination_fips",
    "n1": "returns",
    "n2": "inflow_taxpayers",
    "AGI": "adjusted_gross_income",
}
src_df = src_df.rename(columns=readable_column_names)

In [15]:
# Preview the first five rows after filtering and renaming.
src_df.head(n=5)

Unnamed: 0,destination_fips,origin_fips,origin_abbr,origin_state,returns,inflow_taxpayers,adjusted_gross_income
5,1,13,GA,Georgia,8322,17282,488504
6,1,12,FL,Florida,7585,14806,415647
7,1,47,TN,Tennessee,4338,8690,258108
8,1,48,TX,Texas,3544,7514,272788
9,1,28,MS,Mississippi,3082,6229,184781


#### Map FIPS codes to state names

In [16]:
# Translate each destination FIPS code into a state name via the lookup dict.
destination_names = src_df["destination_fips"].map(fips_to_name)
src_df["destination_state"] = destination_names

In [17]:
# Tag every row with the label for this file's migration period.
src_df = src_df.assign(year="2019-20")

In [18]:
# Columns carried forward into the analysis and the detail export, in order.
analysis_columns = [
    "origin_fips",
    "origin_state",
    "destination_fips",
    "destination_state",
    "returns",
    "inflow_taxpayers",
    "year",
]
df = src_df.loc[:, analysis_columns].copy()

---

## Aggregate

#### Group by destination state and sum the inflow_taxpayers and tax returns

In [19]:
# Total returns and inflow taxpayers arriving in each destination state,
# ordered from the largest inflow to the smallest.
#
# The aggregations are named with the string "sum" instead of passing the
# builtin `sum`: pandas 2.1+ emits a FutureWarning for builtin callables in
# `agg` and will stop translating them to the groupby sum in a later release.
#
# NOTE(review): groupby drops rows whose key is NaN by default, so any
# destination FIPS that did not map to a state name is excluded here.
destinations_grouped = (
    df.groupby(["destination_fips", "destination_state"])
    .agg({"returns": "sum", "inflow_taxpayers": "sum"})
    .sort_values("inflow_taxpayers", ascending=False)
    .reset_index()
)

#### Add population to the dataframe

In [20]:
# Look up each destination state's population by its state name.
destination_population = destinations_grouped["destination_state"].map(state_to_pop)
destinations_grouped["population"] = destination_population

#### Calculate rates, if we need them

In [21]:
# Inflow taxpayers per 1,000 residents of the destination state, to 2 decimals.
inflow_per_resident = (
    destinations_grouped["inflow_taxpayers"] / destinations_grouped["population"]
)
destinations_grouped["new_inflow_taxpayers_per_1k"] = (
    inflow_per_resident * 1000
).round(2)

In [22]:
# Each state's inflow as a percentage of the inflow summed over every
# destination state in the table, to 2 decimals.
total_inflow_taxpayers = destinations_grouped["inflow_taxpayers"].sum()
share_of_total = destinations_grouped["inflow_taxpayers"] / total_inflow_taxpayers
destinations_grouped["new_inflow_taxpayers_share_all_movers"] = (
    share_of_total * 100
).round(2)

#### Top states by share of all movers, 2019-20

In [23]:
# Show the ten destination states with the largest share of all movers.
ranked_by_share = destinations_grouped.sort_values(
    by="new_inflow_taxpayers_share_all_movers", ascending=False
)
ranked_by_share.head(n=10)

Unnamed: 0,destination_fips,destination_state,returns,inflow_taxpayers,population,new_inflow_taxpayers_per_1k,new_inflow_taxpayers_share_all_movers
0,12,Florida,337589,623700,21216924,29.4,9.08
1,48,Texas,280783,561736,28635442,19.62,8.17
2,6,California,248447,412714,39346023,10.49,6.01
3,37,North Carolina,157714,300941,10386227,28.98,4.38
4,13,Georgia,139003,273218,10516579,25.98,3.98
5,4,Arizona,132933,247624,7174064,34.52,3.6
6,51,Virginia,129353,241747,8509358,28.41,3.52
7,36,New York,147867,228826,19514849,11.73,3.33
8,53,Washington,119286,209473,7512465,27.88,3.05
9,42,Pennsylvania,118523,207220,12794885,16.2,3.02


#### What share of the moving population landed in Texas, California or Florida?

In [24]:
# Inflow taxpayers summed across every destination state.
movers = destinations_grouped.loc[:, "inflow_taxpayers"].sum()

In [25]:
# Destination states whose combined share of movers is reported below.
our_states = [
    "Florida",
    "Texas",
    "California",
]

In [26]:
# Percentage of all movers whose destination was one of `our_states`.
in_our_states = destinations_grouped["destination_state"].isin(our_states)
our_states_inflow = destinations_grouped.loc[in_our_states, "inflow_taxpayers"].sum()
((our_states_inflow / movers) * 100).round(2)

23.25

---

## Exports

In [27]:
# Write the detail table first, then the per-destination summary, as CSV
# files without the index column.
export_targets = (
    (df, "data/processed/irs-state-inflow-migration-2019_20.csv"),
    (
        destinations_grouped,
        "data/processed/irs-state-inflow-migration-2019_20_grouped.csv",
    ),
)
for export_frame, export_path in export_targets:
    export_frame.to_csv(export_path, index=False)