# IRS `outflow` migration at the state level

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_stiles as altstiles
import numpy as np
import us

In [3]:
# Register the altair_stiles theme under the name "stiles".
alt.themes.register("stiles", altstiles.theme)
# NOTE(review): the theme enabled here is "grid", not the "stiles" name
# registered on the previous line — confirm this is intended.
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
# Let pandas print up to 1000 columns/rows, and turn off Altair's
# max-rows limit on chart data.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Read [migration data](https://www.irs.gov/statistics/soi-tax-stats-data-by-geographic-area) from IRS

#### States metadata

In [5]:
# State metadata lookup; `fips` is read as text rather than as a number.
states = pd.read_csv(
    "../usa/data/processed/states_metadata_lookup.csv",
    dtype={"fips": str},
)

In [6]:
# State population table; `fips` is read as text rather than as a number.
states_pop = pd.read_csv(
    "../usa/data/processed/states_population.csv",
    dtype={"fips": str},
)

In [7]:
# Lookup dict: state FIPS code -> state name.
fips_to_name = states.set_index("fips")["name"].to_dict()

In [8]:
# Lookup dict: value of the `state` column -> `pop_acs_2020_5tr` population.
state_to_pop = states_pop.set_index("state")["pop_acs_2020_5tr"].to_dict()

#### Get the 2019-2020 tax year `OUTFLOW` file

In [9]:
# Read every FIPS column as text so the codes are not parsed as numbers.
fips_columns = ["y1_statefips", "y1_countyfips", "y2_statefips", "y2_countyfips"]
src = pd.read_csv(
    "https://www.irs.gov/pub/irs-soi/stateoutflow1920.csv",
    dtype={column: "str" for column in fips_columns},
)

#### Process FIPS codes so they're useful

In [10]:
# Left-pad both state FIPS codes with zeros to a width of two characters.
for fips_col in ("y1_statefips", "y2_statefips"):
    src[fips_col] = src[fips_col].str.zfill(2)

In [12]:
# Destination FIPS codes dropped by the filter in the next cell.
# NOTE(review): presumably IRS summary / non-state pseudo-codes — confirm
# against the IRS migration file documentation.
aggregates = ["57", "58", "59", "96", "97", "98"]

In [13]:
# Keep only rows whose destination is neither one of the `aggregates` codes
# nor labelled "Non-migrants"; copy so later edits don't touch `src`.
is_aggregate = src["y2_statefips"].isin(aggregates)
is_non_migrant = src["y2_state_name"].str.contains("Non-migrants")
src_df = src[(~is_aggregate) & (~is_non_migrant)].copy()

In [14]:
# Map the IRS column names to descriptive ones. The frame is rebound to the
# renamed result instead of being modified in place.
column_names = {
    "n1": "returns",
    "n2": "outflow_taxpayers",
    "AGI": "adjusted_gross_income",
    "y1_state_name": "origin_state",
    "y2_statefips": "destination_fips",
    "y1_statefips": "origin_fips",
    "y1_state": "origin_abbr",
    "y2_state": "destination_abbr",
    "y2_state_name": "destination_state",
}
src_df = src_df.rename(columns=column_names)

In [15]:
# Preview the first five rows after filtering and renaming.
src_df.head(n=5)

Unnamed: 0,origin_fips,destination_fips,destination_abbr,destination_state,returns,outflow_taxpayers,adjusted_gross_income
5,1,13,GA,Georgia,7830,15636,433366
6,1,12,FL,Florida,6902,13551,517167
7,1,47,TN,Tennessee,4109,7842,293025
8,1,48,TX,Texas,3803,7814,254186
9,1,28,MS,Mississippi,2431,4933,117760


#### Map FIPS codes to state names

In [16]:
# Translate each origin FIPS code to a state name via the lookup dict.
origin_names = src_df["origin_fips"].map(fips_to_name)
src_df["origin_state"] = origin_names

In [17]:
# Tag every row with the tax-year label.
src_df = src_df.assign(year="2019-20")

In [18]:
# Columns carried into the analysis frame (and the export), in output order.
analysis_columns = [
    "origin_fips",
    "origin_state",
    "destination_fips",
    "destination_state",
    "returns",
    "outflow_taxpayers",
    "year",
]
df = src_df[analysis_columns].copy()

In [19]:
# Preview the first five rows of the trimmed frame.
df.head(n=5)

Unnamed: 0,origin_fips,origin_state,destination_fips,destination_state,returns,outflow_taxpayers,year
5,1,Alabama,13,Georgia,7830,15636,2019-20
6,1,Alabama,12,Florida,6902,13551,2019-20
7,1,Alabama,47,Tennessee,4109,7842,2019-20
8,1,Alabama,48,Texas,3803,7814,2019-20
9,1,Alabama,28,Mississippi,2431,4933,2019-20


---

## Aggregate

#### Group by origin state and sum the `outflow_taxpayers` and tax returns

In [20]:
# Total `returns` and `outflow_taxpayers` per origin state, sorted so the
# largest outflow comes first, with the group keys restored as columns.
# The reduction is requested by name ("sum"): passing the Python builtin
# `sum` to `.agg` is deprecated in recent pandas and emits a FutureWarning.
origins_grouped = (
    df.groupby(["origin_fips", "origin_state"])
    .agg({"returns": "sum", "outflow_taxpayers": "sum"})
    .sort_values("outflow_taxpayers", ascending=False)
    .reset_index()
)

#### Add population to the dataframe

In [21]:
# Look up each origin state's population through the `state_to_pop` dict.
origin_population = origins_grouped["origin_state"].map(state_to_pop)
origins_grouped["population"] = origin_population

#### Calculate rates, if we need them

In [22]:
# Outflow taxpayers per 1,000 residents, rounded to two decimals.
origins_grouped["lost_outflow_taxpayers_per_1k"] = (
    origins_grouped["outflow_taxpayers"]
    .div(origins_grouped["population"])
    .mul(1000)
    .round(2)
)

In [23]:
# Each state's outflow as a percentage of the outflow summed over all
# origin states, rounded to two decimals.
all_outflow = origins_grouped["outflow_taxpayers"].sum()
origins_grouped["lost_outflow_taxpayers_share_all_movers"] = (
    origins_grouped["outflow_taxpayers"].div(all_outflow).mul(100).round(2)
)

#### Top origin states by share of all movers, 2019-20

In [24]:
# Ten origin states with the largest share of all movers.
ranked_by_share = origins_grouped.sort_values(
    by="lost_outflow_taxpayers_share_all_movers", ascending=False
)
ranked_by_share.iloc[:10]

Unnamed: 0,origin_fips,origin_state,returns,outflow_taxpayers,population,lost_outflow_taxpayers_per_1k,lost_outflow_taxpayers_share_all_movers
0,6,California,365922,676058,39346023,17.18,9.86
1,36,New York,278489,477131,19514849,24.45,6.96
2,12,Florida,256188,456993,21216924,21.54,6.66
3,48,Texas,218116,428286,28635442,14.96,6.25
4,17,Illinois,151628,272632,12716164,21.44,3.98
5,51,Virginia,133148,251296,8509358,29.53,3.66
6,13,Georgia,121665,236144,10516579,22.45,3.44
7,37,North Carolina,121628,232767,10386227,22.41,3.39
8,42,Pennsylvania,121867,212086,12794885,16.58,3.09
9,34,New Jersey,118511,208326,8885418,23.45,3.04


#### What share of all movers left Florida, Texas or California?

In [25]:
# Outflow taxpayers summed over every origin state.
movers = origins_grouped.loc[:, "outflow_taxpayers"].sum()

In [26]:
# States of interest; matched against `origin_state` in the next cell.
our_states = ["Florida", "Texas", "California"]

In [27]:
# Percentage of all outflow taxpayers whose origin state is in `our_states`,
# rounded to two decimals.
from_our_states = origins_grouped["origin_state"].isin(our_states)
our_outflow = origins_grouped.loc[from_our_states, "outflow_taxpayers"].sum()
((our_outflow / movers) * 100).round(2)

22.77

---

## Exports

In [28]:
# Write the state-to-state outflow rows to CSV without the index column.
export_path = "data/processed/irs-state-outflow-migration-2019_20.csv"
df.to_csv(export_path, index=False)