# Week 6: Merging data with Pandas, Part 1 - Test
What's the relationship between vaccination rates and votes in the recall election? We'll find out [using the `merge()` method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html).

#### Load our Python tools

In [1]:
%load_ext lab_black

In [2]:
import altair as alt
import pandas as pd

In [3]:
# Raise pandas' display limits so wide and long frames are not truncated.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

---

### Vax percentage

#### LA Times [coronavirus repo](https://github.com/datadesk/california-coronavirus-data)

In [4]:
# Raw CSV of county vaccination totals in the LA Times coronavirus repo.
# Adjacent string literals are joined by Python, so the value is one URL.
vax_url = (
    "https://raw.githubusercontent.com/datadesk/california-coronavirus-data/"
    "master/cdph-vaccination-county-totals.csv"
)

#### Read the vaccination totals by county and date (data types can be defined here)

In [5]:
# Read fips as text rather than letting pandas infer a numeric type;
# it is the key used later to merge with the election results.
column_types = {"fips": str}
vax_df = pd.read_csv(vax_url, dtype=column_types)

#### How many days does this cover (don't forget to sort)?

In [6]:
# Peek at the first five rows to see which columns the file provides.
vax_df.head()

Unnamed: 0,date,county,fips,population,doses_administered,new_doses_administered,pfizer_doses,new_pfizer_doses,moderna_doses,new_moderna_doses,jj_doses,new_jj_doses,partially_vaccinated,new_partially_vaccinated,at_least_one_dose,new_at_least_one_dose,fully_vaccinated,new_fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent
0,2022-02-21,Alameda,1,1643700.0,3504668,493,2138982.0,299,1065487.0,143,133636.0,13,107066.0,93,1450742.0,98,1343676.0,108,0.065137,0.882608,0.81747
1,2022-02-21,Alpine,3,1148.0,1865,0,73.0,0,1769.0,0,9.0,0,139.0,0,857.0,0,718.0,0,0.12108,0.746516,0.625436
2,2022-02-21,Amador,5,37829.0,54202,7,15852.0,3,35882.0,4,1912.0,0,3890.0,1,24511.0,1,20621.0,0,0.102831,0.647942,0.545111
3,2022-02-21,Butte,7,227075.0,288365,30,141977.0,15,130419.0,13,10241.0,0,10626.0,3,125162.0,3,114536.0,8,0.046795,0.551192,0.504397
4,2022-02-21,Calaveras,9,45235.0,63249,4,26952.0,0,33965.0,4,1824.0,0,4392.0,1,28541.0,1,24149.0,1,0.097093,0.630949,0.533857


In [27]:
# Earliest date in the data. The column was read as text, so this is a string
# comparison; it still gives the right answer because the dates are YYYY-MM-DD.
vax_df["date"].min()

'2020-01-05'

In [28]:
# Latest date in the data; YYYY-MM-DD strings compare in chronological order.
vax_df["date"].max()

'2022-02-21'

In [29]:
# Total number of rows across all counties and dates.
len(vax_df)

45182

#### Select the most recent date

In [10]:
# Keep only the rows whose date equals the latest date in the file.
latest_date = vax_df["date"].max()
most_recent = vax_df.loc[vax_df["date"] == latest_date]

In [11]:
# Row count for the latest date; each county should appear once.
len(most_recent)

58

#### Get only the columns we need

In [12]:
# Columns needed for the comparison with the election results.
vax_columns = ["county", "fips", "fully_vaccinated_percent", "population"]
vax_slim = most_recent.loc[:, vax_columns]

In [13]:
# Check that only the four selected columns remain.
vax_slim.head()

Unnamed: 0,county,fips,fully_vaccinated_percent,population
0,Alameda,1,0.81747,1643700.0
1,Alpine,3,0.625436,1148.0
2,Amador,5,0.545111,37829.0
3,Butte,7,0.504397,227075.0
4,Calaveras,9,0.533857,45235.0


---

### Recall vote

#### Read the county-level election results

In [14]:
# Local JSON file of county-level results; fips is read as text, as in the
# vaccination data, so the two tables share a key type.
recall_path = "../../data/raw/gov_race_change_counties.json"
recall_df = pd.read_json(recall_path, dtype={"fips": str})

In [15]:
# Preview the election results to see the column names before renaming.
recall_df.head()

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,d_pct_21,r_pct_18,r_pct_21,d_change,r_change,leader_2021,leader_2018
0,Alameda,1,462558,287014,111677,60716,574235,347730,80.6,82.5,19.4,17.5,1.9,-1.9,D,D
1,Alpine,3,386,340,229,218,615,558,62.8,60.9,37.2,39.1,-1.9,1.9,D,D
2,Amador,5,6237,6411,11356,10971,17593,17382,35.5,36.9,64.5,63.1,1.4,-1.4,R,R
3,Butte,7,41500,31774,47226,35182,88726,66956,46.8,47.5,53.2,52.5,0.7,-0.7,R,R
4,Calaveras,9,7765,7106,13845,11450,21610,18556,35.9,38.3,64.1,61.7,2.4,-2.4,R,R


#### Rename columns

In [16]:
# Relabel the 2021 columns in recall terms: the Democratic share becomes the
# "no" vote, the Republican share the "yes" vote, and the leader the winner.
# rename() returns a new frame, so rebind the name instead of using
# inplace=True; the frame shown by earlier cells is never mutated.
recall_df = recall_df.rename(
    columns={
        "d_pct_21": "no_percent",
        "r_pct_21": "yes_percent",
        "leader_2021": "winner_2021",
    }
)

In [17]:
# Confirm the three renamed columns appear in the header.
recall_df.head()

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
0,Alameda,1,462558,287014,111677,60716,574235,347730,80.6,82.5,19.4,17.5,1.9,-1.9,D,D
1,Alpine,3,386,340,229,218,615,558,62.8,60.9,37.2,39.1,-1.9,1.9,D,D
2,Amador,5,6237,6411,11356,10971,17593,17382,35.5,36.9,64.5,63.1,1.4,-1.4,R,R
3,Butte,7,41500,31774,47226,35182,88726,66956,46.8,47.5,53.2,52.5,0.7,-0.7,R,R
4,Calaveras,9,7765,7106,13845,11450,21610,18556,35.9,38.3,64.1,61.7,2.4,-2.4,R,R


#### What does that look like?

In [18]:
# Look at the first five rows again, now with the recall-specific column names.
recall_df.head()

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
0,Alameda,1,462558,287014,111677,60716,574235,347730,80.6,82.5,19.4,17.5,1.9,-1.9,D,D
1,Alpine,3,386,340,229,218,615,558,62.8,60.9,37.2,39.1,-1.9,1.9,D,D
2,Amador,5,6237,6411,11356,10971,17593,17382,35.5,36.9,64.5,63.1,1.4,-1.4,R,R
3,Butte,7,41500,31774,47226,35182,88726,66956,46.8,47.5,53.2,52.5,0.7,-0.7,R,R
4,Calaveras,9,7765,7106,13845,11450,21610,18556,35.9,38.3,64.1,61.7,2.4,-2.4,R,R


#### Which county was most against the recall?

In [19]:
# Rank counties from the highest "no" share down and show the top five.
recall_df.sort_values(by="no_percent", ascending=False).head(5)

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
37,San Francisco,75,312181,231861,49181,35571,361362,267432,86.4,86.7,13.6,13.3,0.3,-0.3,D,D
20,Marin,41,103671,79616,26750,14873,130421,94489,79.5,84.3,20.5,15.7,4.8,-4.8,D,D
0,Alameda,1,462558,287014,111677,60716,574235,347730,80.6,82.5,19.4,17.5,1.9,-1.9,D,D
43,Santa Cruz,87,91523,60654,27665,15004,119188,75658,76.8,80.2,23.2,19.8,3.4,-3.4,D,D
40,San Mateo,81,213282,174757,70242,46076,283524,220833,75.2,79.1,24.8,20.9,3.9,-3.9,D,D


In [20]:
# Rank counties from the highest "yes" share down and show the top five.
recall_df.sort_values(by="yes_percent", ascending=False).head(5)

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
17,Lassen,35,2043,1357,6973,6590,9016,7947,22.7,17.1,77.3,82.9,-5.6,5.6,R,R
24,Modoc,49,820,706,2628,2505,3448,3211,23.8,22.0,76.2,78.0,-1.8,1.8,R,R
51,Tehama,103,5756,2035,15137,5721,20893,7756,27.5,26.2,72.5,73.8,-1.3,1.3,R,R
10,Glenn,21,2424,1941,5908,4393,8332,6334,29.1,30.6,70.9,69.4,1.5,-1.5,R,R
44,Shasta,89,20256,15726,49825,30932,70081,46658,28.9,33.7,71.1,66.3,4.8,-4.8,R,R


In [21]:
# Keep every row that matches the highest "no" share (ties would all show).
highest_no = recall_df["no_percent"].max()
recall_df.loc[recall_df["no_percent"] == highest_no]

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
37,San Francisco,75,312181,231861,49181,35571,361362,267432,86.4,86.7,13.6,13.3,0.3,-0.3,D,D


#### Which county was most for it?

#### Get only the columns we need

In [22]:
# Columns needed for the comparison with the vaccination data.
recall_columns = ["county", "fips", "no_percent", "yes_percent", "winner_2021"]
recall_df_slim = recall_df.loc[:, recall_columns]

In [23]:
# Check that only the five selected columns remain.
recall_df_slim.head()

Unnamed: 0,county,fips,no_percent,yes_percent,winner_2021
0,Alameda,1,82.5,17.5,D
1,Alpine,3,60.9,39.1,D
2,Amador,5,36.9,63.1,R
3,Butte,7,47.5,52.5,R
4,Calaveras,9,38.3,61.7,R


---

## Merge the vax data with the election results

#### Use the `merge()` method

In [24]:
# Join the two county tables on fips. Only counties present in both frames are
# kept (inner join, the pandas default, spelled out here). validate makes
# pandas raise MergeError if a fips value is duplicated on either side,
# instead of silently multiplying rows.
merge_df = pd.merge(
    vax_slim,
    recall_df_slim,
    on="fips",
    how="inner",
    validate="one_to_one",
)

In [25]:
# Inspect the merged rows. Both inputs had a "county" column that was not part
# of the join key, so pandas suffixes them as county_x (left) and county_y (right).
merge_df.head()

Unnamed: 0,county_x,fips,fully_vaccinated_percent,population,county_y,no_percent,yes_percent,winner_2021
0,Alameda,1,0.81747,1643700.0,Alameda,82.5,17.5,D
1,Alpine,3,0.625436,1148.0,Alpine,60.9,39.1,D
2,Amador,5,0.545111,37829.0,Amador,36.9,63.1,R
3,Butte,7,0.504397,227075.0,Butte,47.5,52.5,R
4,Calaveras,9,0.533857,45235.0,Calaveras,38.3,61.7,R


In [26]:
# The merge left two county columns: county_x from the vaccination frame and
# county_y from the election frame. Keep the first as "county" and label the
# second "county_other". rename() returns a new frame, so rebind the name
# rather than mutating with inplace=True.
merge_df = merge_df.rename(
    columns={"county_x": "county", "county_y": "county_other"}
)