# Week 6: Merging data with Pandas, Part 1
What's the relationship between vaccination rates and votes in the recall election? We'll explore it [using the `merge()` method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html).

#### Load our Python tools

In [37]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [38]:
import pandas as pd
import altair as alt

In [39]:
# Raise pandas' display limits: up to 100 columns, up to 1000 rows,
# and no truncation of cell text (max_colwidth=None).
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

---

### Vax percentage

#### LA Times [coronavirus repo](https://github.com/datadesk/california-coronavirus-data)

In [40]:
# Raw CSV of county vaccination totals in the LA Times coronavirus repo.
vax_url = (
    "https://raw.githubusercontent.com/datadesk/california-coronavirus-data/"
    "master/cdph-vaccination-county-totals.csv"
)

#### Read the daily vaccination totals by county (data types can be defined here)

In [41]:
# Read `fips` as text instead of letting pandas infer a numeric type.
vax_df = pd.read_csv(
    filepath_or_buffer=vax_url,
    dtype={"fips": str},
)

#### How many rows is this (don't forget to sort)?

In [42]:
# Number of rows in the full vaccination table.
vax_df.shape[0]

44892

In [43]:
# Preview the first five rows.
vax_df.head(n=5)

Unnamed: 0,date,county,fips,population,doses_administered,new_doses_administered,pfizer_doses,new_pfizer_doses,moderna_doses,new_moderna_doses,jj_doses,new_jj_doses,partially_vaccinated,new_partially_vaccinated,at_least_one_dose,new_at_least_one_dose,fully_vaccinated,new_fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent
0,2022-02-16,Alameda,1,1643700.0,3489536,2748,2129666.0,1576,1061910.0,574,133454.0,32,107927.0,438,1447947.0,447,1340020.0,729,0.065661,0.880907,0.815246
1,2022-02-16,Alpine,3,1148.0,1864,0,73.0,0,1769.0,0,9.0,0,138.0,0,856.0,0,718.0,0,0.120209,0.745645,0.625436
2,2022-02-16,Amador,5,37829.0,53948,23,15772.0,12,35722.0,8,1910.0,0,3930.0,7,24466.0,7,20536.0,2,0.103889,0.646752,0.542864
3,2022-02-16,Butte,7,227075.0,287105,209,141253.0,114,130049.0,80,10199.0,2,10724.0,18,124934.0,18,114210.0,33,0.047227,0.550188,0.502962
4,2022-02-16,Calaveras,9,45235.0,63074,19,26888.0,10,33873.0,9,1815.0,0,4403.0,4,28497.0,4,24094.0,5,0.097336,0.629977,0.532641


In [44]:
# Earliest date in the table. The column is not parsed as datetimes on read,
# so this is the smallest date string.
vax_df.loc[:, "date"].min()

'2020-01-05'

In [45]:
# Latest date in the table (largest date string).
vax_df.loc[:, "date"].max()

'2022-02-16'

#### Select the most recent date

In [46]:
# Keep only the rows whose date equals the latest date in the table.
most_recent = vax_df.loc[vax_df["date"].eq(vax_df["date"].max())]

In [47]:
# Number of rows on the latest date.
most_recent.shape[0]

58

#### Get only the columns we need

In [84]:
# Narrow the latest snapshot to the join keys plus the two measures we need.
vax_slim = most_recent.loc[
    :, ["county", "fips", "fully_vaccinated_percent", "population"]
]

In [83]:
# Preview the slimmed-down vaccination frame.
vax_slim.head(n=5)

Unnamed: 0,county,fips,fully_vaccinated_percent,population
0,Alameda,1,0.815246,1643700.0
1,Alpine,3,0.625436,1148.0
2,Amador,5,0.542864,37829.0
3,Butte,7,0.502962,227075.0
4,Calaveras,9,0.532641,45235.0


---

### Recall vote

#### Read the county-level election results

In [50]:
# Load the county-level results, reading `fips` as text.
recall_df = pd.read_json(
    path_or_buf="../data/raw/gov_race_change_counties.json",
    dtype={"fips": str},
)

In [51]:
# Preview the raw election results.
recall_df.head(n=5)

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,d_pct_21,r_pct_18,r_pct_21,d_change,r_change,leader_2021,leader_2018
0,Alameda,1,462558,287014,111677,60716,574235,347730,80.6,82.5,19.4,17.5,1.9,-1.9,D,D
1,Alpine,3,386,340,229,218,615,558,62.8,60.9,37.2,39.1,-1.9,1.9,D,D
2,Amador,5,6237,6411,11356,10971,17593,17382,35.5,36.9,64.5,63.1,1.4,-1.4,R,R
3,Butte,7,41500,31774,47226,35182,88726,66956,46.8,47.5,53.2,52.5,0.7,-0.7,R,R
4,Calaveras,9,7765,7106,13845,11450,21610,18556,35.9,38.3,64.1,61.7,2.4,-2.4,R,R


#### Rename columns

In [52]:
# Relabel the 2021 percentage and leader columns with recall-specific names.
# Do not pass inplace=True here: rename() then returns None, and assigning
# that result back would replace recall_df with None and break every later cell.
recall_df = recall_df.rename(
    columns={
        "d_pct_21": "no_percent",
        "r_pct_21": "yes_percent",
        "leader_2021": "winner_2021",
    }
)

#### What's that look like? 

In [55]:
# Preview the frame with the renamed columns.
recall_df.head(n=5)

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
0,Alameda,1,462558,287014,111677,60716,574235,347730,80.6,82.5,19.4,17.5,1.9,-1.9,D,D
1,Alpine,3,386,340,229,218,615,558,62.8,60.9,37.2,39.1,-1.9,1.9,D,D
2,Amador,5,6237,6411,11356,10971,17593,17382,35.5,36.9,64.5,63.1,1.4,-1.4,R,R
3,Butte,7,41500,31774,47226,35182,88726,66956,46.8,47.5,53.2,52.5,0.7,-0.7,R,R
4,Calaveras,9,7765,7106,13845,11450,21610,18556,35.9,38.3,64.1,61.7,2.4,-2.4,R,R


#### Which county was most against the recall?

In [60]:
# Five counties with the highest "no" share, largest first.
recall_df.sort_values(by="no_percent", ascending=False).head(n=5)

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
37,San Francisco,75,312181,231861,49181,35571,361362,267432,86.4,86.7,13.6,13.3,0.3,-0.3,D,D
20,Marin,41,103671,79616,26750,14873,130421,94489,79.5,84.3,20.5,15.7,4.8,-4.8,D,D
0,Alameda,1,462558,287014,111677,60716,574235,347730,80.6,82.5,19.4,17.5,1.9,-1.9,D,D
43,Santa Cruz,87,91523,60654,27665,15004,119188,75658,76.8,80.2,23.2,19.8,3.4,-3.4,D,D
40,San Mateo,81,213282,174757,70242,46076,283524,220833,75.2,79.1,24.8,20.9,3.9,-3.9,D,D


In [61]:
# Row(s) whose "no" share equals the maximum.
recall_df.loc[recall_df["no_percent"].eq(recall_df["no_percent"].max())]

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
37,San Francisco,75,312181,231861,49181,35571,361362,267432,86.4,86.7,13.6,13.3,0.3,-0.3,D,D


#### Which county was most for it?

In [62]:
# Five counties with the lowest "no" share (ascending is the default order).
recall_df.sort_values(by="no_percent").head(n=5)

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
17,Lassen,35,2043,1357,6973,6590,9016,7947,22.7,17.1,77.3,82.9,-5.6,5.6,R,R
24,Modoc,49,820,706,2628,2505,3448,3211,23.8,22.0,76.2,78.0,-1.8,1.8,R,R
51,Tehama,103,5756,2035,15137,5721,20893,7756,27.5,26.2,72.5,73.8,-1.3,1.3,R,R
10,Glenn,21,2424,1941,5908,4393,8332,6334,29.1,30.6,70.9,69.4,1.5,-1.5,R,R
44,Shasta,89,20256,15726,49825,30932,70081,46658,28.9,33.7,71.1,66.3,4.8,-4.8,R,R


In [63]:
# Row(s) whose "yes" share equals the maximum.
recall_df.loc[recall_df["yes_percent"].eq(recall_df["yes_percent"].max())]

Unnamed: 0,county,fips,dem_2018,dem_2021,rep_2018,rep_2021,votes_2018,votes_2021,d_pct_18,no_percent,r_pct_18,yes_percent,d_change,r_change,winner_2021,leader_2018
17,Lassen,35,2043,1357,6973,6590,9016,7947,22.7,17.1,77.3,82.9,-5.6,5.6,R,R


#### Get only the columns we need

In [71]:
# Keep the join keys plus the recall shares and the 2021 winner.
recall_df_slim = recall_df.loc[
    :, ["county", "fips", "no_percent", "yes_percent", "winner_2021"]
]

In [72]:
# Preview the slimmed-down election frame.
recall_df_slim.head(n=5)

Unnamed: 0,county,fips,no_percent,yes_percent,winner_2021
0,Alameda,1,82.5,17.5,D
1,Alpine,3,60.9,39.1,D
2,Amador,5,36.9,63.1,R
3,Butte,7,47.5,52.5,R
4,Calaveras,9,38.3,61.7,R


---

## Merge the vax data with the election results

#### Use the `merge()` method

In [80]:
# Join the two county frames on both shared keys using DataFrame.merge().
# how="inner" spells out the default: only counties present in both frames
# are kept. validate="one_to_one" raises pandas.errors.MergeError if either
# frame repeats a (fips, county) pair, instead of silently duplicating rows.
merge_df = vax_slim.merge(
    recall_df_slim,
    on=["fips", "county"],
    how="inner",
    validate="one_to_one",
)

In [81]:
# Preview the merged vaccination + election frame.
merge_df.head(n=5)

Unnamed: 0,county,fips,fully_vaccinated_percent,population,no_percent,yes_percent,winner_2021
0,Alameda,1,0.815246,1643700.0,82.5,17.5,D
1,Alpine,3,0.625436,1148.0,60.9,39.1,D
2,Amador,5,0.542864,37829.0,36.9,63.1,R
3,Butte,7,0.502962,227075.0,47.5,52.5,R
4,Calaveras,9,0.532641,45235.0,38.3,61.7,R


#### Correlation? 

---

In [6]:
# NOTE(review): this scatter plot is intentionally left commented out.
# It plots `merge_df` (the merged frame), with the "no" share on x and the
# fully vaccinated share on y, colored by the 2021 winner.
# domain = ["R", "D"]
# range_ = ["#d94f54", "#5789b8"]

# alt.Chart(merge_df).mark_circle(size=100).encode(
#     x=alt.X(
#         "no_percent",
#         scale=alt.Scale(domain=[10, 100]),
#         axis=alt.Axis(tickCount=6, title="No percentage"),
#     ),
#     y=alt.Y(
#         "fully_vaccinated_percent",
#         scale=alt.Scale(domain=[0.2, 1]),
#         axis=alt.Axis(tickCount=6, title="Fully vaccinated percentage", format="%"),
#     ),
#     #     size="population",
#     color=alt.Color("winner_2021", scale=alt.Scale(domain=domain, range=range_)),
#     #     tooltip=["county", "population"],
# ).properties(width=500, height=500)