### Lingjun Guo Final Project

In [1]:
import numpy as np
import pandas as pd

%load_ext lab_black

#### Import Vaccine Data

In [2]:
src = pd.read_csv(
    "https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/cdph-vaccination-zipcode-totals.csv",
    dtype={"zip": str},
)

In [3]:
len(src)

75680

### Remove any probablematic values by creating a new dataframe where none of the records have nan/inf values

In [4]:
df_clean = src[~src.isin([np.nan, np.inf, -np.inf]).any(1)].copy()

### Only a few rows were causing that trouble

In [5]:
len(df_clean)

75637

### Let's just limit to LA explicitly from the top if we only care about LA

In [6]:
df = df_clean[df_clean["county"] == "Los Angeles"].copy()

In [7]:
len(df)

13330

In [8]:
df.dtypes

date                             object
id                                int64
county                           object
fips                              int64
population                      float64
partially_vaccinated              int64
at_least_one_dose                 int64
fully_vaccinated                  int64
partially_vaccinated_percent    float64
at_least_one_dose_percent       float64
fully_vaccinated_percent        float64
dtype: object

In [9]:
df["fully_vaccinated_percent"] = (df["fully_vaccinated_percent"] * 100).astype(int)
df["partially_vaccinated_percent"] = (df["partially_vaccinated_percent"] * 100).astype(
    int
)
df["at_least_one_dose_percent"] = (df["at_least_one_dose_percent"] * 100).astype(int)

In [10]:
df = df[df["fully_vaccinated_percent"] < 100].copy()

In [11]:
len(df)

13255

In [12]:
df.tail()

Unnamed: 0,date,id,county,fips,population,partially_vaccinated,at_least_one_dose,fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent
74842,2021-12-07,93550,Los Angeles,37,77006.0,5058,47372,42314,7,62,55
74843,2021-12-07,93551,Los Angeles,37,51085.0,3001,35099,32098,6,69,63
74844,2021-12-07,93552,Los Angeles,37,40266.0,2747,27054,24307,7,67,60
74845,2021-12-07,93553,Los Angeles,37,1781.0,84,965,881,5,54,49
74850,2021-12-07,93591,Los Angeles,37,6579.0,414,3772,3358,6,56,51


### Import Demographic Data

In [13]:
data = pd.read_csv("./demographics .csv")

In [14]:
data.head()

Unnamed: 0,objectid,zip,po_name,pop2012,white,black,ameri_es,asian,hawn_pi,hispanic,other,mult_race,med_age,ave_hh_sz,marhh_chd,families,ave_fam_sz,hse_units,sqmi
0,4800,94002,Belmont,26832,19634,477,84,5762,221,3517,1170,1744,41.0,2.42,2925,7671,2.98,12304,5.9
1,4801,94010,Burlingame,41102,31610,482,93,10244,188,5133,1865,2166,42.6,2.43,4798,12179,3.06,20020,13.0
2,4900,93442,Morro Bay,10873,9395,47,104,281,10,1602,645,322,48.9,2.09,593,2749,2.7,6609,43.2
3,4901,93445,Oceano,7633,4683,60,114,155,7,3389,1477,283,34.7,2.86,541,1582,3.42,2871,2.0
4,4802,94015,Daly City,61575,14279,2138,168,35940,482,9775,4520,2939,39.0,3.1,4637,13991,3.54,20103,5.7


In [15]:
data.describe()

Unnamed: 0,objectid,zip,pop2012,white,black,ameri_es,asian,hawn_pi,hispanic,other,mult_race,med_age,ave_hh_sz,marhh_chd,families,ave_fam_sz,hse_units,sqmi
count,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0,1704.0
mean,5645.5,91850.103286,22114.285798,12562.607394,1344.949531,208.768779,2846.052817,80.890845,8210.019366,3698.616197,1060.163732,33.554871,-1.046408,1859.199531,5060.869131,-0.564173,8007.593897,92.571185
std,492.046746,12832.547024,21698.306405,12143.404948,2909.9336,265.243073,5172.476992,186.545179,12851.745586,6174.87805,1126.0236,27.735537,19.677049,1988.062816,4979.046546,19.928976,7411.551056,244.416079
min,4794.0,12.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.1
25%,5219.75,92123.75,1984.75,1346.5,17.0,27.0,26.0,2.0,293.25,92.0,76.0,32.175,2.27,119.0,427.25,2.82,928.0,5.3
50%,5645.5,93610.5,17365.5,9535.0,313.0,121.0,745.0,25.0,2904.5,1026.5,724.5,38.05,2.625,1306.0,3864.0,3.14,6449.0,17.05
75%,6071.25,95345.25,36305.0,20774.25,1321.5,305.0,3310.25,88.0,10678.25,4584.25,1785.0,44.8,3.1,3023.75,8351.5,3.54,13473.5,87.4
max,6497.0,96162.0,107268.0,71406.0,38816.0,2565.0,43045.0,2705.0,115824.0,56308.0,6559.0,74.7,100.5,13191.0,27802.0,146.0,37761.0,3785.6


#### Clean Up

In [16]:
df.head()

Unnamed: 0,date,id,county,fips,population,partially_vaccinated,at_least_one_dose,fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent
0,2021-01-05,90004,Los Angeles,37,61586.0,913,932,19,1,2,0
1,2021-01-05,90005,Los Angeles,37,39479.0,425,444,19,1,1,0
2,2021-01-05,90008,Los Angeles,37,31739.0,304,317,13,1,1,0
3,2021-01-05,90011,Los Angeles,37,109414.0,538,550,12,0,1,0
4,2021-01-05,90012,Los Angeles,37,35913.0,537,555,18,1,2,0


In [17]:
df.tail()

Unnamed: 0,date,id,county,fips,population,partially_vaccinated,at_least_one_dose,fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent
74842,2021-12-07,93550,Los Angeles,37,77006.0,5058,47372,42314,7,62,55
74843,2021-12-07,93551,Los Angeles,37,51085.0,3001,35099,32098,6,69,63
74844,2021-12-07,93552,Los Angeles,37,40266.0,2747,27054,24307,7,67,60
74845,2021-12-07,93553,Los Angeles,37,1781.0,84,965,881,5,54,49
74850,2021-12-07,93591,Los Angeles,37,6579.0,414,3772,3358,6,56,51


In [18]:
df.county.value_counts().reset_index()

Unnamed: 0,index,county
0,Los Angeles,13255


#### Compare the latest data with back then April

In [19]:
df["nudate"] = pd.to_datetime(df["date"])

In [20]:
vacc_recent = df[df["date"] == df["date"].max()]

In [21]:
vacc_early = df[df["date"] == "2021-04-20"]

In [22]:
vacc_recent[
    (vacc_recent.county == "Los Angeles")
    & (vacc_recent["fully_vaccinated_percent"] > 1)
].sort_values("fully_vaccinated_percent", ascending=False).head()

Unnamed: 0,date,id,county,fips,population,partially_vaccinated,at_least_one_dose,fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent,nudate
74097,2021-12-07,90021,Los Angeles,37,2922.0,637,3431,2794,22,117,96,2021-12-07
74181,2021-12-07,90401,Los Angeles,37,7256.0,1122,7916,6794,15,109,94,2021-12-07
74120,2021-12-07,90045,Los Angeles,37,41149.0,4546,43213,38667,11,105,94,2021-12-07
74255,2021-12-07,91105,Los Angeles,37,12191.0,999,11934,10935,8,98,90,2021-12-07
74149,2021-12-07,90232,Los Angeles,37,14660.0,1265,14405,13140,9,98,90,2021-12-07


In [23]:
vacc_early[(vacc_early["fully_vaccinated_percent"] > 1)].sort_values(
    "fully_vaccinated_percent", ascending=False
).head(5)

Unnamed: 0,date,id,county,fips,population,partially_vaccinated,at_least_one_dose,fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent,nudate
21447,2021-04-20,90067,Los Angeles,37,2314.0,562,2380,1818,24,103,79,2021-04-20
21568,2021-04-20,91105,Los Angeles,37,12191.0,2458,9113,6655,20,75,55,2021-04-20
21564,2021-04-20,91046,Los Angeles,37,143.0,26,105,79,18,73,55,2021-04-20
21454,2021-04-20,90210,Los Angeles,37,19909.0,2892,13196,10304,15,66,52,2021-04-20
21624,2021-04-20,91436,Los Angeles,37,15070.0,2093,9802,7709,14,65,51,2021-04-20


In [24]:
vacc_early.groupby(["id"]).agg(
    {"fully_vaccinated_percent": "mean"}
).reset_index().sort_values("fully_vaccinated_percent", ascending=False).head(15)

Unnamed: 0,id,fully_vaccinated_percent
56,90067,79
166,91105,55
162,91046,55
62,90210,52
217,91436,51
59,90077,49
102,90402,48
86,90272,48
53,90064,47
58,90069,47


In [25]:
vacc_recent.groupby(["id"]).agg(
    {"fully_vaccinated_percent": "mean"}
).reset_index().sort_values("fully_vaccinated_percent", ascending=False).head(15)

Unnamed: 0,id,fully_vaccinated_percent
18,90021,96
99,90401,94
41,90045,94
163,91105,90
67,90232,90
52,90064,86
242,91754,86
92,90292,85
71,90245,82
60,90210,81


#### Merge april vax with the demographic data

In [26]:
April = pd.merge(data, vacc_early, left_on="zip", right_on="id", how="right",)

In [27]:
April.drop(["id"], axis=1, inplace=True)

In [28]:
len(April)

275

In [29]:
April.head()

Unnamed: 0,objectid,zip,po_name,pop2012,white,black,ameri_es,asian,hawn_pi,hispanic,...,county,fips,population,partially_vaccinated,at_least_one_dose,fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent,nudate
0,5965.0,90001.0,Los Angeles,57464.0,20670.0,6077.0,413.0,147.0,30.0,51020.0,...,Los Angeles,37,58975.0,15996,29856,13860,27,51,24,2021-04-20
1,5966.0,90002.0,Los Angeles,52005.0,14211.0,12809.0,360.0,140.0,32.0,36703.0,...,Los Angeles,37,53111.0,6145,14928,8783,12,28,17,2021-04-20
2,5967.0,90003.0,Los Angeles,66202.0,19512.0,15976.0,517.0,153.0,56.0,48864.0,...,Los Angeles,37,72741.0,8398,20048,11650,12,28,16,2021-04-20
3,5843.0,90004.0,Los Angeles,62549.0,23006.0,2197.0,489.0,16411.0,50.0,31237.0,...,Los Angeles,37,61586.0,10070,28512,18442,16,46,30,2021-04-20
4,5844.0,90005.0,Los Angeles,40852.0,12021.0,2126.0,337.0,15081.0,40.0,20497.0,...,Los Angeles,37,39479.0,6858,16921,10063,17,43,25,2021-04-20


In [30]:
LA_April = (
    April[April["county"] == "Los Angeles"]
    .sort_values("fully_vaccinated_percent", ascending=False)
    .head(5)
)
LA_April

Unnamed: 0,objectid,zip,po_name,pop2012,white,black,ameri_es,asian,hawn_pi,hispanic,...,county,fips,population,partially_vaccinated,at_least_one_dose,fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent,nudate
56,5864.0,90067.0,Los Angeles,2471.0,2204.0,26.0,2.0,265.0,2.0,96.0,...,Los Angeles,37,2314.0,562,2380,1818,24,103,79,2021-04-20
166,5954.0,91105.0,Pasadena,11771.0,8993.0,536.0,37.0,1706.0,7.0,1919.0,...,Los Angeles,37,12191.0,2458,9113,6655,20,75,55,2021-04-20
162,,,,,,,,,,,...,Los Angeles,37,143.0,26,105,79,18,73,55,2021-04-20
62,5870.0,90210.0,Beverly Hills,21265.0,18965.0,398.0,19.0,1239.0,11.0,1126.0,...,Los Angeles,37,19909.0,2892,13196,10304,15,66,52,2021-04-20
217,5917.0,91436.0,Encino,14759.0,13695.0,346.0,18.0,844.0,12.0,1014.0,...,Los Angeles,37,15070.0,2093,9802,7709,14,65,51,2021-04-20


#### Merge latest vax with the demographic data

In [31]:
recent = pd.merge(data, vacc_recent, left_on="zip", right_on="id", how="right",)

In [32]:
recent.drop(["id"], axis=1, inplace=True)

In [33]:
len(recent)

271

In [34]:
LA_recent = recent[recent["county"] == "Los Angeles"].sort_values(
    "fully_vaccinated_percent", ascending=False
)
LA_recent.head()

Unnamed: 0,objectid,zip,po_name,pop2012,white,black,ameri_es,asian,hawn_pi,hispanic,...,county,fips,population,partially_vaccinated,at_least_one_dose,fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent,nudate
18,5995,90021,Los Angeles,2741,950,506,22,114,2,911,...,Los Angeles,37,2922.0,637,3431,2794,22,117,96,2021-12-07
99,5839,90401,Santa Monica,6775,6082,433,34,791,10,813,...,Los Angeles,37,7256.0,1122,7916,6794,15,109,94,2021-12-07
41,5879,90045,Los Angeles,39026,25512,5738,161,4929,144,7641,...,Los Angeles,37,41149.0,4546,43213,38667,11,105,94,2021-12-07
163,5954,91105,Pasadena,11771,8993,536,37,1706,7,1919,...,Los Angeles,37,12191.0,999,11934,10935,8,98,90,2021-12-07
67,5874,90232,Culver City,16708,9442,934,75,2335,42,3959,...,Los Angeles,37,14660.0,1265,14405,13140,9,98,90,2021-12-07


In [35]:
LA_recent["white_percent"] = LA_recent["white"] / LA_recent["population"]
LA_recent["black_percent"] = LA_recent["black"] / LA_recent["population"]
LA_recent["hispanic_percent"] = LA_recent["hispanic"] / LA_recent["population"]
LA_recent["asian_percent"] = LA_recent["asian"] / LA_recent["population"]

### There doesn't seem to be a string relationship

In [36]:
corr = LA_recent[
    [
        "zip",
        "white_percent",
        "black_percent",
        "hispanic_percent",
        "asian_percent",
        "fully_vaccinated_percent",
    ]
].corr(method="pearson")

In [37]:
corr

Unnamed: 0,zip,white_percent,black_percent,hispanic_percent,asian_percent,fully_vaccinated_percent
zip,1.0,0.151704,-0.203632,-0.062629,0.091792,-0.255485
white_percent,0.151704,1.0,-0.486708,-0.382555,-0.270401,0.210322
black_percent,-0.203632,-0.486708,1.0,0.018859,-0.231183,-0.19217
hispanic_percent,-0.062629,-0.382555,0.018859,1.0,-0.264849,-0.420848
asian_percent,0.091792,-0.270401,-0.231183,-0.264849,1.0,0.391493
fully_vaccinated_percent,-0.255485,0.210322,-0.19217,-0.420848,0.391493,1.0


In [38]:
# !pip install altair
import altair as alt

alt.Chart(
    LA_recent[
        [
            "zip",
            "white_percent",
            "black_percent",
            "hispanic_percent",
            "asian_percent",
            "fully_vaccinated_percent",
        ]
    ]
).mark_circle(size=60).encode(
    x="hispanic_percent", y="fully_vaccinated_percent",
).interactive()

#### Look at the place I live (90015)

In [39]:
vacc_recent[vacc_recent.id == 90015]

Unnamed: 0,date,id,county,fips,population,partially_vaccinated,at_least_one_dose,fully_vaccinated,partially_vaccinated_percent,at_least_one_dose_percent,fully_vaccinated_percent,nudate
74091,2021-12-07,90015,Los Angeles,37,21801.0,2937,19719,16782,13,90,77,2021-12-07
