# Midterm Project


#### Load our Python tools

In [269]:
# %load_ext lab_black

In [270]:
import pandas as pd
import altair as alt

In [271]:
# Loosen pandas' display limits so wide or long frames are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)  # None: never truncate cell text

---

#### Import statewide dataset

In [272]:
# Read the statewide cases/deaths CSV and order rows oldest-first, so that
# row-to-row differences taken later run forward in time.
covid_data = pd.read_csv(
    "https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/cdph-state-cases-deaths.csv"
).sort_values(by="date")

#### Data are cumulative, so we need to extract how many new cases there were each day

In [273]:
# Daily new cases = change in the cumulative count from the previous row.
# The first row has no predecessor, so its own cumulative count is used.
covid_data["daily_new_cases"] = covid_data["confirmed_cases"].pipe(
    lambda cumulative: cumulative.diff().fillna(cumulative)
)

In [274]:
# Preview the earliest five rows to check the new column.
covid_data.head(n=5)

Unnamed: 0,date,confirmed_cases,reported_cases,probable_cases,reported_and_probable_cases,reported_deaths,daily_new_cases
775,2020-02-01,23,3,1,4,0,23.0
774,2020-02-02,30,6,1,7,0,7.0
773,2020-02-03,35,6,1,7,0,5.0
772,2020-02-04,35,6,2,8,0,0.0
771,2020-02-05,38,6,2,8,0,3.0


In [275]:
# Preview the latest five rows.
covid_data.tail(n=5)

Unnamed: 0,date,confirmed_cases,reported_cases,probable_cases,reported_and_probable_cases,reported_deaths,daily_new_cases
4,2022-03-13,8446109,8438328,594726,9033054,86794,858.0
3,2022-03-14,8448053,8439055,594962,9034017,86792,1944.0
2,2022-03-15,8449170,8442537,595018,9037555,86927,1117.0
1,2022-03-16,8449433,8445468,595018,9040486,87045,263.0
0,2022-03-17,8449433,8450009,595018,9045027,87194,0.0


#### Create a generic month column for aggregation

In [276]:
# Tag each daily row with the first day of its month (e.g. "2020-02-01")
# so the rows can be grouped by calendar month.
covid_data["month"] = (
    covid_data["date"]
    .pipe(pd.to_datetime)
    .dt.strftime("%Y-%m-01")
)

In [277]:
# Preview the first five rows, now including the month label.
covid_data.head(n=5)

Unnamed: 0,date,confirmed_cases,reported_cases,probable_cases,reported_and_probable_cases,reported_deaths,daily_new_cases,month
775,2020-02-01,23,3,1,4,0,23.0,2020-02-01
774,2020-02-02,30,6,1,7,0,7.0,2020-02-01
773,2020-02-03,35,6,1,7,0,5.0,2020-02-01
772,2020-02-04,35,6,2,8,0,0.0,2020-02-01
771,2020-02-05,38,6,2,8,0,3.0,2020-02-01


#### Group by month to sum new cases by month

In [278]:
# One row per month holding the sum of that month's daily new cases.
covid_group_months = covid_data.groupby("month", as_index=False)[
    "daily_new_cases"
].sum()

#### How many new cases per month? 

In [279]:
# After summing, the column holds monthly totals, so name it accordingly.
covid_group_months = covid_group_months.rename(
    columns={"daily_new_cases": "new_monthly_cases"}
)

In [280]:
# Preview the first five monthly totals.
covid_group_months.head(n=5)

Unnamed: 0,month,new_monthly_cases
0,2020-02-01,466.0
1,2020-03-01,19258.0
2,2020-04-01,46543.0
3,2020-05-01,71156.0
4,2020-06-01,173945.0


In [281]:
# Monthly new cases per 100,000 residents, rounded to one decimal place.
# NOTE(review): 38000000 is a hard-coded, rounded statewide population;
# confirm it matches the population estimate this analysis should cite.
covid_group_months["new_monthly_covid_cases_per_100k"] = (
    covid_group_months["new_monthly_cases"]
    .div(38000000)
    .mul(100000)
    .round(1)
)

In [282]:
# Display the monthly table, which now includes the per-100k rate column.
covid_group_months

Unnamed: 0,month,new_monthly_cases,new_monthly_covid_cases_per_100k
0,2020-02-01,466.0,1.2
1,2020-03-01,19258.0,50.7
2,2020-04-01,46543.0,122.5
3,2020-05-01,71156.0,187.3
4,2020-06-01,173945.0,457.8
5,2020-07-01,280811.0,739.0
6,2020-08-01,149061.0,392.3
7,2020-09-01,98756.0,259.9
8,2020-10-01,120593.0,317.4
9,2020-11-01,399992.0,1052.6


#### Now that we have Altair, chart new cases by month

In [283]:
#### Omicron pops!

In [284]:
# Bar chart of new confirmed cases per month, with readable axis titles
# instead of the raw column names.
# The month labels are parsed into datetimes before charting. Date-only ISO
# strings passed straight to a temporal (":T") encoding can be interpreted as
# UTC by the browser and drawn one day early (i.e. in the previous month) for
# viewers west of UTC -- NOTE(review): confirm against the Altair date docs.
alt.Chart(
    covid_group_months.assign(month=pd.to_datetime(covid_group_months["month"]))
).mark_bar(size=10).encode(
    x=alt.X("month:T", title="Month", axis=alt.Axis(format="%b. %y")),
    y=alt.Y("new_monthly_cases:Q", title="New confirmed cases"),
).properties(title="New CA COVID cases by month")

---

#### Clean flight dataset

In [285]:
# Load the LAX monthly flight-operations export. DataExtractDate is kept as
# text rather than letting pandas guess a type for it.
flight_data = pd.read_csv(
    "data/Los_Angeles_International_Airport_-_Flight_Operations_By_Month.csv",
    dtype={"DataExtractDate": str},
)

In [286]:
# Normalise ReportPeriod (e.g. "02/01/2021 12:00:00 AM") to "YYYY-MM-DD" text
# so it can be matched against the COVID table's month labels.
# The format is given explicitly: month-first parsing is then guaranteed, no
# per-value format guessing is needed, and a row in an unexpected format
# fails loudly instead of being silently misread.
flight_data["clean_date"] = pd.to_datetime(
    flight_data["ReportPeriod"], format="%m/%d/%Y %I:%M:%S %p"
).dt.strftime("%Y-%m-%d")

In [287]:
# Preview the first five rows with the normalised date column.
flight_data.head(n=5)

Unnamed: 0,DataExtractDate,ReportPeriod,FlightType,Arrival_Departure,Domestic_International,FlightOpsCount,clean_date
0,04/01/2021 09:37:48 AM,02/01/2021 12:00:00 AM,Scheduled,Departure,International,1690,2021-02-01
1,04/18/2018 03:12:41 PM,12/01/2017 12:00:00 AM,Scheduled,Departure,Domestic,20797,2017-12-01
2,04/18/2018 03:12:41 PM,10/01/2008 12:00:00 AM,Charter,Departure,International,9,2008-10-01
3,04/18/2018 03:12:41 PM,02/01/2016 12:00:00 AM,Scheduled,Arrival,Domestic,18427,2016-02-01
4,04/18/2018 03:12:41 PM,09/01/2007 12:00:00 AM,Scheduled,Departure,International,4227,2007-09-01


In [288]:
# Spot check: select any rows whose clean_date is 2018-04-17. The displayed
# result is empty, i.e. no report period falls on that day.
# NOTE(review): the reason for checking this particular date is not recorded
# here -- keep only if it documents a real data-quality concern.
flight_data[flight_data["clean_date"]=="2018-04-17"]

Unnamed: 0,DataExtractDate,ReportPeriod,FlightType,Arrival_Departure,Domestic_International,FlightOpsCount,clean_date


#### Create a new data frame by using groupby to sum the total number of arrival/departure flights for each month

In [289]:
# Total flight operations per (month, arrival/departure) pair, in long form.
flight_arrival_departure = flight_data.groupby(
    ["clean_date", "Arrival_Departure"], as_index=False
)["FlightOpsCount"].sum()

In [290]:
# Reshape to one row per month with one column per Arrival_Departure value.
# aggfunc is stated explicitly: pivot_table defaults to "mean", which would
# silently average counts if a (month, type) pair ever appeared more than
# once. "sum" is the right aggregation for counts and gives the same values
# when each pair is unique.
flight_arrival_departure_type = flight_arrival_departure.pivot_table(
    index="clean_date",
    columns="Arrival_Departure",
    values="FlightOpsCount",
    aggfunc="sum",
).reset_index()

In [291]:
# Preview the first five months of the arrival/departure table.
flight_arrival_departure_type.head(n=5)

Arrival_Departure,clean_date,Arrival,Departure
0,2006-01-01,24104,23844
1,2006-02-01,21365,21394
2,2006-03-01,24180,24222
3,2006-04-01,23279,23327
4,2006-05-01,24255,24279


#### Create a data frame by using groupby to sum the total number of domestic/international flights for each month

In [292]:
# Total flight operations per (month, domestic/international) pair, in long form.
flight_domestic_international_group = flight_data.groupby(
    ["clean_date", "Domestic_International"], as_index=False
)["FlightOpsCount"].sum()

In [293]:
# Reshape to one row per month with one column per Domestic_International value.
# aggfunc is stated explicitly: pivot_table defaults to "mean", which would
# silently average counts if a (month, type) pair ever appeared more than
# once. "sum" is the right aggregation for counts and gives the same values
# when each pair is unique.
flight_domestic_international_type = flight_domestic_international_group.pivot_table(
    index="clean_date",
    columns="Domestic_International",
    values="FlightOpsCount",
    aggfunc="sum",
).reset_index()

In [294]:
# Total monthly operations = domestic + international.
flight_domestic_international_type["Total"] = flight_domestic_international_type[
    "Domestic"
].add(flight_domestic_international_type["International"])

In [295]:
# Preview the first five months of the domestic/international table.
flight_domestic_international_type.head(n=5)

Domestic_International,clean_date,Domestic,International,Total
0,2006-01-01,39471,8477,47948
1,2006-02-01,35723,7036,42759
2,2006-03-01,40419,7983,48402
3,2006-04-01,38761,7845,46606
4,2006-05-01,40492,8042,48534


#### Merge the monthly covid data frame with the number of arrival/departure flights for each month

In [296]:
# Attach monthly arrival/departure counts to the monthly COVID table.
# how="inner" is spelled out because it is a deliberate choice: months that
# appear in only one of the two tables are dropped.
# validate="one_to_one" makes the merge fail loudly if either table ever has
# more than one row per month, instead of silently duplicating rows.
covid_flight_arrival_domestic = pd.merge(
    covid_group_months,
    flight_arrival_departure_type,
    how="inner",
    left_on="month",
    right_on="clean_date",
    validate="one_to_one",
)

In [297]:
# Preview the first five merged months.
covid_flight_arrival_domestic.head(n=5)

Unnamed: 0,month,new_monthly_cases,new_monthly_covid_cases_per_100k,clean_date,Arrival,Departure
0,2020-02-01,466.0,1.2,2020-02-01,23152,23174
1,2020-03-01,19258.0,50.7,2020-03-01,21049,20990
2,2020-04-01,46543.0,122.5,2020-04-01,5541,5560
3,2020-05-01,71156.0,187.3,2020-05-01,4677,4683
4,2020-06-01,173945.0,457.8,2020-06-01,6186,6314


#### Merge the covid+arrival/departure flights data frame with the domestic/international flight data frame

In [298]:
# Add the domestic/international/total counts to the merged table.
# how="inner" is spelled out because it is a deliberate choice: months that
# appear in only one of the two tables are dropped.
# validate="one_to_one" makes the merge fail loudly if either side ever has
# more than one row per clean_date, instead of silently duplicating rows.
covid_flight_merge = pd.merge(
    covid_flight_arrival_domestic,
    flight_domestic_international_type,
    how="inner",
    on="clean_date",
    validate="one_to_one",
)

In [299]:
# Preview the first five rows of the fully merged table.
covid_flight_merge.head(n=5)

Unnamed: 0,month,new_monthly_cases,new_monthly_covid_cases_per_100k,clean_date,Arrival,Departure,Domestic,International,Total
0,2020-02-01,466.0,1.2,2020-02-01,23152,23174,38039,8287,46326
1,2020-03-01,19258.0,50.7,2020-03-01,21049,20990,35547,6492,42039
2,2020-04-01,46543.0,122.5,2020-04-01,5541,5560,9903,1198,11101
3,2020-05-01,71156.0,187.3,2020-05-01,4677,4683,8240,1120,9360
4,2020-06-01,173945.0,457.8,2020-06-01,6186,6314,11184,1316,12500


---

#### How many and which unique months have a level 1 travel health notice level? (covid rate less than 50 cases per 100k)

In [300]:
# Months whose rate is strictly below 50 cases per 100k. The comparison is
# strict to match the stated definition ("less than 50"); the previous <=
# would also have counted a month at exactly 50.0.
min_covid = covid_flight_merge[
    covid_flight_merge["new_monthly_covid_cases_per_100k"] < 50
]

In [301]:
# Number of months in the low-rate group.
min_covid.shape[0]

1

In [302]:
# Show which months fall in the low-rate group.
min_covid

Unnamed: 0,month,new_monthly_cases,new_monthly_covid_cases_per_100k,clean_date,Arrival,Departure,Domestic,International,Total
0,2020-02-01,466.0,1.2,2020-02-01,23152,23174,38039,8287,46326


#### How many and which unique months have a level 4 travel health notice level? (covid rate more than 500 cases per 100k)

In [303]:
# Months whose rate is strictly above 500 cases per 100k. The comparison is
# strict to match the stated definition ("more than 500"); the previous >=
# would also have counted a month at exactly 500.0.
max_covid = covid_flight_merge[
    covid_flight_merge["new_monthly_covid_cases_per_100k"] > 500
]

In [304]:
# Number of months in the high-rate group.
max_covid.shape[0]

7

In [305]:
# Show which months fall in the high-rate group.
max_covid

Unnamed: 0,month,new_monthly_cases,new_monthly_covid_cases_per_100k,clean_date,Arrival,Departure,Domestic,International,Total
5,2020-07-01,280811.0,739.0,2020-07-01,10071,10058,18047,2082,20129
9,2020-11-01,399992.0,1052.6,2020-11-01,11548,11530,19796,3282,23078
10,2020-12-01,1163609.0,3062.1,2020-12-01,12681,12645,21221,4105,25326
11,2021-01-01,834493.0,2196.0,2021-01-01,12377,12466,20285,4558,24843
18,2021-08-01,387630.0,1020.1,2021-08-01,20957,21052,35818,6191,42009
19,2021-09-01,231502.0,609.2,2021-09-01,19471,19441,33353,5559,38912
22,2021-12-01,715175.0,1882.0,2021-12-01,19559,19549,32070,7038,39108


#### What month had the lowest covid rate per 100K?

In [306]:
# Five months with the lowest rate per 100k; the lowest is the first row.
covid_flight_merge.sort_values(by="new_monthly_covid_cases_per_100k").head(n=5)

Unnamed: 0,month,new_monthly_cases,new_monthly_covid_cases_per_100k,clean_date,Arrival,Departure,Domestic,International,Total
0,2020-02-01,466.0,1.2,2020-02-01,23152,23174,38039,8287,46326
1,2020-03-01,19258.0,50.7,2020-03-01,21049,20990,35547,6492,42039
16,2021-06-01,33002.0,86.8,2021-06-01,19189,19133,33101,5221,38322
15,2021-05-01,34990.0,92.1,2021-05-01,16795,16884,28983,4696,33679
2,2020-04-01,46543.0,122.5,2020-04-01,5541,5560,9903,1198,11101


#### What month had the highest covid rate per 100k?

In [307]:
# Five months with the highest rate per 100k; the highest is the first row.
covid_flight_merge.sort_values(
    by="new_monthly_covid_cases_per_100k", ascending=False
).head(n=5)

Unnamed: 0,month,new_monthly_cases,new_monthly_covid_cases_per_100k,clean_date,Arrival,Departure,Domestic,International,Total
10,2020-12-01,1163609.0,3062.1,2020-12-01,12681,12645,21221,4105,25326
11,2021-01-01,834493.0,2196.0,2021-01-01,12377,12466,20285,4558,24843
22,2021-12-01,715175.0,1882.0,2021-12-01,19559,19549,32070,7038,39108
9,2020-11-01,399992.0,1052.6,2020-11-01,11548,11530,19796,3282,23078
18,2021-08-01,387630.0,1020.1,2021-08-01,20957,21052,35818,6191,42009


#### When were international flight operations at their lowest, and how many covid cases per 100k were there that month?

In [308]:
# Row(s) of the month with the fewest international flight operations.
covid_flight_merge.loc[
    lambda merged: merged["International"].eq(merged["International"].min())
]

Unnamed: 0,month,new_monthly_cases,new_monthly_covid_cases_per_100k,clean_date,Arrival,Departure,Domestic,International,Total
3,2020-05-01,71156.0,187.3,2020-05-01,4677,4683,8240,1120,9360


#### When were total flight operations at their highest, and how many covid cases per 100k were there that month?

In [309]:
# Row(s) of the month with the most total flight operations.
covid_flight_merge.loc[
    lambda merged: merged["Total"].eq(merged["Total"].max())
]

Unnamed: 0,month,new_monthly_cases,new_monthly_covid_cases_per_100k,clean_date,Arrival,Departure,Domestic,International,Total
0,2020-02-01,466.0,1.2,2020-02-01,23152,23174,38039,8287,46326


#### Correlation between covid rate and domestic/international flights in 2020/2021?

In [318]:
# Keep only the 2020 months. The month label is "YYYY-MM-01" text, so the
# year is matched as a prefix; str.contains would do an unanchored regex
# search and match "2020" anywhere in the string.
covid_2020 = covid_flight_merge[covid_flight_merge["month"].str.startswith("2020")]

In [319]:
# Narrow the 2020 rows to the month label plus the three series of interest.
merge_slim_2020 = covid_2020.loc[
    :, ["month", "new_monthly_covid_cases_per_100k", "Domestic", "International"]
]

In [320]:
# Pairwise Pearson correlations between the 2020 rate and flight counts.
# Only numeric columns are passed to corr(): the frame also holds the text
# "month" column, and on pandas >= 2.0 corr() raises on non-numeric columns
# instead of silently dropping them.
merge_slim_2020.select_dtypes(include="number").corr(method="pearson")

Unnamed: 0,new_monthly_covid_cases_per_100k,Domestic,International
new_monthly_covid_cases_per_100k,1.0,-0.053145,-0.01082
Domestic,-0.053145,1.0,0.973403
International,-0.01082,0.973403,1.0


In [321]:
### Very weak (near-zero) correlation between covid rate and flights in 2020 (r ≈ -0.05 domestic, r ≈ -0.01 international); statistical significance was not tested

In [322]:
# Keep only the 2021 months. The month label is "YYYY-MM-01" text, so the
# year is matched as a prefix; str.contains would do an unanchored regex
# search and match "2021" anywhere in the string.
covid_2021 = covid_flight_merge[covid_flight_merge["month"].str.startswith("2021")]

In [323]:
# Narrow the 2021 rows to the month label plus the three series of interest.
merge_slim_2021 = covid_2021.loc[
    :, ["month", "new_monthly_covid_cases_per_100k", "Domestic", "International"]
]

In [324]:
# Pairwise Pearson correlations between the 2021 rate and flight counts.
# Only numeric columns are passed to corr(): the frame also holds the text
# "month" column, and on pandas >= 2.0 corr() raises on non-numeric columns
# instead of silently dropping them.
merge_slim_2021.select_dtypes(include="number").corr(method="pearson")

Unnamed: 0,new_monthly_covid_cases_per_100k,Domestic,International
new_monthly_covid_cases_per_100k,1.0,-0.126329,0.346053
Domestic,-0.126329,1.0,0.838449
International,0.346053,0.838449,1.0


In [325]:
### Slightly stronger, but still weak, correlation between covid rate and domestic/international flights in 2021 (r ≈ -0.13 domestic, r ≈ 0.35 international)