In [11]:
# Dependencies
import pandas as pd
import datetime

In [12]:
# Store filepath in a variable
flu_data = "covid_ca_csv.csv"

In [13]:
# Read our Data file with the pandas library
# Not every CSV requires an encoding, but be aware this can come up
flu_data_df = pd.read_csv(flu_data)

In [14]:
# Show just the header
flu_data_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,1/25/2020,Orange,California,6059.0,1,0
1,1/26/2020,Los Angeles,California,6037.0,1,0
2,1/26/2020,Orange,California,6059.0,1,0
3,1/27/2020,Los Angeles,California,6037.0,1,0
4,1/27/2020,Orange,California,6059.0,1,0


In [15]:
# Converting irregular date column to standard date format in Pandas
flu_data_df['Standard_Date']=pd.to_datetime(flu_data_df.date,errors='coerce')
flu_data_df

Unnamed: 0,date,county,state,fips,cases,deaths,Standard_Date
0,1/25/2020,Orange,California,6059.0,1,0,2020-01-25
1,1/26/2020,Los Angeles,California,6037.0,1,0,2020-01-26
2,1/26/2020,Orange,California,6059.0,1,0,2020-01-26
3,1/27/2020,Los Angeles,California,6037.0,1,0,2020-01-27
4,1/27/2020,Orange,California,6059.0,1,0,2020-01-27
...,...,...,...,...,...,...,...
6154,6/30/2020,Tulare,California,6107.0,4156,126,2020-06-30
6155,6/30/2020,Tuolumne,California,6109.0,30,0,2020-06-30
6156,6/30/2020,Ventura,California,6111.0,2926,46,2020-06-30
6157,6/30/2020,Yolo,California,6113.0,526,24,2020-06-30


In [16]:
# Removing irregular date from table - no longer required
flu_data_clean_df = flu_data_df.drop(columns=['date'])
flu_data_clean_df

Unnamed: 0,county,state,fips,cases,deaths,Standard_Date
0,Orange,California,6059.0,1,0,2020-01-25
1,Los Angeles,California,6037.0,1,0,2020-01-26
2,Orange,California,6059.0,1,0,2020-01-26
3,Los Angeles,California,6037.0,1,0,2020-01-27
4,Orange,California,6059.0,1,0,2020-01-27
...,...,...,...,...,...,...
6154,Tulare,California,6107.0,4156,126,2020-06-30
6155,Tuolumne,California,6109.0,30,0,2020-06-30
6156,Ventura,California,6111.0,2926,46,2020-06-30
6157,Yolo,California,6113.0,526,24,2020-06-30


In [17]:
# converting the standard date format to the week number in the year
flu_data_clean_df['week'] = flu_data_clean_df['Standard_Date'].dt.strftime('%U')
flu_data_clean_df

Unnamed: 0,county,state,fips,cases,deaths,Standard_Date,week
0,Orange,California,6059.0,1,0,2020-01-25,03
1,Los Angeles,California,6037.0,1,0,2020-01-26,04
2,Orange,California,6059.0,1,0,2020-01-26,04
3,Los Angeles,California,6037.0,1,0,2020-01-27,04
4,Orange,California,6059.0,1,0,2020-01-27,04
...,...,...,...,...,...,...,...
6154,Tulare,California,6107.0,4156,126,2020-06-30,26
6155,Tuolumne,California,6109.0,30,0,2020-06-30,26
6156,Ventura,California,6111.0,2926,46,2020-06-30,26
6157,Yolo,California,6113.0,526,24,2020-06-30,26


In [18]:
# streamlined table
flu_data_relevant_columns_df = flu_data_clean_df[['state', 'cases', 'deaths', 'week']]
flu_data_relevant_columns_df


Unnamed: 0,state,cases,deaths,week
0,California,1,0,03
1,California,1,0,04
2,California,1,0,04
3,California,1,0,04
4,California,1,0,04
...,...,...,...,...
6154,California,4156,126,26
6155,California,30,0,26
6156,California,2926,46,26
6157,California,526,24,26


In [20]:
# code to add up cases if week is same

flu_data_relevant_group_columns_df = flu_data_relevant_columns_df.groupby(["week"])

Total_cases_per_week_df = flu_data_relevant_group_columns_df["cases"].sum()

Total_cases_per_week_df


week
03          1
04         16
05         42
06         48
07         54
08        138
09        419
10       1618
11       6585
12      24459
13      69737
14     132124
15     188891
16     262315
17     340914
18     424961
19     512828
20     604519
21     714773
22     845819
23     985096
24    1143959
25    1370248
26     673103
Name: cases, dtype: int64

In [22]:
# code to add up deaths if week is same

Total_deaths_per_week_df = flu_data_relevant_group_columns_df["deaths"].sum()

Total_deaths_per_week_df

week
03        0
04        0
05        0
06        0
07        0
08        0
09        4
10       24
11      119
12      503
13     1527
14     3464
15     6231
16     9999
17    13778
18    17339
19    20990
20    24630
21    27818
22    30971
23    33962
24    36973
25    40027
26    17999
Name: deaths, dtype: int64

In [35]:
# cleaned COVID 19 table 

COVID_table_df = pd.DataFrame({"cases": Total_cases_per_week_df,"deaths": Total_deaths_per_week_df,"state": "california"})
COVID_table_df


Unnamed: 0_level_0,cases,deaths,state
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,1,0,california
4,16,0,california
5,42,0,california
6,48,0,california
7,54,0,california
8,138,0,california
9,419,4,california
10,1618,24,california
11,6585,119,california
12,24459,503,california


In [37]:
# Export file as a CSV, without the Pandas index, but with the header
COVID_table_df.to_csv("COVID_cleaned_data.csv", index=True, header=True )