In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
##reading the confirmed-covid-19-deaths-in-us-by-state-and-county copy.csv file
df = pd.read_csv("confirmed-covid-19-deaths-in-us-by-state-and-county copy.csv")
df.tail()

Unnamed: 0.1,Unnamed: 0,county_fips,county_name,state_name,state_fips,date,deaths,lat,long,geometry
600655,600655,56045,Weston County,WY,56,2020-07-23,0,43.839612,-104.567488,POINT (-104.5674881 43.83961191)
600656,600656,56045,Weston County,WY,56,2020-07-24,0,43.839612,-104.567488,POINT (-104.5674881 43.83961191)
600657,600657,56045,Weston County,WY,56,2020-07-25,0,43.839612,-104.567488,POINT (-104.5674881 43.83961191)
600658,600658,56045,Weston County,WY,56,2020-07-26,0,43.839612,-104.567488,POINT (-104.5674881 43.83961191)
600659,600659,56045,Weston County,WY,56,2020-07-27,0,43.839612,-104.567488,POINT (-104.5674881 43.83961191)


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600660 entries, 0 to 600659
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   600660 non-null  int64  
 1   county_fips  600660 non-null  int64  
 2   county_name  600660 non-null  object 
 3   state_name   600660 non-null  object 
 4   state_fips   600660 non-null  int64  
 5   date         600660 non-null  object 
 6   deaths       600660 non-null  int64  
 7   lat          590696 non-null  float64
 8   long         590696 non-null  float64
 9   geometry     590696 non-null  object 
dtypes: float64(2), int64(4), object(4)
memory usage: 45.8+ MB


In [4]:
df.isna().sum()
##there are no null values except for lat and long which we can ignore 

Unnamed: 0        0
county_fips       0
county_name       0
state_name        0
state_fips        0
date              0
deaths            0
lat            9964
long           9964
geometry       9964
dtype: int64

In [5]:
df.nunique()

Unnamed: 0     600660
county_fips      3146
county_name      1882
state_name         51
state_fips         51
date              188
deaths           2233
lat              3142
long             3142
geometry         3142
dtype: int64

In [6]:
##changing data types in the dataset
df["deaths"] = df["deaths"].apply(pd.to_numeric)
df['date'] = pd.to_datetime(df['date'], format='%Y/%m/%d')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600660 entries, 0 to 600659
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Unnamed: 0   600660 non-null  int64         
 1   county_fips  600660 non-null  int64         
 2   county_name  600660 non-null  object        
 3   state_name   600660 non-null  object        
 4   state_fips   600660 non-null  int64         
 5   date         600660 non-null  datetime64[ns]
 6   deaths       600660 non-null  int64         
 7   lat          590696 non-null  float64       
 8   long         590696 non-null  float64       
 9   geometry     590696 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(4), object(3)
memory usage: 45.8+ MB


In [7]:
##standardising state names in both datasets
states = {"AL":"Alabama", "AK":"Alaska", "AZ":"Arizona", "AR":"Arkansas", "CA":"California", "CO":"Colorado", "CT":"Connecticut", 
          "DC":"Washington DC", "DE":"Delaware", "FL":"Florida", "GA":"Georgia", "HI":"Hawaii", "ID":"Idaho", "IL":"Illinois", 
          "IN":"Indiana", "IA":"Iowa", "KS":"Kansas", "KY":"Kentucky", "LA":"Louisiana", "ME":"Maine", "MD":"Maryland",
          "MA":"Massachusetts", "MI":"Michigan", "MN":"Minnesota", "MS":"Mississippi", "MO":"Missouri", "MT":"Montana",
          "NE":"Nebraska", "NV":"Nevada", "NH":"New Hampshire", "NJ":"New Jersey", "NM":"New Mexico", "NY":"New York", 
          "NC":"North Carolina", "ND":"North Dakota", "OH":"Ohio", "OK":"Oklahoma", "OR":"Oregon", "PA":"Pennsylvania", 
          "RI":"Rhode Island", "SC":"South Carolina", "SD":"South Dakota", "TN":"Tennessee", "TX":"Texas", "UT":"Utah", "VT":"Vermont",
          "VA":"Virginia", "WA":"Washington", "WV":"West Virginia","WI":"Wisconsin", "WY":"Wyoming"}
 
df["States_long"] = df.state_name.map(states)
df[["state_name", "States_long"]]
df.head()

Unnamed: 0.1,Unnamed: 0,county_fips,county_name,state_name,state_fips,date,deaths,lat,long,geometry,States_long
0,0,0,Statewide Unallocated,AL,1,2020-01-22,0,,,,Alabama
1,1,0,Statewide Unallocated,AL,1,2020-01-23,0,,,,Alabama
2,2,0,Statewide Unallocated,AL,1,2020-01-24,0,,,,Alabama
3,3,0,Statewide Unallocated,AL,1,2020-01-25,0,,,,Alabama
4,4,0,Statewide Unallocated,AL,1,2020-01-26,0,,,,Alabama


In [8]:
##reading the covid-19-state-level-data copy.csv file
df1 = pd.read_csv("covid-19-state-level-data copy.csv")
df1.tail()

Unnamed: 0.1,Unnamed: 0,date,state,fips,cases,deaths
8149,8149,2020-07-28,Virginia,51,86994,2095
8150,8150,2020-07-28,Washington,53,56576,1633
8151,8151,2020-07-28,West Virginia,54,6173,111
8152,8152,2020-07-28,Wisconsin,55,54114,916
8153,8153,2020-07-28,Wyoming,56,2589,26


In [9]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  8154 non-null   int64 
 1   date        8154 non-null   object
 2   state       8154 non-null   object
 3   fips        8154 non-null   int64 
 4   cases       8154 non-null   int64 
 5   deaths      8154 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 382.3+ KB


In [10]:
df1.isna().sum()

Unnamed: 0    0
date          0
state         0
fips          0
cases         0
deaths        0
dtype: int64

In [11]:
df1.nunique()

Unnamed: 0    8154
date           190
state           55
fips            55
cases         6097
deaths        2572
dtype: int64

In [12]:
##changing data types in the dataset
df1[["deaths","cases"]] = df1[["deaths","cases"]].apply(pd.to_numeric)
df1['date'] = pd.to_datetime(df['date'], format='%Y/%m/%d')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600660 entries, 0 to 600659
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Unnamed: 0   600660 non-null  int64         
 1   county_fips  600660 non-null  int64         
 2   county_name  600660 non-null  object        
 3   state_name   600660 non-null  object        
 4   state_fips   600660 non-null  int64         
 5   date         600660 non-null  datetime64[ns]
 6   deaths       600660 non-null  int64         
 7   lat          590696 non-null  float64       
 8   long         590696 non-null  float64       
 9   geometry     590696 non-null  object        
 10  States_long  600660 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(4), object(4)
memory usage: 50.4+ MB


In [13]:
import datacompy

compare = datacompy.Compare(
df,
df1,
join_columns='date',
abs_tol=0.0001,
rel_tol=0,
df1_name='county_level',
df2_name='state_level')
print(compare.report())

DataComPy Comparison
--------------------

DataFrame Summary
-----------------

      DataFrame  Columns    Rows
0  county_level       11  600660
1   state_level        6    8154

Column Summary
--------------

Number of columns in common: 3
Number of columns in county_level but not in state_level: 8
Number of columns in state_level but not in county_level: 3

Row Summary
-----------

Matched on: date
Any duplicates on match values: Yes
Absolute Tolerance: 0.0001
Relative Tolerance: 0
Number of rows in common: 8,154
Number of rows in county_level but not in state_level: 592,506
Number of rows in state_level but not in county_level: 0

Number of rows with some compared columns unequal: 7,512
Number of rows with all compared columns equal: 642

Column Comparison
-----------------

Number of columns compared with some values unequal: 1
Number of columns compared with all values equal: 2
Total number of values which compare unequal: 7,512

Columns with Unequal Values or Types
-------------

In [14]:
##preparing both the datasets for a t test to check if the data is similar to each other
df3=df.groupby(by=["state_name"]).sum()
df3.head()

df4=df1.groupby(by=["state"]).sum()
df4.head()

Unnamed: 0_level_0,unnamed: 0,fips,cases,deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,597881,138,2964684,75974
Alaska,598558,278,101302,1149
Arizona,606073,740,5441642,131475
Arkansas,599331,700,1456978,19575
California,606403,1116,18225066,491864


In [15]:
##conducting a t test to see if both the datasets are similar
##the common data attributes between both the datasets are deaths, we will compare both the deaths and check if 
##they are similar to each other
deaths1=pd.DataFrame(df3.deaths)
deaths2=pd.DataFrame(df4.deaths)
data1 = deaths1.to_numpy()
data2 = deaths2.to_numpy()

In [16]:
v1, v2 = (np.var(data1), np.var(data2))
print(v1, v2)
print("ratio of larger variance to smaller variance is:", v1/v2)


213218181170.12457 205709109853.36987
ratio of larger variance to smaller variance is: 1.0365033484521282


In [17]:
import scipy.stats as stats
#perform two sample t-test with equal variances
stats.ttest_ind(a=deaths1, b=deaths2, equal_var=True)


##Because the p-value (= 0.9) > alpha (= 0.05), 
##we reject the null hypothesis and can say that the datasets are similar to each other, 
##i.e., the sample means in both datasets are similar

Ttest_indResult(statistic=array([0.12081597]), pvalue=array([0.90406991]))

In [18]:
##calculating Case Fatality rate in the state level covid file
df1['Case_fatality_rate']=(df1['deaths']/df1['cases'])*100
display(df1)

Unnamed: 0,unnamed: 0,date,state,fips,cases,deaths,Case_fatality_rate
0,0,2020-01-22,Washington,53,1,0,0.000000
1,1,2020-01-23,Washington,53,1,0,0.000000
2,2,2020-01-24,Washington,53,1,0,0.000000
3,3,2020-01-25,Illinois,17,1,0,0.000000
4,4,2020-01-26,Washington,53,1,0,0.000000
...,...,...,...,...,...,...,...
8149,8149,2020-03-27,Virginia,51,86994,2095,2.408212
8150,8150,2020-03-28,Washington,53,56576,1633,2.886383
8151,8151,2020-03-29,West Virginia,54,6173,111,1.798153
8152,8152,2020-03-30,Wisconsin,55,54114,916,1.692723


In [29]:
##exporting the data to excel for visualisation in tableau
df.to_excel(r'C:\Users\Sharon\Documents\portfolio project\countylevelcoviddata.xlsx')
df1.to_excel(r'C:\Users\Sharon\Documents\portfolio project\Covidstateleveldata.xlsx')

In [20]:
## extracting monthly case data for 4 months
##month of March

print("top states and counties with deaths for the month of March")
mar = df[df['date'].dt.strftime('%Y-%m') == '2020-03']

state_mar=mar.groupby(['states_long']).agg({'deaths':'sum'})
largest3 = pd.DataFrame(state_mar["deaths"].nlargest(3)).reset_index()
largest3=largest3.drop(['deaths'], axis=1)
v=largest3.squeeze()
a=largest3.values.tolist()

from itertools import chain
l = list(chain(*a))
display(l)

mar= mar.groupby(['states_long','county_name']).agg({'deaths':'sum'})

mar1=pd.DataFrame(mar).reset_index()

mar2 = mar1[mar1['states_long'].isin(l)]
mar3=mar2.groupby(['states_long','county_name']).agg({'deaths':'sum'})

grouped = mar3.groupby(['states_long'])['deaths'].nlargest(3).sort_values

display(grouped)

top states and counties with deaths for the month of March


['New York', 'Washington', 'California']

<bound method Series.sort_values of states_long  states_long  county_name                       
California   California   Santa Clara County                     336
                          Los Angeles County                     266
                          Riverside County                       100
New York     New York     New York City Unallocated/Probable    1903
                          Queens County                         1636
                          Kings County                          1169
Washington   Washington   King County                           1788
                          Snohomish County                       263
                          Clark County                            61
Name: deaths, dtype: int64>

In [21]:
## extracting monthly case data for 4 months
##month of April

print("top states and counties with deaths for the month of April")
mar = df[df['date'].dt.strftime('%Y-%m') == '2020-04']

state_mar=mar.groupby(['states_long']).agg({'deaths':'sum'})
largest3 = pd.DataFrame(state_mar["deaths"].nlargest(3)).reset_index()
largest3=largest3.drop(['deaths'], axis=1)
v=largest3.squeeze()
a=largest3.values.tolist()

from itertools import chain
l = list(chain(*a))
display(l)

mar= mar.groupby(['states_long','county_name']).agg({'deaths':'sum'})

mar1=pd.DataFrame(mar).reset_index()

mar2 = mar1[mar1['states_long'].isin(l)]
mar3=mar2.groupby(['states_long','county_name']).agg({'deaths':'sum'})

grouped = mar3.groupby(['states_long'])['deaths'].nlargest(3).sort_values

display(grouped)

top states and counties with deaths for the month of April


['New York', 'New Jersey', 'Michigan']

<bound method Series.sort_values of states_long  states_long  county_name   
Michigan     Michigan     Wayne County      27930
                          Oakland County    11879
                          Macomb County      9481
New Jersey   New Jersey   Essex County      17920
                          Bergen County     17886
                          Hudson County     10778
New York     New York     Kings County      86067
                          Queens County     84238
                          Bronx County      58916
Name: deaths, dtype: int64>

In [22]:
## extracting monthly case data for 4 months
##month of May

print("top states and counties with deaths for the month of May")
mar = df[df['date'].dt.strftime('%Y-%m') == '2020-05']

state_mar=mar.groupby(['states_long']).agg({'deaths':'sum'})
largest3 = pd.DataFrame(state_mar["deaths"].nlargest(3)).reset_index()
largest3=largest3.drop(['deaths'], axis=1)
v=largest3.squeeze()
a=largest3.values.tolist()

from itertools import chain
l = list(chain(*a))
display(l)

mar= mar.groupby(['states_long','county_name']).agg({'deaths':'sum'})

mar1=pd.DataFrame(mar).reset_index()

mar2 = mar1[mar1['states_long'].isin(l)]
mar3=mar2.groupby(['states_long','county_name']).agg({'deaths':'sum'})

grouped = mar3.groupby(['states_long'])['deaths'].nlargest(3).sort_values

display(grouped)

top states and counties with deaths for the month of May


['New York', 'New Jersey', 'Massachusetts']

<bound method Series.sort_values of states_long    states_long    county_name     
Massachusetts  Massachusetts  Middlesex County     40968
                              Suffolk County       23303
                              Essex County         22590
New Jersey     New Jersey     Essex County         46204
                              Bergen County        44016
                              Hudson County        31944
New York       New York       Kings County        192331
                              Queens County       180926
                              Bronx County        128759
Name: deaths, dtype: int64>

In [23]:
## extracting monthly case data for 4 months
##month of June

print("top states and counties with deaths for the month of June")
mar = df[df['date'].dt.strftime('%Y-%m') == '2020-06']

state_mar=mar.groupby(['states_long']).agg({'deaths':'sum'})
largest3 = pd.DataFrame(state_mar["deaths"].nlargest(3)).reset_index()
largest3=largest3.drop(['deaths'], axis=1)
v=largest3.squeeze()
a=largest3.values.tolist()

from itertools import chain
l = list(chain(*a))
display(l)

mar= mar.groupby(['states_long','county_name']).agg({'deaths':'sum'})

mar1=pd.DataFrame(mar).reset_index()

mar2 = mar1[mar1['states_long'].isin(l)]
mar3=mar2.groupby(['states_long','county_name']).agg({'deaths':'sum'})

grouped = mar3.groupby(['states_long'])['deaths'].nlargest(3).sort_values

display(grouped)

top states and counties with deaths for the month of June


['New York', 'New Jersey', 'Massachusetts']

<bound method Series.sort_values of states_long    states_long    county_name     
Massachusetts  Massachusetts  Middlesex County     52919
                              Essex County         31351
                              Suffolk County       28638
New Jersey     New Jersey     Essex County         53135
                              Bergen County        50983
                              Hudson County        38048
New York       New York       Kings County        207704
                              Queens County       194043
                              Bronx County        137984
Name: deaths, dtype: int64>