In [1]:
import pandas as pd
import scipy as scp
import statistics as st
import numpy as np


In [2]:
# Load the COVID-19 dataset from the local CSV file and preview the first rows.
Covid=pd.read_csv("Corona_Updated.csv")
Covid.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,Temprature,Humidity,Temp_Cat,Humid_Cat
0,Hubei,Mainland China,2020-03-10T15:13:05,67760,3024,47743,30.9756,112.2707,12.5,86,1,1
1,,Italy,2020-03-10T17:53:02,10149,631,724,43.0,12.0,12.9,64,1,1
2,,Iran (Islamic Republic of),2020-03-10T19:13:20,8042,291,2731,32.0,53.0,11.9,9,0,0
3,,Republic of Korea,2020-03-10T19:13:20,7513,54,247,36.0,128.0,4.9,41,0,0
4,,France,2020-03-10T18:53:02,1784,33,12,47.0,2.0,11.9,93,0,0


## Problem Statement

In [3]:
# Objective:

# A common perception about COVID-19 is that Warm Climate is more resistant to the corona outbreak 
# and we need to verify this using Hypothesis Testing. 


#### Null and Alternate Hypothesis

In [4]:
# Based on the objective, the hypotheses are:
# Null Hypothesis (H0): Temperature does not affect the COVID outbreak
# Alternate Hypothesis (HA): Temperature does affect the COVID outbreak

#### Data Preparation

In [5]:
#Note: We are considering Temperature below 24 as cold climate (Temp_Cat = 0) and 24 or above as hot climate (Temp_Cat = 1) in our dataset

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
Covid.describe()
# Optional missing-value check, left disabled:
# Covid.isnull().sum()

Unnamed: 0,Confirmed,Deaths,Recovered,Latitude,Longitude,Temprature,Humidity,Temp_Cat,Humid_Cat
count,206.0,206.0,206.0,206.0,206.0,206.0,206.0,206.0,206.0
mean,575.640777,20.68932,312.640777,31.184989,11.75203,12.161165,67.728155,0.470874,0.470874
std,4822.697784,215.794845,3332.764713,21.305149,84.576291,10.229763,21.780588,0.500367,0.500367
min,0.0,0.0,0.0,-41.4545,-157.4983,-21.9,6.0,0.0,0.0
25%,3.0,0.0,0.0,25.0692,-74.841325,6.1,55.0,0.0,0.0
50%,12.0,0.0,0.0,36.03055,15.23425,11.75,73.0,0.0,0.0
75%,75.75,1.0,4.0,43.87025,101.363375,20.375,84.0,1.0,1.0
max,67760.0,3024.0,47743.0,64.9631,174.886,33.1,98.0,1.0,1.0


In [7]:
# Recompute the temperature category from the raw reading, overwriting the Temp_Cat
# column loaded from the CSV: 0 = cold (below the threshold), 1 = hot (threshold or above).
# A vectorised comparison replaces the per-row lambda. Negating `< threshold` (instead of
# testing `>= threshold`) keeps the original handling of missing readings, which end up in
# category 1 because `NaN < threshold` is False.
HOT_TEMP_THRESHOLD = 24
Covid['Temp_Cat'] = (~Covid['Temprature'].lt(HOT_TEMP_THRESHOLD)).astype('int64')

In [8]:
# Non-null counts per column, first for hot regions (Temp_Cat == 1),
# then for cold regions (Temp_Cat == 0).
print(Covid[Covid['Temp_Cat'].eq(1)].count())
print(Covid[Covid['Temp_Cat'].eq(0)].count())


Province/State     4
Country/Region    31
Last Update       31
Confirmed         31
Deaths            31
Recovered         31
Latitude          31
Longitude         31
Temprature        31
Humidity          31
Temp_Cat          31
Humid_Cat         31
dtype: int64
Province/State     95
Country/Region    175
Last Update       175
Confirmed         175
Deaths            175
Recovered         175
Latitude          175
Longitude         175
Temprature        175
Humidity          175
Temp_Cat          175
Humid_Cat         175
dtype: int64


In [9]:
# Keep only the two columns needed for the test: confirmed cases and temperature category.
Covid_t = Covid.loc[:, ['Confirmed', 'Temp_Cat']]

In [10]:
# Display the two-column frame (confirmed cases and temperature category).
Covid_t

Unnamed: 0,Confirmed,Temp_Cat
0,67760,0
1,10149,0
2,8042,0
3,7513,0
4,1784,0
...,...,...
201,0,0
202,0,0
203,0,0
204,0,0


### Implementing the Two-Sample Z test for the dataset.

In [12]:
# Confirmed-case counts split by temperature category, each stored as its own Series:
#   d1 -> hot regions  (Temp_Cat == 1)
#   d2 -> cold regions (Temp_Cat == 0)
d1 = Covid_t.loc[Covid_t['Temp_Cat'] == 1, 'Confirmed']
d2 = Covid_t.loc[Covid_t['Temp_Cat'] == 0, 'Confirmed']

In [13]:
# Number of non-null confirmed-case values in the hot (d1) and cold (d2) groups.
print(d1.count(),d2.count())


31 175


##### Sample Mean 1 and Sample Mean 2 

In [14]:
# Sample means of confirmed cases: m1 for hot regions, m2 for cold regions.
m1 = d1.mean()
m2 = d2.mean()

In [15]:
# Mean confirmed cases: hot regions (m1) vs. cold regions (m2).
print(m1,m2)

26.548387096774192 672.9085714285715


##### Standard Deviation 1 and Standard Deviation 2

In [16]:
# Standard deviations of confirmed cases: sd1 for hot regions, sd2 for cold regions.
# Series.std() uses its default settings here (sample standard deviation, ddof=1).
sd1 = d1.std()
sd2 = d2.std()

print(sd1, sd2)

##### Sample Size 1 and Sample Size 2

In [18]:
# Sample sizes: number of confirmed-case observations in the hot (n1) and cold (n2) groups.
n1 = len(d1)
n2 = len(d2)

In [19]:
# Sample sizes of the hot (n1) and cold (n2) groups.
print(n1,n2)

31 175


##### Population mean 1 and 2

In [67]:
# Hypothesised difference between the two population means: mudiff = mu1 - mu2.
# Under the null hypothesis of a two-sample Z test the population means are equal,
# so the hypothesised difference is 0.
mudiff=0

#### Hypothesis Test using Two Sample Z-Test

In [85]:
def TwoSampleZTest(X1,X2,mudiff,sigma1,sigma2,N1,N2):
    """Two-sided two-sample Z test for a difference between two means.

    Parameters
    ----------
    X1, X2 : mean of group 1 and of group 2.
    mudiff : difference between the population means (mu1 - mu2) assumed
        under the null hypothesis.
    sigma1, sigma2 : standard deviation of group 1 and of group 2.
    N1, N2 : number of observations in group 1 and in group 2.

    Returns
    -------
    (z, pval) : the Z statistic and its two-sided p-value.
    """
    # Imported locally so the function does not rely on a notebook-level import.
    from scipy.stats import norm

    std_error = np.sqrt(sigma1**2/N1 + sigma2**2/N2)
    z = ((X1 - X2) - mudiff) / std_error
    # Use the survival function rather than 1 - norm.cdf(|z|): for large |z| the cdf
    # rounds to exactly 1.0, so the subtraction would return a p-value of 0.
    # Doubled because the test is two-tailed.
    pval = 2 * norm.sf(abs(z))
    return z, pval

# norm.sf (scipy.stats) is used instead of 1 - norm.cdf for better precision in the tail.

In [88]:
# Run the test on the hot (group 1) vs. cold (group 2) summary statistics,
# then round the statistic and p-value for display.
Z, P = TwoSampleZTest(X1=m1, X2=m2, mudiff=mudiff, sigma1=sd1, sigma2=sd2, N1=n1, N2=n2)

Z_Score = np.round(Z, decimals=8)
P_Value = np.round(P, decimals=6)

In [89]:
# Decide at the 5% significance level. The unrounded p-value (P) is compared instead of
# the 6-decimal display value (P_Value), so rounding cannot flip the decision for
# p-values just below 0.05 (e.g. 0.0499996 would round up to 0.05).
if P < 0.05:
    Hypothesis_status = 'Reject Null Hypothesis: Significant'
else:
    Hypothesis_status = 'Do not Reject Null Hypothesis: Not Significant'


#### Result

In [90]:

# Report the rounded p-value and the resulting decision on the null hypothesis.
print(P_Value)
print(Hypothesis_status)

0.102054
Do not Reject Null Hypothesis: Not Significant


In [91]:
# Changing the p-value formula: replacing 1 - norm.cdf(...) with norm.sf(...) (both from scipy.stats)

In [98]:
def TwoSampleZTest2(X1,X2,mudiff,sigma1,sigma2,N1,N2):
    """Two-sided two-sample Z test, with the p-value taken from the survival function.

    Parameters
    ----------
    X1, X2 : mean of group 1 and of group 2.
    mudiff : difference between the population means (mu1 - mu2) assumed
        under the null hypothesis.
    sigma1, sigma2 : standard deviation of group 1 and of group 2.
    N1, N2 : number of observations in group 1 and in group 2.

    Returns
    -------
    (z, p_value) : the Z statistic and its two-sided p-value.
    """
    from scipy.stats import norm

    # Standard error of the difference between the two means.
    variance_of_difference = sigma1**2/N1 + sigma2**2/N2
    standard_error = np.sqrt(variance_of_difference)

    # How far the observed difference lies from the hypothesised one, in standard errors.
    observed_gap = (X1 - X2) - mudiff
    z = observed_gap / standard_error

    # Upper-tail probability of |z|, doubled for the two-tailed test.
    upper_tail = norm.sf(abs(z))
    return z, 2*upper_tail



In [99]:
# Re-run the test with the survival-function variant on the same summary statistics,
# then round the statistic and p-value for display.
Z, P = TwoSampleZTest2(X1=m1, X2=m2, mudiff=mudiff, sigma1=sd1, sigma2=sd2, N1=n1, N2=n2)

Z_Score = np.round(Z, decimals=8)
P_Value = np.round(P, decimals=6)

In [100]:
# Decide at the 5% significance level. The unrounded p-value (P) is compared instead of
# the 6-decimal display value (P_Value), so rounding cannot flip the decision for
# p-values just below 0.05 (e.g. 0.0499996 would round up to 0.05).
if P < 0.05:
    Hypothesis_status = 'Reject Null Hypothesis: Significant'
else:
    Hypothesis_status = 'Do not Reject Null Hypothesis: Not Significant'


In [101]:

# Report the rounded p-value and the resulting decision on the null hypothesis.
print(P_Value)
print(Hypothesis_status)

0.102054
Do not Reject Null Hypothesis: Not Significant
