## Hypothesis: Warmer regions have lesser number of COVID-19 cases


### Comparing cases country-wise

In [64]:
import pandas as pd
import numpy as np
import datetime

In [65]:
new_cases = 'new_cases_per_million.csv'
total_cases = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/ecdc/total_cases_per_million.csv'

In [66]:
def get_newcases(csv_file):
    
    """Given a filename which has a comma separated (.csv) filetype,return a N dimensional 
    array (numpy.ndarray).
    
    We load the file as a pandas dataframe having no headers and load only the relevant columns 
    needed for analysis.
    
    :param csv_file: The csv file to load
    :return: a N dimensional array (numpy.ndarray)
    """
    data_file = pd.read_csv(csv_file,usecols=['date','Australia','United States','United States Virgin Islands']) # Importing only necessary columns from the dataset
    return data_file


In [67]:
covid_cases = get_newcases(new_cases)
covid_cases #time to process=?

Unnamed: 0,date,Australia,United States,United States Virgin Islands
0,2019-12-31,0.000,0.000,
1,2020-01-01,0.000,0.000,
2,2020-01-02,0.000,0.000,
3,2020-01-03,0.000,0.000,
4,2020-01-04,0.000,0.000,
...,...,...,...,...
117,2020-04-26,0.627,146.612,9.576
118,2020-04-27,0.392,81.138,0.000
119,2020-04-28,0.471,68.099,38.306
120,2020-04-29,0.510,72.906,0.000


In [68]:
covid_cases.rename({'date': 'Date'}, axis=1, inplace=True)
covid_cases

Unnamed: 0,Date,Australia,United States,United States Virgin Islands
0,2019-12-31,0.000,0.000,
1,2020-01-01,0.000,0.000,
2,2020-01-02,0.000,0.000,
3,2020-01-03,0.000,0.000,
4,2020-01-04,0.000,0.000,
...,...,...,...,...
117,2020-04-26,0.627,146.612,9.576
118,2020-04-27,0.392,81.138,0.000
119,2020-04-28,0.471,68.099,38.306
120,2020-04-29,0.510,72.906,0.000


In [69]:
def get_totalcases_cold(csv_file):
    
    """Given a filename which has a comma separated (.csv) filetype,return a N dimensional 
    array (numpy.ndarray).
    
    We load the file as a pandas dataframe having no headers and load only the relevant columns 
    needed for analysis.
    
    :param csv_file: The csv file to load
    :return: a N dimensional array (numpy.ndarray)
    """
    data_file = pd.read_csv(csv_file,usecols=['date','United States','United Kingdom','Italy','Spain','China']) # Importing only necessary columns from the dataset
    return data_file

In [70]:
colder_country_cases = get_totalcases_cold(total_cases)
colder_country_cases

Unnamed: 0,date,China,Italy,Spain,United Kingdom,United States
0,2019-12-31,0.019,0.000,0.000,0.000,0.000
1,2020-01-01,0.019,0.000,0.000,0.000,0.000
2,2020-01-02,0.019,0.000,0.000,0.000,0.000
3,2020-01-03,0.031,0.000,0.000,0.000,0.000
4,2020-01-04,0.031,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...
121,2020-04-30,58.322,3367.265,4564.988,2433.801,3141.694
122,2020-05-01,58.330,3398.227,4603.080,2522.655,3232.077
123,2020-05-02,58.332,3430.727,4632.296,2614.000,3334.659
124,2020-05-03,58.334,3462.151,4651.203,2684.795,3423.142


In [71]:
colder_country_cases.isna().sum()

date              0
China             0
Italy             0
Spain             1
United Kingdom    0
United States     0
dtype: int64

In [72]:
colder_country_cases['Spain'] = colder_country_cases['Spain'].fillna('0') #replace null values with zero
colder_country_cases.rename({'date': 'Date'}, axis=1, inplace=True)
colder_country_cases = colder_country_cases[colder_country_cases['Date'] > '2019-12-31']
colder_country_cases

Unnamed: 0,Date,China,Italy,Spain,United Kingdom,United States
1,2020-01-01,0.019,0.000,0,0.000,0.000
2,2020-01-02,0.019,0.000,0,0.000,0.000
3,2020-01-03,0.031,0.000,0,0.000,0.000
4,2020-01-04,0.031,0.000,0,0.000,0.000
5,2020-01-05,0.041,0.000,0,0.000,0.000
...,...,...,...,...,...,...
121,2020-04-30,58.322,3367.265,4564.99,2433.801,3141.694
122,2020-05-01,58.330,3398.227,4603.08,2522.655,3232.077
123,2020-05-02,58.332,3430.727,4632.3,2614.000,3334.659
124,2020-05-03,58.334,3462.151,4651.2,2684.795,3423.142


In [86]:
colder_country_cases['Date'] =  pd.to_datetime(colder_country_cases['Date'])
colder_country_cases["Spain"] = colder_country_cases["Spain"].astype(str).astype(float)
colder_country_cases.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Date              datetime64[ns]
China                    float64
Italy                    float64
Spain                    float64
United Kingdom           float64
United States            float64
dtype: object

In [73]:
def get_totalcases_hot(csv_file):
    
    """Given a filename which has a comma separated (.csv) filetype,return a N dimensional 
    array (numpy.ndarray).
    
    We load the file as a pandas dataframe having no headers and load only the relevant columns 
    needed for analysis.
    
    :param csv_file: The csv file to load
    :return: a N dimensional array (numpy.ndarray)
    """
    data_file = pd.read_csv(csv_file,usecols=['date','Australia','Brazil','Chile','South Africa','New Zealand']) # Importing only necessary columns from the dataset
    return data_file

In [74]:
warmer_country_cases = get_totalcases_hot(total_cases)
warmer_country_cases

Unnamed: 0,date,Australia,Brazil,Chile,New Zealand,South Africa
0,2019-12-31,0.000,0.000,,0.000,
1,2020-01-01,0.000,0.000,,0.000,
2,2020-01-02,0.000,0.000,,0.000,
3,2020-01-03,0.000,0.000,,0.000,
4,2020-01-04,0.000,0.000,,0.000,
...,...,...,...,...,...,...
121,2020-04-30,264.550,367.718,778.659,234.124,90.206
122,2020-05-01,265.178,401.676,838.189,234.746,95.214
123,2020-05-02,265.374,430.887,889.716,235.161,100.339
124,2020-05-03,266.001,454.268,964.365,235.576,106.831


In [75]:
warmer_country_cases.isna().sum()

date             0
Australia        0
Brazil           0
Chile           65
New Zealand      9
South Africa    68
dtype: int64

In [76]:
warmer_country_cases['Chile']=warmer_country_cases['Chile'].fillna('0')
warmer_country_cases['New Zealand']=warmer_country_cases['New Zealand'].fillna('0')
warmer_country_cases['South Africa']=warmer_country_cases['South Africa'].fillna('0')
warmer_country_cases.rename({'date': 'Date'}, axis=1, inplace=True)
warmer_country_cases = warmer_country_cases[warmer_country_cases['Date'] > '2019-12-31']
warmer_country_cases

Unnamed: 0,Date,Australia,Brazil,Chile,New Zealand,South Africa
1,2020-01-01,0.000,0.000,0,0,0
2,2020-01-02,0.000,0.000,0,0,0
3,2020-01-03,0.000,0.000,0,0,0
4,2020-01-04,0.000,0.000,0,0,0
5,2020-01-05,0.000,0.000,0,0,0
...,...,...,...,...,...,...
121,2020-04-30,264.550,367.718,778.659,234.124,90.206
122,2020-05-01,265.178,401.676,838.189,234.746,95.214
123,2020-05-02,265.374,430.887,889.716,235.161,100.339
124,2020-05-03,266.001,454.268,964.365,235.576,106.831


In [77]:
warmer_country_cases.dtypes


Date             object
Australia       float64
Brazil          float64
Chile            object
New Zealand      object
South Africa     object
dtype: object

In [82]:
warmer_country_cases['Date'] =  pd.to_datetime(warmer_country_cases['Date'])
warmer_country_cases["Chile"] = warmer_country_cases["Chile"].astype(str).astype(float)
warmer_country_cases["New Zealand"] = warmer_country_cases["New Zealand"].astype(str).astype(float)
warmer_country_cases["South Africa"] = warmer_country_cases["South Africa"].astype(str).astype(float)
warmer_country_cases.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

Date            datetime64[ns]
Australia              float64
Brazil                 float64
Chile                  float64
New Zealand            float64
South Africa           float64
dtype: object

In [92]:
def groupby_week(csvfile):
    """Given a filename which has a comma separated (.csv) filetype,return a N dimensional 
    array (numpy.ndarray).
    
    We load the file as a pandas dataframe having no headers and load only the relevant columns 
    needed for analysis.
    
    :param csv_file: The csv file to load
    :return: a N dimensional array (numpy.ndarray)
    """
    databy_week = csvfile.groupby(csvfile['Date'].dt.week).sum()
    return databy_week

In [95]:
cases_perweek_warm = groupby_week(warmer_country_cases)
cases_perweek_warm


Unnamed: 0_level_0,Australia,Brazil,Chile,New Zealand,South Africa
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.196,0.0,0.0,0.0,0.0
5,1.765,0.0,0.0,0.0,0.0
6,3.49,0.0,0.0,0.0,0.0
7,3.96,0.0,0.0,0.0,0.0
8,4.706,0.0,0.0,0.0,0.0
9,6.393,0.029,0.0,0.621,0.0
10,13.765,0.201,0.68,3.11,0.051


In [96]:
cases_perweek_cold = groupby_week(colder_country_cases)
cases_perweek_cold

Unnamed: 0_level_0,China,Italy,Spain,United Kingdom,United States
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.141,0.0,0.0,0.0,0.0
2,0.287,0.0,0.0,0.0,0.0
3,0.414,0.0,0.0,0.0,0.0
4,4.142,0.0,0.0,0.0,0.024
5,39.569,0.15,0.042,0.087,0.123
6,134.154,0.35,0.147,0.263,0.243
7,268.624,0.35,0.301,0.827,0.291
8,361.949,1.838,0.301,0.931,0.44
9,382.091,62.007,3.188,1.604,1.192
10,391.518,391.866,34.499,10.251,4.471
