In [1]:
%matplotlib notebook

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_rows', 500)

In [3]:
covid_df = pd.read_csv(os.path.join("COVID","03-16-2020.csv"))
world_pop_df = pd.read_csv(os.path.join('GDP', 'world_countries_gdp.csv'))
median_age_df = pd.read_csv(os.path.join('Median Age', 'output','median_age.csv'))

#number of countries in each csv
print(f"COVID-19 Countries: {covid_df['Country/Region'].nunique()}")
print(f"World Population Countries: {world_pop_df['country'].nunique()}")
print(f"Median Age Countries: {median_age_df['Country'].nunique()}")
#turn last update to datetime
#to_datetime method. exclude time. just keep date. use map - string slicing method

COVID-19 Countries: 156
World Population Countries: 211
Median Age Countries: 230


In [4]:
#get date only
#map takes in a function. So we use a temporary function (lambda)
covid_df['Last Update'] = covid_df['Last Update'].map(lambda x: x[0:10])

In [11]:
#clean data
grouped_covid_df = covid_df.groupby(['Country/Region']).sum()
grouped_covid_df.drop(['Latitude','Longitude'],1,inplace=True)
grouped_covid_df.rename(index = {"US":"United States","Taiwan*":"Taiwan"}, inplace=True)
grouped_covid_df.reset_index(inplace=True)
grouped_covid_df.rename(columns = {"Country/Region":"Country"}, inplace=True)
#grouped_covid_df.query('Country_Region.str.contains("Congo")',engine='python',inplace=True)

In [6]:
#clean data
grouped_world_pop_df = world_pop_df.groupby('country').sum()
grouped_world_pop_df.drop(['rank','unGDP'],1,inplace=True)
grouped_world_pop_df.rename(columns = {"imfGDP": "GDP","gdpPerCapita": "GDP Per Capita","pop":"Population"},inplace=True)
grouped_world_pop_df.index.name = 'Country'
grouped_world_pop_df.reset_index(inplace=True)

In [7]:
clean_median_age_df = median_age_df.copy()
#replace country column with string replace
clean_median_age_df['Country'] = clean_median_age_df['Country'].str.replace(pat='And',repl='and',case=False)
clean_median_age_df['Country'] = clean_median_age_df['Country'].str.replace(pat='The',repl='the',case=False)
clean_median_age_df['Country'] = clean_median_age_df['Country'].str.replace(pat='Congo, Democratic Republic Of the',repl='Republic of the Congo',case=False)
clean_median_age_df.sort_values(by='Country',ascending=True,inplace=True)
clean_median_age_df.reset_index(drop=True,inplace=True)

In [8]:
merged_df = grouped_covid_df.merge(grouped_world_pop_df,how='outer',on='Country')
merged_df.dropna(inplace=True)
merged_df.reset_index(drop=True,inplace=True)
merged_df['GDP Per Capita'] = merged_df['GDP Per Capita'].map('{:.0f}'.format)
merged_df['Population'] = (merged_df['Population'] * 1000).astype('int64')

In [9]:
complete_df = merged_df.merge(clean_median_age_df,how='outer',on='Country')
complete_df.sort_values(by='Country',ascending=True,inplace=True)
complete_df.reset_index(drop=True,inplace=True)
complete_df.dropna(inplace=True)
complete_df.reset_index(drop=True,inplace=True)
complete_df

Unnamed: 0,Country,Confirmed,Deaths,Recovered,GDP,GDP Per Capita,Population,Median Age
0,Afghanistan,21.0,0.0,1.0,20682000000.0,531,38928350.0,19.0
1,Albania,51.0,1.0,0.0,17210000000.0,5980,2877797.0,33.4
2,Algeria,54.0,4.0,12.0,193056000000.0,4403,43851040.0,28.3
3,Antigua and Barbuda,1.0,0.0,0.0,1809000000.0,18473,97929.0,32.2
4,Argentina,56.0,2.0,1.0,515353000000.0,11403,45195770.0,31.9
5,Armenia,52.0,0.0,0.0,13868000000.0,4680,2963243.0,35.6
6,Aruba,2.0,0.0,0.0,2952000000.0,27649,106766.0,39.5
7,Australia,377.0,3.0,23.0,1481460000000.0,58097,25499880.0,38.8
8,Austria,1018.0,3.0,6.0,481678000000.0,53482,9006398.0,44.2
9,Azerbaijan,15.0,1.0,6.0,47429000000.0,4678,10139180.0,31.7


In [10]:
complete_df.to_csv('output\complete_data.csv', index=False)