In [1]:
%matplotlib notebook

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_rows', 500)

In [3]:
# Load the three raw inputs; all paths are relative to the notebook's working directory.
covid_df = pd.read_csv(
    os.path.join("COVID", "03-16-2020.csv")
)
world_pop_df = pd.read_csv(
    os.path.join('GDP', 'world_countries_gdp.csv')
)
median_age_df = pd.read_csv(
    os.path.join('Median Age', 'output', 'median_age.csv')
)

# Report how many distinct country names each source contains. The counts differ
# between sources, so the later joins will not line up one-to-one.
print(
    *(
        f"{label} Countries: {frame[column].nunique()}"
        for label, frame, column in (
            ("COVID-19", covid_df, 'Country/Region'),
            ("World Population", world_pop_df, 'country'),
            ("Median Age", median_age_df, 'Country'),
        )
    ),
    sep="\n",
)

COVID-19 Countries: 156
World Population Countries: 211
Median Age Countries: 259


In [4]:
# Keep only the first 10 characters of each 'Last Update' value
# (presumably the date part of a timestamp string -- confirm against the CSV).
# The vectorized .str accessor leaves missing values as NaN, whereas slicing
# each element by hand (x[0:10]) raises TypeError on a NaN float.
covid_df['Last Update'] = covid_df['Last Update'].str[:10]

In [5]:
# Collapse the COVID rows into one row per country and align the key column
# name ('Country') with the other tables.
grouped_covid_df = (
    covid_df
    .groupby('Country/Region')
    # numeric_only=True states explicitly that only numeric columns are summed.
    # pandas >= 2.0 no longer drops text columns silently, so without it the
    # string columns (e.g. 'Last Update') would be summed as well.
    .sum(numeric_only=True)
    # Coordinates added up across rows carry no meaning. The columns= keyword
    # replaces the positional axis argument, which pandas 2.0 removed.
    .drop(columns=['Latitude', 'Longitude'])
    # Rename countries to the spelling expected when merging on 'Country'.
    .rename(index={"US": "United States", "Taiwan*": "Taiwan"})
    .reset_index()
    .rename(columns={"Country/Region": "Country"})
)

In [6]:
# Build one row per country from the GDP/population table and rename its
# columns and key ('Country') so it can be merged with the COVID table.
grouped_world_pop_df = (
    world_pop_df
    .groupby('country')
    # numeric_only=True makes the numeric-only aggregation explicit; pandas >= 2.0
    # no longer discards non-numeric columns on its own.
    # NOTE(review): sum() turns an all-missing group into 0, which is probably why
    # some countries show GDP 0.0 next to a non-zero GDP per capita. Consider
    # min_count=1 if missing GDP should stay missing -- confirm against the CSV.
    .sum(numeric_only=True)
    # columns= replaces the positional axis argument removed in pandas 2.0.
    .drop(columns=['rank', 'unGDP'])
    .rename(
        index={"South Korea": "Korea, South", "Bahamas": "The Bahamas"},
        columns={"imfGDP": "GDP", "gdpPerCapita": "GDP Per Capita", "pop": "Population"},
    )
    # The group key becomes an ordinary column, then gets the shared key name.
    .reset_index()
    .rename(columns={"country": "Country"})
)

In [11]:
# Normalise the country names of the median-age table so they can be used as
# the merge key, working on a copy so the raw frame stays untouched.
clean_median_age_df = median_age_df.copy()
clean_median_age_df['Country'] = (
    clean_median_age_df['Country']
    # Lower-case only the standalone words "And" / "The". The \b word boundaries
    # keep the pattern from rewriting the start of a longer word (a plain
    # substring match turns "Andorra" into "andorra"). regex=True is explicit
    # because the default changed to literal matching in pandas 2.0.
    .str.replace(r'\bAnd\b', 'and', case=False, regex=True)
    .str.replace(r'\bThe\b', 'the', case=False, regex=True)
    # NOTE(review): the Democratic Republic of the Congo and the Republic of the
    # Congo are two different countries; this gives the former the latter's name.
    # Check which spelling the COVID and GDP tables use for each and adjust.
    .str.replace('Congo, Democratic Republic Of the', 'Republic of the Congo', case=False, regex=True)
    .str.replace('Bahamas, the', 'The Bahamas', case=False, regex=True)
)
clean_median_age_df = (
    clean_median_age_df
    .sort_values(by='Country', ascending=True)
    .reset_index(drop=True)
)

In [8]:
# Combine the COVID totals with the GDP/population figures. The outer join keeps
# countries that appear in only one of the two tables (their other columns are NaN).
# NOTE(review): an earlier draft multiplied 'Population' by 1000, so the column
# is presumably expressed in thousands -- confirm before using absolute numbers.
merged_df = pd.merge(
    grouped_covid_df,
    grouped_world_pop_df,
    on='Country',
    how='outer',
).reset_index(drop=True)

In [9]:
# Attach the median age to every country. This is again an outer join, so rows
# without a match in either table are kept with NaN in the missing columns.
# The result is ordered alphabetically by country with a fresh 0..n-1 index.
complete_df = (
    merged_df
    .merge(clean_median_age_df, on='Country', how='outer')
    .sort_values(by='Country', ascending=True)
    .reset_index(drop=True)
)

complete_df

Unnamed: 0,Country,Confirmed,Deaths,Recovered,GDP,GDP Per Capita,Population,Median Age
0,Afghanistan,21.0,0.0,1.0,20682000000.0,531.2838,38928.346,19.0
1,Akrotiri,,,,,,,
2,Albania,51.0,1.0,0.0,17210000000.0,5980.2689,2877.797,33.4
3,Algeria,54.0,4.0,12.0,193056000000.0,4402.5406,43851.044,28.3
4,American Samoa,,,,,,,26.1
5,Andorra,2.0,0.0,1.0,0.0,36994.8441,77.265,44.9
6,Angola,,,,96426000000.0,2933.8892,32866.272,15.9
7,Anguilla,,,,0.0,22496.8653,15.003,35.1
8,Antarctica,,,,,,,
9,Antigua and Barbuda,1.0,0.0,0.0,1809000000.0,18472.5669,97.929,32.2


In [10]:
# Save the combined table without the positional index column.
# The path is built with os.path.join, like the input paths in this notebook:
# the literal 'output\complete_data.csv' contains the invalid escape '\c' and
# only denotes a sub-directory on Windows.
complete_df.to_csv(os.path.join('output', 'complete_data.csv'), index=False)