# Source

https://data.humdata.org/dataset/ebola-cases-2014

# Libraries

In [1]:
# to get web contents
import requests 
# scrape and clean web contents
from bs4 import BeautifulSoup

# numerical operations
import numpy as np
# storing and processing in a dataframe
import pandas as pd

# Data

In [2]:
# Load the raw indicator table; the 'Date' column is parsed into datetimes on read
df = pd.read_csv(
    'ebola_2014_2016_raw.csv',
    parse_dates=['Date'],
)
# Preview the top of the frame
df.head()

Unnamed: 0,Indicator,Country,Date,value
0,"Cumulative number of confirmed, probable and s...",Guinea,2015-03-10,3285.0
1,Cumulative number of confirmed Ebola cases,Guinea,2015-03-10,2871.0
2,Cumulative number of probable Ebola cases,Guinea,2015-03-10,392.0
3,Cumulative number of suspected Ebola cases,Guinea,2015-03-10,22.0
4,"Cumulative number of confirmed, probable and s...",Guinea,2015-03-10,2170.0


In [3]:
# Rows belonging to the most recent reporting date.
# Series.max() is vectorised and skips missing dates (NaT) by default, whereas
# the built-in max() walks the column element by element in Python and gives an
# order-dependent answer when NaT is present (every comparison with NaT is False).
latest_data = df[df['Date'] == df['Date'].max()]

In [4]:
# Dimensions of the raw frame as a (rows, columns) tuple
df.shape

(17585, 4)

In [5]:
# List every distinct label found in the 'Indicator' column, one per line,
# in order of first appearance
indicator_names = df['Indicator'].unique()
for name in indicator_names:
    print(name)

Cumulative number of confirmed, probable and suspected Ebola cases
Cumulative number of confirmed Ebola cases
Cumulative number of probable Ebola cases
Cumulative number of suspected Ebola cases
Cumulative number of confirmed, probable and suspected Ebola deaths
Cumulative number of confirmed Ebola deaths
Cumulative number of probable Ebola deaths
Cumulative number of suspected Ebola deaths
Number of confirmed Ebola cases in the last 21 days
Number of confirmed, probable and suspected Ebola cases in the last 21 days
Number of probable Ebola cases in the last 21 days
Number of confirmed Ebola cases in the last 7 days
Number of probable Ebola cases in the last 7 days
Number of suspected Ebola cases in the last 7 days
Number of confirmed, probable and suspected Ebola cases in the last 7 days
Proportion of confirmed Ebola cases that are from the last 7 days
Proportion of probable Ebola cases that are from the last 7 days
Proportion of suspected Ebola cases that are from the last 7 days
Pro

In [6]:
# Number of rows recorded for each distinct indicator, most frequent first
df['Indicator'].value_counts()

Cumulative number of confirmed, probable and suspected Ebola deaths                            2485
Cumulative number of confirmed Ebola cases                                                     2484
Cumulative number of confirmed, probable and suspected Ebola cases                             2477
Cumulative number of probable Ebola cases                                                      2436
Cumulative number of suspected Ebola cases                                                     2366
Cumulative number of confirmed Ebola deaths                                                    1648
Cumulative number of probable Ebola deaths                                                     1526
Cumulative number of suspected Ebola deaths                                                    1308
Number of confirmed, probable and suspected Ebola cases in the last 21 days                     190
Number of confirmed Ebola cases in the last 21 days                                             190


# Data Cleaning

In [7]:
# Keep only the two cumulative "confirmed" indicators (cases and deaths);
# every other indicator row is discarded and the row index is renumbered from 0.
is_confirmed_cases = df['Indicator'] == 'Cumulative number of confirmed Ebola cases'
is_confirmed_deaths = df['Indicator'] == 'Cumulative number of confirmed Ebola deaths'
df = df[is_confirmed_cases | is_confirmed_deaths].reset_index(drop=True)
# Preview the filtered frame
df.head()

Unnamed: 0,Indicator,Country,Date,value
0,Cumulative number of confirmed Ebola cases,Guinea,2015-03-10,2871.0
1,Cumulative number of confirmed Ebola deaths,Guinea,2015-03-10,1778.0
2,Cumulative number of confirmed Ebola cases,Liberia,2015-03-10,3150.0
3,Cumulative number of confirmed Ebola cases,Sierra Leone,2015-03-10,8428.0
4,Cumulative number of confirmed Ebola deaths,Sierra Leone,2015-03-10,3263.0


In [8]:
# Reshape from long to wide: one row per (Country, Date), one column per indicator.
# NOTE: pivot_table aggregates with the mean by default, so repeated
# (Country, Date, Indicator) rows would be averaged silently.
final_df = df.pivot_table(values='value', index=['Country', 'Date'], columns=['Indicator'])
# Turn the (Country, Date) index levels back into ordinary columns
final_df = final_df.reset_index()
# Order chronologically and renumber the rows
final_df = final_df.sort_values('Date').reset_index(drop=True)
# Drop the leftover 'Indicator' name from the columns axis.
# rename_axis returns a new frame and targets the row index by default, so the
# result has to be assigned back and the columns axis named explicitly.
final_df = final_df.rename_axis(None, axis='columns')
# Preview the wide frame
final_df.head()

Indicator,Country,Date,Cumulative number of confirmed Ebola cases,Cumulative number of confirmed Ebola deaths
0,Guinea,2014-08-29,482.0,287.0
1,Nigeria,2014-08-29,15.0,6.0
2,Sierra Leone,2014-08-29,935.0,380.0
3,Liberia,2014-08-29,322.0,225.0
4,Guinea,2014-09-05,604.0,362.0


In [9]:
# Inspect the column labels produced by the pivot
final_df.columns

Index(['Country', 'Date', 'Cumulative number of confirmed Ebola cases',
       'Cumulative number of confirmed Ebola deaths'],
      dtype='object', name='Indicator')

In [10]:
# Ordered mapping of the pivoted column labels to the short labels used in the
# cleaned output; its key order also fixes the column order.
short_labels = {
    'Country': 'Country',
    'Date': 'Date',
    'Cumulative number of confirmed Ebola cases': 'No. of confirmed cases',
    'Cumulative number of confirmed Ebola deaths': 'No. of confirmed deaths',
}
# Select the columns in that order ...
final_df = final_df[list(short_labels)]
# ... then swap in the short labels
final_df.columns = list(short_labels.values())
# Preview the relabelled frame
final_df.head()

Unnamed: 0,Country,Date,No. of confirmed cases,No. of confirmed deaths
0,Guinea,2014-08-29,482.0,287.0
1,Nigeria,2014-08-29,15.0,6.0
2,Sierra Leone,2014-08-29,935.0,380.0
3,Liberia,2014-08-29,322.0,225.0
4,Guinea,2014-09-05,604.0,362.0


In [11]:
# Row count per country label, ordered alphabetically by label
final_df['Country'].value_counts().sort_index()

Guinea                      259
Guinea 2                      1
Italy                       141
Liberia                     258
Liberia 2                   105
Mali                        243
Nigeria                     255
Senegal                     254
Sierra Leone                259
Spain                       243
United Kingdom              221
United States of America    245
Name: Country, dtype: int64

In [12]:
# Number of rows per reporting date, showing the ten most recent dates (newest first)
rows_per_date = final_df['Date'].value_counts()
rows_per_date.sort_index(ascending=False)[:10]

2016-03-23    12
2015-12-29    11
2015-12-23    11
2015-12-22    11
2015-12-17    11
2015-12-16    11
2015-12-15    11
2015-12-11    11
2015-12-10    11
2015-12-09    11
Name: Date, dtype: int64

In [13]:
# Fold the numbered country labels back into their parent country.
# 'Guinea 2' must become 'Guinea'; it was previously rewritten to 'Liberia',
# which moved a Guinea record into Liberia's series.
# NOTE(review): the row counts per date suggest both the plain and the numbered
# labels report on 2016-03-23, so after this merge a country can have more than
# one row for the same Date - confirm whether those rows should be combined.
final_df['Country'] = final_df['Country'].replace(
    {r'Liberia 2': 'Liberia', r'Guinea 2': 'Guinea'},
    regex=True,
)
# Preview the frame with normalised country names
final_df.head()

Unnamed: 0,Country,Date,No. of confirmed cases,No. of confirmed deaths
0,Guinea,2014-08-29,482.0,287.0
1,Nigeria,2014-08-29,15.0,6.0
2,Sierra Leone,2014-08-29,935.0,380.0
3,Liberia,2014-08-29,322.0,225.0
4,Guinea,2014-09-05,604.0,362.0


# Save as csv file

In [14]:
# Write the cleaned frame to a CSV file, leaving out the integer row index
final_df.to_csv('ebola_2014_2016_clean.csv', index=False)

In [15]:
# latest_data.pivot_table(values='value', index=['Country', 'Date'], columns=['Indicator'])