In [59]:
import os
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook
# only runs on a machine that has this exact directory. Later cells open data
# files by bare filename and therefore rely on this working directory.
# Consider a configurable DATA_DIR (pathlib.Path) instead of chdir.
os.chdir('C:\\Users\\M246047\\Documents\\Python')
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
%matplotlib inline

from PIL import Image
from matplotlib import image
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [60]:
# Load the raw outbreak records. read_csv already returns a DataFrame, so no
# additional pd.DataFrame(...) wrapper is needed.
outbreaks = pd.read_csv('nigeria_outbreaks.csv')
print(outbreaks.columns)
# DataFrame.info() writes its report to stdout and returns None; wrapping it in
# print() only appended a stray "None" line after the report.
outbreaks.info()

# Drop the row identifier and the raw 'gender' / 'settlement' / 'age_str'
# columns. Reassign instead of using inplace=True so the cell stays a plain
# "input -> output" step that can be re-run safely.
# NOTE(review): the dropped text columns appear to be duplicated by the
# gender_*, *_settlement and age columns -- confirm before relying on that.
outbreaks = outbreaks.drop(columns=['id', 'gender', 'settlement', 'age_str'])
outbreaks.columns

Index(['id', 'surname', 'firstname', 'middlename', 'gender', 'gender_male',
       'gender_female', 'state', 'settlement', 'rural_settlement',
       'urban_settlement', 'report_date', 'report_year', 'age', 'age_str',
       'date_of_birth', 'child_group', 'adult_group', 'disease', 'cholera',
       'diarrhoea', 'measles', 'viral_haemmorrhaphic_fever', 'meningitis',
       'ebola', 'marburg_virus', 'yellow_fever', 'rubella_mars', 'malaria',
       'serotype', 'NmA', 'NmC', 'NmW', 'health_status', 'alive', 'dead',
       'report_outcome', 'unconfirmed', 'confirmed', 'null_serotype'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284484 entries, 0 to 284483
Data columns (total 40 columns):
id                            284484 non-null int64
surname                       284484 non-null object
firstname                     284484 non-null object
middlename                    284484 non-null object
gender                        284484 non-null object
gender_male  

Index(['surname', 'firstname', 'middlename', 'gender_male', 'gender_female',
       'state', 'rural_settlement', 'urban_settlement', 'report_date',
       'report_year', 'age', 'date_of_birth', 'child_group', 'adult_group',
       'disease', 'cholera', 'diarrhoea', 'measles',
       'viral_haemmorrhaphic_fever', 'meningitis', 'ebola', 'marburg_virus',
       'yellow_fever', 'rubella_mars', 'malaria', 'serotype', 'NmA', 'NmC',
       'NmW', 'health_status', 'alive', 'dead', 'report_outcome',
       'unconfirmed', 'confirmed', 'null_serotype'],
      dtype='object')

# The Data

[Nigerian Disease Outbreaks](https://www.kaggle.com/eiodelami/disease-outbreaks-in-nigeria-datasets)

At first glance there appear to be no nulls, but inspecting the data shows that missing values have been filled with the string 'Null' and the placeholder date '0000-00-00'. I'll replace each of these cells with NaN. Additionally, I'll convert all strings to lowercase so that the same value is not counted as several distinct categories because of differences in capitalization.

After these changes, I used `DataFrame.info()` and `Series.value_counts()` to check that all variables are sound - there are no duplicated categories caused by misspellings or leftover null placeholders. The columns report_date, date_of_birth, and serotype are missing values. Additionally, serotype has one entry of 'null serotype'. I will decide how to handle these as I move along.



In [61]:
# Inspect the storage dtype of the report_date column.
outbreaks['report_date'].dtype

dtype('O')

In [62]:
# Sentinel strings the raw file uses in place of real missing values.
MISSING_MARKERS = ['Null', '0000-00-00']


def _lower_if_str(value):
    """Return ``value`` lowercased when it is a str; otherwise return it unchanged."""
    return value.lower() if isinstance(value, str) else value


# Replace both sentinels with NaN in one vectorized pass. The previous version
# ran two element-wise applymap passes (one Python call per cell each), which is
# slow and memory-hungry on a frame of this size.
outbreaks = outbreaks.replace(MISSING_MARKERS, np.nan)

# Lowercase every string value. Numeric columns cannot hold str values, so only
# the remaining columns are visited; non-string entries (e.g. NaN) pass through
# untouched. `outbreaks` is a fresh frame returned by replace(), so assigning
# columns here does not modify the frame produced by earlier cells.
for column in outbreaks.select_dtypes(exclude='number').columns:
    outbreaks[column] = outbreaks[column].map(_lower_if_str)

# Show every column when displaying the (wide) frame.
pd.options.display.max_columns = None
outbreaks.head(30)

Unnamed: 0,surname,firstname,middlename,gender_male,gender_female,state,rural_settlement,urban_settlement,report_date,report_year,age,date_of_birth,child_group,adult_group,disease,cholera,diarrhoea,measles,viral_haemmorrhaphic_fever,meningitis,ebola,marburg_virus,yellow_fever,rubella_mars,malaria,serotype,NmA,NmC,NmW,health_status,alive,dead,report_outcome,unconfirmed,confirmed,null_serotype
0,solade,grace,solape,0,1,rivers,1,0,5/15/2018,2018,32,1/17/1986,0,1,cholera,1,0,0,0,0,0,0,0,0,0,null serotype,0,0,0,alive,1,0,confirmed,0,1,1
1,eneche,kure,balogun,1,0,ebonyi,1,0,2/28/2017,2017,32,9/1/1985,0,1,marburg virus,0,0,0,0,0,0,1,0,0,0,,0,0,1,alive,1,0,confirmed,1,0,1
2,sanusi,adaugo,katerine,0,1,ogun,0,1,3/2/2012,2012,24,5/1/1988,0,1,marburg virus,0,0,0,0,0,0,1,0,0,0,,0,0,1,dead,0,1,not confirmed,1,0,1
3,sowore,mooslemat,ifedayo,0,1,ondo,1,0,5/21/2010,2010,63,5/24/1947,0,1,measles,0,0,1,0,0,0,0,0,0,0,,0,0,1,alive,1,0,not confirmed,1,0,1
4,abdusalam,yusuf,okafor,1,0,oyo,0,1,8/28/2017,2017,9,1/7/2008,1,0,rubella mars,0,0,0,0,0,0,0,0,1,0,,0,0,1,alive,1,0,confirmed,1,0,1
5,yakubu,janet,chioma,0,1,kaduna,1,0,6/27/2012,2012,44,6/6/1968,0,1,ebola,0,0,0,0,0,1,0,0,0,0,,0,0,1,alive,1,0,confirmed,1,0,1
6,razak,adaugo,adaobi,0,1,taraba,1,0,4/1/2010,2010,61,7/19/1949,0,1,yellow fever,0,0,0,0,0,0,0,1,0,0,,0,0,1,alive,1,0,confirmed,0,1,1
7,annakyi,danmbazzu,osagie,1,0,katsina,1,0,10/4/2015,2015,2,12/18/2014,1,0,diarrhoea,0,1,0,0,0,0,0,0,0,0,,0,0,1,alive,1,0,not confirmed,1,0,1
8,adejoro,iyin,osatimehin,1,0,katsina,1,0,11/14/2011,2011,54,8/5/1957,0,1,rubella mars,0,0,0,0,0,0,0,0,1,0,,0,0,1,alive,1,0,confirmed,1,0,1
9,okorie,adaugo,chika,0,1,osun,0,1,6/17/2014,2014,15,10/19/1999,1,0,marburg virus,0,0,0,0,0,0,1,0,0,0,,0,0,1,alive,1,0,confirmed,1,0,1


In [63]:
# Summary of dtypes and non-null counts for the cleaned frame.
outbreaks.info()

# Print each column's name followed by the frequency table of its values.
for column_name in outbreaks:
    print(f"{column_name} \n \n")
    frequencies = outbreaks[column_name].value_counts()
    print(frequencies)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284484 entries, 0 to 284483
Data columns (total 36 columns):
surname                       284484 non-null object
firstname                     284484 non-null object
middlename                    284484 non-null object
gender_male                   284484 non-null int64
gender_female                 284484 non-null int64
state                         284484 non-null object
rural_settlement              284484 non-null int64
urban_settlement              284484 non-null int64
report_date                   283112 non-null object
report_year                   284484 non-null int64
age                           284484 non-null int64
date_of_birth                 279426 non-null object
child_group                   284484 non-null int64
adult_group                   284484 non-null int64
disease                       284484 non-null object
cholera                       284484 non-null int64
diarrhoea                     284484 non-null int6

In [64]:
# Remove the 'serotype' column, then discard every row that still contains a
# missing value. The steps are chained and reassigned instead of using
# inplace=True; errors='ignore' makes the cell safe to re-run, whereas the
# in-place version raised KeyError on a second run because 'serotype' had
# already been removed.
outbreaks = (
    outbreaks
    .drop(columns=['serotype'], errors='ignore')
    .dropna()
)
outbreaks.info()

KeyError: MemoryError()