### Tanzanian Water Pump Classification ###

### Data Cleaning

This notebook demonstrates the data cleaning applied to the training labels and training values. 
Here we deal with null values, missing values, and outliers. 

In [133]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle 
import numpy as np

# Display configuration for the notebook.
# Use the fully qualified option name: the bare key 'precision' is matched as a
# pattern and is ambiguous (or missing) on newer pandas releases.
pd.set_option('display.precision', 4)
# Do not truncate long sequences (e.g. Index reprs) when displaying them.
pd.options.display.max_seq_items = None


### Read in training labels and values: 

In [134]:
# Target labels (used below through `Y.status_group` and merged on `id`).
Y = pd.read_csv("TRAINING_LABELS.csv")
# Feature values; the first CSV column becomes the index.
df = pd.read_csv("TRAINING_VALUES.csv", index_col=0)

In [135]:
# Attach the target labels to the feature frame, matching rows on `id`.
# NOTE: this cell is not idempotent - re-running it merges `Y` a second time.
df = pd.merge(df, Y, on="id")

First, we look at the training values to assess the data we have and to get a feel for it. 

In [136]:
# Preview the first rows of the merged frame (features plus `status_group`).
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.9381,-9.8563,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.6988,-2.1475,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.4607,-3.8213,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.4862,-11.1553,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.1308,-1.8254,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [137]:
# Summary statistics of the numeric columns; used to spot suspicious zeros.
df.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.1318,317.6504,668.2972,34.0774,-5.706,0.4741,15.297,5.6297,179.91,1300.6525
std,21453.1284,2997.5746,693.1164,6.5674,2.946,12.2362,17.5874,9.6336,471.4822,951.6205
min,0.0,0.0,-90.0,0.0,-11.649,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.0903,-8.5406,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.9087,-5.0216,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.1784,-3.3262,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.3452,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [138]:
# (rows, columns) of the merged frame.
df.shape

(59400, 41)

### Assessing our Target ###
Looking at the values below, we will want to handle this class imbalance later. 


In [139]:
# Class counts of the target label.
Y["status_group"].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

### Dealing with NaN and Null Values; Checking for Duplicates

Several columns contain many null values: funder, installer, subvillage, public_meeting, scheme_management,
scheme_name, and permit. 


In [140]:
# Number of missing values per column before any imputation.
df.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

Let's deal with the null values first. 

### Funder

Fill in missing or null values in `funder` and `installer` with 'Unknown'.

In [141]:
#df.funder.value_counts()

In [142]:
#df.installer.value_counts()

In [143]:
# Missing funder / installer names get the same explicit placeholder.
for text_column in ('funder', 'installer'):
    df[text_column] = df[text_column].fillna('Unknown')

### Subvillage

In [144]:
#df.subvillage.value_counts()

In [145]:
#df.subvillage.isnull().sum()

In [146]:
def _most_common(values):
    """Return the most frequent entry of ``values`` (value_counts skips nulls by default)."""
    return values.value_counts().index[0]


# Most frequent subvillage within each region, indexed by region.
freq_subvil = df.groupby(['region'])['subvillage'].apply(_most_common)

In [147]:
# Replace a missing subvillage with the most common subvillage of the row's region.
subvillage_missing = df['subvillage'].isnull()
regional_subvillage = freq_subvil[df['region']]  # one lookup per row, in row order
df['subvillage'] = np.where(subvillage_missing, regional_subvillage, df['subvillage'])

### Public Meeting
Given True is the majority value for public meetings, we want to fill in True. 

In [148]:
#df.public_meeting.value_counts()

In [149]:
# Impute missing `public_meeting` entries with True (stated above to be the majority value).
df['public_meeting'] = df['public_meeting'].fillna(value=True)

### Scheme Management and Scheme Name 
Fill the NA values with 'Unknown' for scheme management and 'None' for scheme name. 

In [150]:
#df.scheme_management.value_counts()

In [151]:
#df.scheme_name.value_counts()

In [152]:
# Placeholder per column: 'Unknown' for the managing body, 'None' for the scheme name.
scheme_placeholders = {'scheme_management': 'Unknown', 'scheme_name': 'None'}
for scheme_column, placeholder in scheme_placeholders.items():
    df[scheme_column] = df[scheme_column].fillna(placeholder)

### Permit

If the classes are highly imbalanced, impute the majority class; otherwise assign a random boolean. 

In [153]:
#df.permit.value_counts()

In [154]:
# One random boolean per missing `permit` value.
# NOTE(review): `rand_choice` is not referenced by any later cell in this notebook.
rand_choice = np.random.choice([True, False], size=df['permit'].isnull().sum())

In [155]:
# Fill missing `permit` values with random booleans.
# A seeded generator makes the imputation (and the pickled output) reproducible
# under Restart & Run All; the original draw used the unseeded global RNG.
PERMIT_SEED = 42
permit_rng = np.random.default_rng(PERMIT_SEED)
permit_missing = df['permit'].isnull()
# `mask` only takes values from the replacement array at positions where the
# condition is True, so the array must be as long as the frame.
df['permit'] = df['permit'].mask(permit_missing, permit_rng.choice([True, False], size=len(df)))

In [156]:
# All gaps are filled, so the column can be stored as a plain boolean.
df['permit'] = df['permit'].astype(bool)

### Double Check for Nulls and Duplicates

In [157]:
# Re-count missing values per column after imputation.
df.isna().sum()

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
recorded_by              0
scheme_management        0
scheme_name              0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
s

In [158]:
# Rows whose `id` already appeared earlier in the frame (an empty result means no duplicates).
df.loc[df.duplicated(subset='id')]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group


 Null values have been removed. No duplicates were found. 
 While we don't have null values now, many of our values are not correct. 

### Data Abnormalities and Outliers


Several of our features have values that look implausible. Construction year, for example, contains a large number of zeros. Imputing these may take additional research so that the imputed values are meaningful. 

In [159]:
# List all column names of the cleaned frame so far.
df.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

In [160]:
### Amount TSH 

In [161]:
#df['construction_year'].value_counts()

In [162]:
#df['date_recorded'].value_counts()

In [163]:
#list(df[df['construction_year'] == 0].date_recorded)

Nearly 35% of the construction year values are missing (recorded as 0). 
That is too much data to drop or to impute with an average value. 
We decided to keep these values as 0 and to keep this in mind during feature engineering and selection.


In [164]:
# Share of rows whose construction year is recorded as 0.
int((df['construction_year'] == 0).sum()) / len(df)

0.34863636363636363

### Latitude and Longitude 


In [165]:
# Mean latitude / longitude per region, computed only from rows with a usable
# longitude. describe() above shows a minimum longitude of 0.0, and the next
# cell treats every longitude below 5 as missing.
valid_coords = df[df.longitude > 5]
# Select the columns with a list: tuple-style selection on a groupby
# (['latitude', 'longitude'] without the extra brackets) was deprecated and
# later removed from pandas.
avg_lat_long = valid_coords.groupby('region')[['latitude', 'longitude']].mean()

  This is separate from the ipykernel package so we can avoid doing imports until


In [166]:
# Rows with a longitude below 5 get their region's mean coordinates.
# The mask is taken once, before either column is overwritten.
coords_missing = df['longitude'] < 5
region_latitude = avg_lat_long.loc[df['region'], 'latitude']
region_longitude = avg_lat_long.loc[df['region'], 'longitude']
df['latitude'] = np.where(coords_missing, region_latitude, df['latitude'])
df['longitude'] = np.where(coords_missing, region_longitude, df['longitude'])

### GPS Height

Seems that there's another grossly missing number in gps height. 

In [167]:
# Share of rows whose GPS height is recorded as 0.
int((df['gps_height'] == 0).sum()) / len(df)


0.3440740740740741

Many of the populations are listed as 0 which may mean that the wells are not located in populated areas and that people may have to travel from far away to get to them. 

In [168]:
# Frequency of each recorded population figure.
df.population.value_counts()

0       21381
1        7025
200      1940
150      1892
250      1681
        ...  
3241        1
1960        1
1685        1
2248        1
1439        1
Name: population, Length: 1049, dtype: int64

In [169]:
# Share of rows whose population is recorded as 0.
int((df['population'] == 0).sum()) / len(df)

0.35994949494949496

### Funder


Many of these are misspelled or not combined well such as Babtest, Babtist, Baptist Church etc. 
Going through and correcting as many as possible. 

In [170]:
# Normalise case first: every matching rule in the cells below is written in lowercase.
df['funder'] = df.funder.str.lower()

In [171]:
# Funder normalisation, batch 1.
# Each rule is (boolean mask, canonical name). np.select applies the FIRST
# matching rule per row, so the order of the rules matters.
funder_col = df['funder']
rules = [
    # Patterns must be lowercase because the column was lowercased above.
    (funder_col.isin(['christan outrich', 'cristan outrich', 'christian outrich', 'christian outreach']),
     'christian outreach'),
    (funder_col.str.contains('bingo'), 'bingo foundation germany'),
    (funder_col.str.contains('bukwang church saint'), 'bukwang church saints'),
    # regex=False: the parentheses are literal text, not a regex capture group.
    (funder_col.str.contains('ces (gmbh)', regex=False), 'ces(gmbh)'),
    # NOTE(review): several isin entries below carry a trailing space ('dani ',
    # 'das ', 'compa ', 'cg ') - confirm the raw values really look like that.
    (funder_col.isin(['danida', 'dani ']), 'danida'),
    (funder_col.isin(['dasp', 'das ']), 'dasp ltd'),
    # Exclude 'africare', which has its own rule in a later cell.
    (funder_col.str.contains('care') & ~funder_col.str.contains('africare'), 'care international'),
    (funder_col.str.contains('domestic rural'), 'domestic rural development'),
    (funder_col.isin(['compa ', 'compassion']), 'compassion international'),
    (funder_col.str.contains('bs '), 'bsf'),
    (funder_col.str.contains('comunedi'), 'comune de roma'),
    # Exclude 'municipal...' (contains 'cip'), which has its own rule in a later cell.
    (funder_col.str.contains('cip') & ~funder_col.str.contains('munic'), 'cipro'),
    (funder_col.isin(['commu', 'comm']), 'community'),
    (funder_col.isin(['cg ', 'cg/rc', 'cgi']), 'cgc'),
    (funder_col.isin(['conce', 'concen', 'concern']), 'concern worldwide'),
]
cond = [mask for mask, _ in rules]
vals = [name for _, name in rules]
# Rows that match no rule keep their current funder value.
df['funder'] = np.select(cond, vals, df.funder)

  return func(self, *args, **kwargs)


In [172]:
# Funder normalisation, batch 2.
# Each rule is (boolean mask, canonical name). np.select applies the FIRST
# matching rule per row, so specific rules must come before broader ones.
funder_col = df['funder']
rules = [
    (funder_col.isin(['fin water', 'fini water', 'finn water', 'finw', 'finwater', 'fiwater']), 'fin water'),
    (funder_col.str.contains('egypt'), 'egypt'),
    (funder_col.str.contains('eung am'), 'eung am methodist church'),
    (funder_col.str.contains('farm-africa'), 'farm africa'),
    (funder_col.str.contains('franc'), 'france'),
    (funder_col.str.contains('fredked'), 'friedkin conservation fund'),
    (funder_col.str.contains('finland'), 'finland'),
    # The exact-match missionary rule has to precede the generic 'germany'
    # substring rule, otherwise it can never fire.
    (funder_col.isin(['germany cristians', 'germany misionary', 'germany missionary']), 'german missionaries'),
    # Keep 'bingo foundation germany' (assigned in the previous cell) intact.
    (funder_col.str.contains('germany') & ~funder_col.str.contains('bingo'), 'germany'),
    (funder_col.str.contains('hotels and '), 'hotels and lodges tanzinia'),
    (funder_col.isin(['holla ', 'holland']), 'holland'),
    (funder_col.str.contains('irevea'), 'irevea sister water'),
    (funder_col.isin(['iran', 'irc', 'ir']), 'iran'),
    (funder_col.isin(['wug and ded', 'wua']), 'wua and ded'),
    (funder_col.str.contains('zao water'), 'zao water spring'),
]
cond = [mask for mask, _ in rules]
vals = [name for _, name in rules]
# Rows that match no rule keep their current funder value.
df['funder'] = np.select(cond, vals, df.funder)

In [173]:
# Funder normalisation, batch 3: (mask, canonical name) pairs, first match wins.
funder_col = df['funder']
rules = [
    (funder_col.isin(['dhv ', 'dhinu', 'dhv/gove', 'dgv\\swiss', 'dhv\\nord', 'dh']), 'dhv'),
    (funder_col.isin(['worldvision', 'world vision/rc church', 'world vision/adra', 'world vision/ kkkt', '/world vision']),
     'world vision'),
    (funder_col.str.contains('world bank'), 'world bank'),
    (funder_col.str.contains('farm-africa'), 'farm africa'),
    (funder_col.str.contains('women fo'), 'women for partnership'),
    (funder_col.isin(['churc']), 'church'),
    (funder_col.isin(['hesaw', 'hesawa', 'hesawa', 'hesawwa', 'hesawz', 'hesawza', 'hesswa', 'hewasa', 'hewawa']),
     'hesawa world wide'),
    (funder_col.isin(['insititutiona', 'institutional', 'insututional']), 'institution'),
    (funder_col.isin(['halimashau', 'halimashauli', 'halamashauli', 'halmashaur', 'halmashauri', 'halmashauri wil', 'halmashauri ya']),
     'halamashauri ya wilaya'),
    (funder_col.isin(['wateraid', 'water', 'water aid/', 'water aid', 'wate']), 'water aid'),
    (funder_col.isin(['vwt', 'vw', 'vwcvc', 'vwcvwc']), 'vwc'),
    (funder_col.str.contains('wamissionari wa kikatoriki'), 'wamisionari wa kikatoriki'),
    (funder_col.isin(['village', 'villages', 'villlage', 'villegers', 'villagers', 'villa']), 'village'),
    (funder_col.isin(['us', 'usa', 'u.s.a', 'u.s.', 'usaid', 'u.s']), 'united states of america'),
    (funder_col.isin(['totoland', 'tlc', 'tltc', 'totaland', 'total land']), 'total land care'),
]
cond = [mask for mask, _ in rules]
vals = [name for _, name in rules]
# Unmatched rows keep their current funder value.
df['funder'] = np.select(cond, vals, default=df['funder'])

In [174]:
# Funder normalisation, batch 4.
# Each rule is (boolean mask, canonical name). np.select applies the FIRST
# matching rule per row, so specific rules must come before broader ones.
funder_col = df['funder']
rules = [
    (funder_col.str.contains('tag'), 'tag church'),
    (funder_col.isin(['swiss', 'swis', 'swash', 'swed', 'swif', 'swid']), 'sweden'),
    # 'national rural' must be tested before the generic 'rural' rule,
    # otherwise the generic rule always wins and this one never fires.
    (funder_col.str.contains('national rural'), 'national rural'),
    # Leave 'domestic rural development' (assigned in an earlier cell) untouched.
    (funder_col.str.contains('rural') & ~funder_col.str.contains('domestic rural'), 'rural water supply'),
    (funder_col.isin(['rotary', 'rotery', 'rotte', 'rotaty']), 'rotary club'),
    (funder_col.isin(['rc', 'roman', 'cathoric', 'catholic']), 'roman catholic church'),
    (funder_col.isin(['quick', 'qwiqwi', 'qwick', 'qwekwin', 'qwckwin', 'quwkwin']), 'quick win project'),
    (funder_col.isin(['pentekoste', 'pntecostal', 'pentecostal', 'pentecosta', 'pentecost']), 'pentecostal church'),
    (funder_col.isin(['private', 'priv']), 'private'),
    (funder_col.isin(['ox', '0x']), 'oxfam'),
    (funder_col.str.contains('oikos'), 'oikos'),
    (funder_col.str.contains('nora'), 'norad'),
    (funder_col.isin(['nethe', 'netherla', 'nerthlands', 'nethalan', 'nl']), 'netherlands'),
    (funder_col.isin(['nazalet', 'nazaleti', 'nazareth', 'nazerene']), 'nazareth church'),
    (funder_col.str.contains('koico'), 'koica'),
]
cond = [mask for mask, _ in rules]
vals = [name for _, name in rules]
# Rows that match no rule keep their current funder value.
df['funder'] = np.select(cond, vals, df.funder)

In [175]:
# Funder normalisation, batch 5.
# Each rule is (boolean mask, canonical name); np.select applies the FIRST
# matching rule per row. `isin` compares whole values only, so short entries
# do not match longer names that merely contain them.
funder_col = df['funder']
rules = [
    (funder_col.isin(['missi', 'mission', 'miss']), 'missionary'),
    (funder_col.str.contains('action a'), 'action aid'),
    (funder_col.str.contains('mkinga'), 'mkinga district council'),
    (funder_col.isin(['milinia', 'miliniem', 'millenium']), 'millennium project'),
    (funder_col.isin(['member of parlement', 'member of parlament', 'member of parliment']), 'member of parliament'),
    # NOTE(review): 'maju mugumu' may be a typo for 'maji mugumu' - confirm before changing.
    (funder_col.isin(['maji', 'mugumu']), 'maju mugumu'),
    (funder_col.str.contains('magadini-makiwaru'), 'magadini makiwaru water'),
    (funder_col.isin(['makonde', 'makondakonde']), 'makonde water supply'),
    (funder_col.isin(['luthe', 'lutheran']), 'lutheran church'),
    (funder_col.str.contains('maro'), 'maro kyariga'),
    # Label previously read 'lake anganyika basin' (dropped "t").
    (funder_col.str.contains('lake tanganyika'), 'lake tanganyika basin'),
    (funder_col.str.contains('lion'), 'lions club'),
    (funder_col.isin(['lottery', 'lotery', 'lotary']), 'lottery club'),
    (funder_col.isin(['lawate', 'lawatefuka', 'lawate-fuka']), 'lawate fuka water supply'),
    # The column is lowercase at this point, so the patterns must be lowercase
    # too; the capitalised originals could never match any row.
    (funder_col.isin(['moravian', 'moroil', 'morovian', 'morovian church', 'morrovian']), 'moravian church'),
]
cond = [mask for mask, _ in rules]
vals = [name for _, name in rules]
# Rows that match no rule keep their current funder value.
df['funder'] = np.select(cond, vals, df.funder)

In [176]:
# Funder normalisation, batch 6: (mask, canonical name) pairs, first match wins.
funder_col = df['funder']
rules = [
    # NOTE(review): the one-letter label 'l' looks truncated - confirm the intended name.
    (funder_col.isin(['lgc', 'lgd']), 'l'),
    (funder_col.str.contains('kanis'), 'kanisa'),
    (funder_col.str.contains('livin'), 'living water international'),
    (funder_col.str.contains('jeshi'), 'jeshi la wokovu'),
    (funder_col.str.contains('japan'), 'japan'),
    (funder_col.str.contains('italy'), 'italy'),
    (funder_col.str.contains('islam'), 'islamic society'),
    (funder_col.str.contains('isf'), 'isf'),
    (funder_col.str.contains('irish'), 'ireland'),
    (funder_col.isin(['tansi', 'tanza']), 'tanzania'),
]
cond = [mask for mask, _ in rules]
vals = [name for _, name in rules]
# Unmatched rows keep their current funder value.
df['funder'] = np.select(cond, vals, default=df['funder'])

In [178]:
# Funder normalisation, batch 7: (mask, canonical name) pairs, first match wins.
funder_col = df['funder']
has_munic = funder_col.str.contains('munic')
rules = [
    (funder_col.isin(['adp', 'adp bungu', 'adp mombo', 'adp/w', 'magoma adp', 'moshono adp']), 'adp'),
    (funder_col.isin(['aic', 'aic church', 'aic kij', 'ai']), 'aic'),
    (funder_col.str.contains('aict'), 'aict'),
    (funder_col.str.contains('arab'), 'arab'),
    (funder_col.str.contains('apm'), 'apm'),
    (funder_col.isin(['babtest', 'babtist', 'buptist', 'batist church', 'baptist church']), 'baptist_church'),
    (funder_col.str.contains('amref'), 'amref'),
    (funder_col.str.contains('bgss'), 'bgss'),
    (funder_col.str.contains('africare'), 'africare'),
    (has_munic, 'municipal_council'),
    # 'unic' also occurs inside 'munic...', hence the explicit exclusion.
    (funder_col.str.contains('unic') & ~has_munic, 'unicef'),
    (funder_col.str.contains('bread'), 'bread_of_the_world'),
    (funder_col.str.contains('calt'), 'caltas'),
    (funder_col.str.contains('canada'), 'canada'),
    (funder_col.str.contains('commu'), 'community'),
]
cond = [mask for mask, _ in rules]
vals = [name for _, name in rules]
# Unmatched rows keep their current funder value.
df['funder'] = np.select(cond, vals, default=df['funder'])

In [179]:
# Funder normalisation, batch 8.
# Each rule is (boolean mask, canonical name); np.select applies the FIRST
# matching rule per row. `isin` compares whole values only, so 'de' does not
# match longer names that merely contain "de".
funder_col = df['funder']
rules = [
    (funder_col.isin(['schoo', 'school']), 'school'),
    (funder_col.str.contains('unhcr/'), 'unhcr'),
    # Label previously read 'tscrs', which turned a correct 'tcrs' into a misspelling.
    (funder_col.str.contains('tcrs'), 'tcrs'),
    (funder_col.isin(['tasaf', 'tasafu', 'tasef', 'tasf', 'tassaf', 'tasae', 'tasad']), 'tasaf'),
    (funder_col.isin(['rwsp', 'rwsso', 'rwssp', 'rwsssp']), 'rwssp'),
    (funder_col.isin(['w.d.&.i.', 'w.d.& i.', 'w.d.&']), 'w.d.&i'),
    (funder_col.isin(['kkkt', 'kkkt_makwale']), 'kkkt'),
    (funder_col.isin(['de', 'ded', 'ded/rwssp', 'ded_rwsp']), 'ded'),
    (funder_col.isin(['china', 'chinese']), 'china'),
    (funder_col.str.contains('belgian'), 'belgium'),
    (funder_col.str.contains('dmk'), 'dmk anglican'),
    (funder_col.str.contains('balo'), 'ballo'),
    (funder_col.isin(['dmd', 'dmdd', 'dmdd/solider', 'dmmd']), 'dmdd'),
    # The column was lowercased earlier, so the placeholder from fillna is now
    # 'unknown' and a literal "None" would appear as 'none'. Match and emit
    # lowercase so only one "unknown" category exists.
    (funder_col.isin(['0', 'none']), 'unknown'),
    # Lowercase patterns: the capitalised originals could never match any row.
    (funder_col.isin(['moravian', 'moroil', 'morovian', 'morovian church', 'morrovian']), 'moravian church'),
]
cond = [mask for mask, _ in rules]
vals = [name for _, name in rules]
# Rows that match no rule keep their current funder value.
df['funder'] = np.select(cond, vals, df.funder)

In [180]:
# Persist the cleaned frame to the working directory.
pd.to_pickle(df, 'clean_df.pkl')