### Tanzinian Water Pump Classification ###

### Data Cleaning

This notebook serves a purpose to show data cleaning for the training labels and values data. 
Here we will deal with null values, missing values, and outliers. 

In [203]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle 
import numpy as np

pd.set_option('precision', 4)
pd.options.display.max_seq_items = None


### Read in training labels and values: 

In [204]:
Y = pd.read_csv('TRAINING_LABELS.csv')
df = pd.read_csv('TRAINING_VALUES.csv', index_col = 0)

In [205]:
# Merge Y to the data frame after we looked at values below

df = df.merge(Y, on = 'id')

First, we look at the training values to assess the data we have and to get a feel for it. 

In [206]:
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.9381,-9.8563,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.6988,-2.1475,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.4607,-3.8213,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.4862,-11.1553,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.1308,-1.8254,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [207]:
df.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.1318,317.6504,668.2972,34.0774,-5.706,0.4741,15.297,5.6297,179.91,1300.6525
std,21453.1284,2997.5746,693.1164,6.5674,2.946,12.2362,17.5874,9.6336,471.4822,951.6205
min,0.0,0.0,-90.0,0.0,-11.649,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.0903,-8.5406,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.9087,-5.0216,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.1784,-3.3262,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.3452,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [208]:
df.shape

(59400, 41)

### Assessing our Target ###
Looking at the values below, we will want to handle this class imbalance later. 


In [209]:
Y.status_group.value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

### Dealing with NaN and Null Values; Checking for Duplicates

We have a lot of null values: funder, installer, subvillage, public_meeting, scheme_management,
    scheme_name, and permit. 


In [210]:
df.isnull().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

Let's deal with the null values first. 

### Funder

Fill in missing or null values with 'Unknown'

In [211]:
#df.funder.value_counts()

In [212]:
#df.installer.value_counts()

In [213]:
df['funder'] = df.funder.fillna('Unknown')
df['installer'] = df.installer.fillna('Unknown')

### Subvillage

In [214]:
#df.subvillage.value_counts()

In [215]:
#df.subvillage.isnull().sum()

In [216]:
freq_subvil = df.groupby(['region']).subvillage.apply(lambda x: x.value_counts().index[0])

In [217]:
df['subvillage'] = np.where(df.subvillage.isnull(), 
                            freq_subvil[df.region], 
                            df.subvillage)

### Public Meeting
Given True is the majority value for public meetings, we want to fill in True. 

In [218]:
#df.public_meeting.value_counts()

In [219]:
df['public_meeting'] = df.public_meeting.fillna(True)

### Scheme Management and Scheme Name 
Fill in the na values Unknown for managment and None for name. 

In [220]:
#df.scheme_management.value_counts()

In [221]:
#df.scheme_name.value_counts()

In [222]:
df['scheme_management'] = df.scheme_management.fillna('Unknown')
df['scheme_name'] = df.scheme_name.fillna('None')

### Permit

If highly class imbalanced, impute the majority, if not assign random bool. 

In [223]:
#df.permit.value_counts()

In [224]:
rand_choice = np.random.choice([True, False], df.permit.isnull().sum())

In [225]:
df['permit']= df.permit.mask(df.permit.isnull(), np.random.choice([True, False], size=len(df)))

In [226]:
df['permit'] = df.permit.astype('bool')

### Double Check for Nulls and Duplicates

In [227]:
df.isnull().sum()

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
recorded_by              0
scheme_management        0
scheme_name              0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
s

In [228]:
df[df.duplicated('id')]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group


 Null values have been removed. No duplicates were found. 
 While we don't have null values now, many of our values are not correct. 

### Data Abnormalities and Outliers


Several of our features seem to be a little off on the numbers. For construction year, these values have a lot of zero values This may take additional research so that imputed values have significance. 

In [229]:
df.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

In [230]:
### Amount TSH 

In [231]:
#df['construction_year'].value_counts()

In [232]:
#df['date_recorded'].value_counts()

In [233]:
#list(df[df['construction_year'] == 0].date_recorded)

Of the construction year, we are missing nearly 35% of the data. 
This is a lot of information to drop or to inpute with an average value. 
We decided to keep this data as 0 and keep it in mind for consideration during feature engineering and selection.


In [234]:
len(df[df.construction_year == 0])/len(df)

0.34863636363636363

### Latitude and Longitude 


In [235]:
tmp = df.copy()
tmp = df[df.longitude > 5]
avg_lat_long = tmp.groupby('region')['latitude', 'longitude'].mean()

  This is separate from the ipykernel package so we can avoid doing imports until


In [236]:
df['latitude'] = np.where(df.longitude < 5, 
         avg_lat_long['latitude'][df.region], df.latitude)
df['longitude'] = np.where(df.longitude < 5, 
         avg_lat_long['longitude'][df.region], df.longitude)

### GPS Height

Seems that there's another grossly missing number in gps height. 

In [237]:
len(df[df.gps_height == 0]) /len(df)


0.3440740740740741

Many of the populations are listed as 0 which may mean that the wells are not located in populated areas and that people may have to travel from far away to get to them. 

In [238]:
df['population'].value_counts()

0       21381
1        7025
200      1940
150      1892
250      1681
        ...  
3241        1
1960        1
1685        1
2248        1
1439        1
Name: population, Length: 1049, dtype: int64

In [239]:
len(df[df.population == 0]) /len(df)

0.35994949494949496

### Funder


Many of these are misspelled or not combined well such as Babtest, Babtist, Baptist Church etc. 
Going through and correcting as many as possible. 

In [240]:
# Make All Funder Lowercase 
df['funder'] = df['funder'].str.lower()

In [241]:

#  'Adp','Adp Bungu', 'Adp Mombo', 'Adp/w',
df.loc[df['funder'].str.contains('adp'), 'funder'] = 'adp'

# 'Afric', 'Africa','African',
df.loc[df['funder'].str.contains('afric'), 'funder'] = 'africa'
df.loc[df['funder'].str.contains('african'), 'funder'] = 'africa'

#   'Ai','Aic', 'Aic Church', 'Aic Kij', 'Aict',
df.loc[df['funder'].str.contains('aic'), 'funder'] = 'aic'
df.loc[df['funder'].str.contains('ai'), 'funder'] = 'aic'
df.loc[df['funder'].str.contains('aict'), 'funder'] = 'aic'

# 'Amref', 'Amrefe',
df.loc[df['funder'].str.contains('amrefe'), 'funder'] = 'amref'

# 'Apm', 'Apm[africa Precious Metals Lt',
df.loc[df['funder'].str.contains('apm'), 'funder'] = 'apm'
df.loc[df['funder'].str.contains('apm[africa]'), 'funder'] = 'apm'

#  'Arab Community', 'Arabi', 'Arabs Community',
df.loc[df['funder'].str.contains('arabi'), 'funder'] = 'arab community'
df.loc[df['funder'].str.contains('arab'), 'funder'] = 'arab community'

#  'Babtest','Babtist', 'Baptist Church', 'Batist Church', 'Buptist'
df.loc[df['funder'].str.contains('babtest'), 'funder'] = 'baptist church'
df.loc[df['funder'].str.contains('babtist'), 'funder'] = 'baptist church'
df.loc[df['funder'].str.contains('batist'), 'funder'] = 'baptist church'
df.loc[df['funder'].str.contains('buptist'), 'funder'] = 'baptist church'

#  'Bgss', 'Bgssws',
df.loc[df['funder'].str.contains('bgssws'), 'funder'] = 'bgss'

#  'Bingo Foundation','Bingo Foundation Germany',
df.loc[df['funder'].str.contains('bingo'), 'funder'] = 'bingo foundation germany'

#  'Bread For The Wor', 'Bread Of The Worl',
df.loc[df['funder'].str.contains('bread for'), 'funder'] = 'bread of the world'
df.loc[df['funder'].str.contains('bread of'), 'funder'] = 'bread of the world'

#  'Bs', 'Bsf',
df.loc[df['funder'].str.contains('bs'), 'funder'] = 'bsf'

# 'Bukwang Church Saint','Bukwang Church Saints',
df.loc[df['funder'].str.contains('bukwang church saint'), 'funder'] = 'bukwang church saints'

#  'Caltas','Caltas Tanzania','Caltaz Kahama', 'Caltus'
df.loc[df['funder'].str.contains('caltas'), 'funder'] = 'caltas'
df.loc[df['funder'].str.contains('caltus'), 'funder'] = 'caltas'
df.loc[df['funder'].str.contains('caltaz'), 'funder'] = 'caltas'

#  'Canada','Canada Aid',
df.loc[df['funder'].str.contains('canada'), 'funder'] = 'canada'

# 'Care Int', 'Care International', 'Care/cipro', 'Care/dwe',
df.loc[df['funder'].str.contains('care'), 'funder'] = 'care international'

#  'Ces (gmbh)','Ces(gmbh)',
df.loc[df['funder'].str.contains('ces (gmbh)'), 'funder'] = 'ces(gmbh)'

# 'Cg', 'Cg/rc', 'Cgc', 'Cgi',
df.loc[df['funder'].str.contains('cg'), 'funder'] = 'cgc'
df.loc[df['funder'].str.contains('cg/rc'), 'funder'] = 'cgc'
df.loc[df['funder'].str.contains('cgi'), 'funder'] = 'cgc'

#  'Christan Outrich', 'Christian Outrich', 'Chuo', 'Churc', 'Church', 'Cristan Outrich',
df.loc[df['funder'].str.contains('christan outrich'), 'funder'] = 'christian outreach'
df.loc[df['funder'].str.contains('chuo'), 'funder'] = 'christian outreach'
df.loc[df['funder'].str.contains('churc'), 'funder'] = 'christian outreach'
df.loc[df['funder'].str.contains('church'), 'funder'] = 'christian outreach'
df.loc[df['funder'].str.contains('cristan outrich'), 'funder'] = 'christian outreach'
df.loc[df['funder'].str.contains('christian outrich'), 'funder'] = 'christian outreach'
df.loc[df['funder'].str.contains('christian Outreach'), 'funder'] = 'christian outreach'


#  'Cip', 'Cipro', 'Cipro/care', 'Cipro/care/tcrs', 'Cipro/government',
df.loc[df['funder'].str.contains('cip'), 'funder'] = 'cipro'
df.loc[df['funder'].str.contains('cipro'), 'funder'] = 'cipro'
df.loc[df['funder'].str.contains('cipro/care'), 'funder'] = 'cipro'
df.loc[df['funder'].str.contains('cipro/care/tcrs'), 'funder'] = 'cipro'
df.loc[df['funder'].str.contains('cipro/government'), 'funder'] = 'cipro'
df.loc[df['funder'].str.contains('cipo'), 'funder'] = 'cipro'

# 'Conce', 'Concen', 'Concern', 'Concern /govern', 'Concern World Wide', 'Concern/governm',
# 'Co', 'Cocen', 'Cocern',
df.loc[df['funder'].str.contains('co'), 'funder'] = 'concern worldwide'
df.loc[df['funder'].str.contains('cocen'), 'funder'] = 'concern worldwide'
df.loc[df['funder'].str.contains('conce'), 'funder'] = 'concern worldwide'
df.loc[df['funder'].str.contains('concern'), 'funder'] = 'concern worldwide'
df.loc[df['funder'].str.contains('concern /govern'), 'funder'] = 'concern worldwide'
df.loc[df['funder'].str.contains('concern/governm'), 'funder'] = 'concern worldwide'
df.loc[df['funder'].str.contains('concern'), 'funder'] = 'concern worldwide'

#'Commu','Community',
df.loc[df['funder'].str.contains('commu'), 'funder'] = 'community'
df.loc[df['funder'].str.contains('comm'), 'funder'] = 'community'

# 'Compa', 'Compasion International',
df.loc[df['funder'].str.contains('compa'), 'funder'] = 'compassion international'
df.loc[df['funder'].str.contains('compasion'), 'funder'] = 'compassion international'

# 'Comune Di Roma','Comunedi Roma',
df.loc[df['funder'].str.contains('comunedi'), 'funder'] = 'comune de roma'

# 'Dasp', 'Dasp Ltd',
df.loc[df['funder'].str.contains('dasp'), 'funder'] = 'dasp ltd'
df.loc[df['funder'].str.contains('das'), 'funder'] = 'dasp ltd'

#  'Danida', 'Danida /government',
df.loc[df['funder'].str.contains('dani'), 'funder'] = 'danida'
df.loc[df['funder'].str.contains('danida /government'), 'funder'] = 'danida'

# 'Domestic Rural Development Pr', 'Domestic Rural Development Pro',
df.loc[df['funder'].str.contains('domestic rural'), 'funder'] = 'domestic rural development'

#  'Dgv', 'Dh', 'Dhinu', 'Dhv', 'Dhv Moro', 'Dhv/gove', 'Dhv\\norp', 'Dhv\\swis',
df.loc[df['funder'].str.contains('dgv'), 'funder'] = 'dhv'
df.loc[df['funder'].str.contains('dh'), 'funder'] = 'dhv'
df.loc[df['funder'].str.contains('dhinu'), 'funder'] = 'dhv'
df.loc[df['funder'].str.contains('dhv moro'), 'funder'] = 'dhv'
df.loc[df['funder'].str.contains('dhv/gove'), 'funder'] = 'dhv'
df.loc[df['funder'].str.contains('dhv\\norp'), 'funder'] = 'dhv'
df.loc[df['funder'].str.contains('dgv\\swis'), 'funder'] = 'dhv'

# 'Egypt Government', 'Egypt Technical Co Operation',
df.loc[df['funder'].str.contains('egypt'), 'funder'] = 'egypt'

#  'Eung Am Methodist Church', 'Eung-am Methodist Church',
df.loc[df['funder'].str.contains('eung am'), 'funder'] = 'eung am methodist church'

# 'Farm Africa', 'Farm-africa',
df.loc[df['funder'].str.contains('farm-africa'), 'funder'] = 'farm africa'

#  'Fin Water', 'Fini Water', 'Finn Water', 'Finw', 'Finwater', 'Fiwater',
df.loc[df['funder'].str.contains('fin'), 'funder'] = 'fin water'
df.loc[df['funder'].str.contains('fini'), 'funder'] = 'fin water'
df.loc[df['funder'].str.contains('finn water'), 'funder'] = 'fin water'
df.loc[df['funder'].str.contains('finw'), 'funder'] = 'fin water'
df.loc[df['funder'].str.contains('finwater'), 'funder'] = 'fin water'
df.loc[df['funder'].str.contains('fiwater'), 'funder'] = 'fin water'

# 'Finland', 'Finland Government',
df.loc[df['funder'].str.contains('finland'), 'funder'] = 'finland'

#  'Franc','France',
df.loc[df['funder'].str.contains('franc'), 'funder'] = 'france'

#  'Fredked Conservation', 'Friedkin Conservation Fund',
df.loc[df['funder'].str.contains('fredked'), 'funder'] = 'friedkin conservation fund'

#  'Germany', 'Germany Republi',
df.loc[df['funder'].str.contains('germany'), 'funder'] = 'germany'

# 'Germany Cristians','Germany Misionary', 'Germany Missionary',
df.loc[df['funder'].str.contains('germany cristians'), 'funder'] = 'german missionaries'
df.loc[df['funder'].str.contains('germany misionary'), 'funder'] = 'german missionaries'
df.loc[df['funder'].str.contains('germany missionary'), 'funder'] = 'german missionaries'

#  'Hesaw', 'Hesawa', 'Hesawa And Concern World Wide', 'Hesawwa', 'Hesawz', 'Hesawza', 'Hesswa', 'Hewasa', 'Hewawa',
df.loc[df['funder'].str.contains('hesaw'), 'funder'] = 'hesawa world wide'
df.loc[df['funder'].str.contains('hesawa'), 'funder'] = 'hesawa world wide'
df.loc[df['funder'].str.contains('hesawa'), 'funder'] = 'hesawa world wide'
df.loc[df['funder'].str.contains('hesawwa'), 'funder'] = 'hesawa world wide'
df.loc[df['funder'].str.contains('hesawz'), 'funder'] = 'hesawa world wide'
df.loc[df['funder'].str.contains('hesawza'), 'funder'] = 'hesawa world wide'
df.loc[df['funder'].str.contains('hesswa'), 'funder'] = 'hesawa world wide'
df.loc[df['funder'].str.contains('hewasa'), 'funder'] = 'hesawa world wide'
df.loc[df['funder'].str.contains('hewawa'), 'funder'] = 'hesawa world wide'

#  'Hotels And Lodge Tanzania', 'Hotels And Loggs Tz Ltd',
df.loc[df['funder'].str.contains('hotels and'), 'funder'] = 'hotels and lodges tanzania'

#  'Holand', 'Holla', 'Holland',
df.loc[df['funder'].str.contains('holand'), 'funder'] = 'holland'
df.loc[df['funder'].str.contains('holla'), 'funder'] = 'holland'

#  'Halimashau','Halimashauli','Halmashauli','Halmashaur','Halmashauri', 'Halmashauri Wil' 
#'Halmashauri Ya Manispa Tabora', 'Halmashauri Ya Wilaya', 'Halmashauri Ya Wilaya Sikonge',
df.loc[df['funder'].str.contains('halimashau'), 'funder'] = 'halamashauri ya wilaya'
df.loc[df['funder'].str.contains('halimashauli'), 'funder'] = 'halamashauri ya wilaya'
df.loc[df['funder'].str.contains('halamashauli'), 'funder'] = 'halamashauri ya wilaya'
df.loc[df['funder'].str.contains('halmashaur'), 'funder'] = 'halamashauri ya wilaya'
df.loc[df['funder'].str.contains('halmashauri'), 'funder'] = 'halamashauri ya wilaya'
df.loc[df['funder'].str.contains('halmashauri wil'), 'funder'] = 'halamashauri ya wilaya'
df.loc[df['funder'].str.contains('halmashauri ya'), 'funder'] = 'halamashauri ya wilaya'

# 'Insititutiona', 'Institution', 'Institutional', 'Insututional', 
df.loc[df['funder'].str.contains('insititutiona'), 'funder'] = 'institution'
df.loc[df['funder'].str.contains('institution'), 'funder'] = 'institution'
df.loc[df['funder'].str.contains('institutional'), 'funder'] = 'institution'
df.loc[df['funder'].str.contains('insututional'), 'funder'] = 'institution'

#'Ir','Iran Gover', 'Irc',
df.loc[df['funder'].str.contains('iran gover'), 'funder'] = 'iran'
df.loc[df['funder'].str.contains('irc'), 'funder'] = 'iran'
df.loc[df['funder'].str.contains('ir'), 'funder'] = 'iran'

# 'Irevea Sister', 'Irevea Sister Water
df.loc[df['funder'].str.contains('irevea'), 'funder'] = 'irevea sister water'

#  'Zao Water Spring X', 'Zao Water Spring',
df.loc[df['funder'].str.contains('zao water'), 'funder'] = 'zao water spring'

# 'Wug And Ded', 'Wua And Ded', 'Wua'
df.loc[df['funder'].str.contains('wug and ded'), 'funder'] = 'wua and ded'
df.loc[df['funder'].str.contains('wua'), 'funder'] = 'wua and ded'

#  'Worldvision', 'World Vision/rc Church', 'World Vision/adra', 'World Vision/ Kkkt', 'World Vision',

df.loc[df['funder'].str.contains('worldvision'), 'funder'] = 'world vision'
df.loc[df['funder'].str.contains('world vision/rc church'), 'funder'] = 'world vision'
df.loc[df['funder'].str.contains('world vision/adra'), 'funder'] = 'world vision'
df.loc[df['funder'].str.contains('world vision/ kkkt'), 'funder'] = 'world vision'

#  'World Bank/government','World Bank',
df.loc[df['funder'].str.contains('world bank'), 'funder'] = 'world bank'
df.loc[df['funder'].str.contains('world bank/government'), 'funder'] = 'world bank'

# 'Women For Partnership', 'Women Fo Partnership',
df.loc[df['funder'].str.contains('women for partnership'), 'funder'] = 'women for partnership'
df.loc[df['funder'].str.contains('women fo'), 'funder'] = 'women for partnership'
df.loc[df['funder'].str.contains('women partnership'), 'funder'] = 'women for partnership'

#  'Wateraid'  'Water Aid/sema', 'Water Aid/dwe', 'Water Aid /sema', 'Water /sema', 'Water', 'Wate Aid/sema',
#  'Wanginyi Water', 'Wananchi', 'Wanakijiji',
df.loc[df['funder'].str.contains('wateraid'), 'funder'] = 'water aid'
df.loc[df['funder'].str.contains('water aid/sema'), 'funder'] = 'water aid'
df.loc[df['funder'].str.contains('water aid/dwe'), 'funder'] = 'water aid'
df.loc[df['funder'].str.contains('water aid / sema'), 'funder'] = 'water aid'
df.loc[df['funder'].str.contains('water / sema'), 'funder'] = 'water aid'
df.loc[df['funder'].str.contains('water'), 'funder'] = 'water aid'
df.loc[df['funder'].str.contains('wate'), 'funder'] = 'water aid'

# 'Wamissionari Wa Kikatoriki', 'Wamisionari Wa Kikatoriki',
df.loc[df['funder'].str.contains('wamissionari wa kikatoriki'), 'funder'] = 'wamisionari wa kikatoriki'

#  'Vwt','Vwcvwc', 'Vwcvc','Vwc', 'Vw',
df.loc[df['funder'].str.contains('vwt'), 'funder'] = 'vwc'
df.loc[df['funder'].str.contains('vw'), 'funder'] = 'vwc'
df.loc[df['funder'].str.contains('vwcvc'), 'funder'] = 'vwc'
df.loc[df['funder'].str.contains('vwcvwc'), 'funder'] = 'vwc'


  return func(self, *args, **kwargs)


In [242]:
#  'Villlage Contributi', 'Villegers', 'Villege Council', 'Villages', 'Villagers Mpi', 
#'Villagers', 'Village Water Commission', 'Village Res', 'Village Office', 'Village Govt', 'Village Government' 
#'Village Fund', 'Village Council/ Rose Kawala', 'Village Council/ Haydom Luther', 'Village Council', 'Village Contributio', 
#'Village Community', 'Village Communi', 'Village', 'Villaers', 'Villa',

df.loc[df['funder'].str.contains('village'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('villlage'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('villages'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('villagers mpi'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('villege council'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('villegers'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('villagers'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('village water commission'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('village council/ '), 'funder'] = 'village'
df.loc[df['funder'].str.contains('villa'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('villaers'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('village communi'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('village contributio'), 'funder'] = 'village'
df.loc[df['funder'].str.contains('village council'), 'funder'] = 'village'

# 'Usaid/wfp', 'Usaid', 'Usa Embassy', 'Us Embassy',
df.loc[df['funder'].str.contains('usa'), 'funder'] = 'united states of america'
df.loc[df['funder'].str.contains('usaid/wfp'), 'funder'] = 'united states of america'
df.loc[df['funder'].str.contains('usaid'), 'funder'] = 'united states of america'
df.loc[df['funder'].str.contains('us embassy'), 'funder'] = 'united states of america'
df.loc[df['funder'].str.contains('us'), 'funder'] = 'united states of america'
df.loc[df['funder'].str.contains('u.s.a'), 'funder'] = 'united states of america'
df.loc[df['funder'].str.contains('u.s'), 'funder'] = 'united states of america'
df.loc[df['funder'].str.contains('u.s.'), 'funder'] = 'united states of america'

#  'Uniseg', 'Unicrf', 'Unicet', 'Uniceg', 'Unicef/cspd', 'Unicef/central', 'Unicef/african Muslim Agency', 'Unicef/ Csp', 'Unicef', 'Unice/ Cspd', 'Unice',
df.loc[df['funder'].str.contains('unicrf'), 'funder'] = 'unicef'
df.loc[df['funder'].str.contains('uniseg'), 'funder'] = 'unicef'
df.loc[df['funder'].str.contains('unicet'), 'funder'] = 'unicef'
df.loc[df['funder'].str.contains('uniceg'), 'funder'] = 'unicef'
df.loc[df['funder'].str.contains('unicef/cspd'), 'funder'] = 'unicef'
df.loc[df['funder'].str.contains('unicef/central'), 'funder'] = 'unicef'
df.loc[df['funder'].str.contains('unicef/african'), 'funder'] = 'unicef'
df.loc[df['funder'].str.contains('unicef/ '), 'funder'] = 'unicef'
df.loc[df['funder'].str.contains('unice/ cspd'), 'funder'] = 'unicef'
df.loc[df['funder'].str.contains('unice'), 'funder'] = 'unicef'

In [243]:
#  'Totoland Care', 'Totoland', 'Totaland Care', 'Total Landcare', 'Total Land Care'
# 'Tltc', 'Tlc/thimotheo Masunga', 'Tlc/sorri', "Tlc/seleman Mang'ombe", 'Tlc/samora', 'Tlc/nyengesa Masanja', 'Tlc/john Majala', 
#'Tlc/jenus Malecha','Tlc/emmanuel Kasoga', 'Tlc/community', 'Tlc',

df.loc[df['funder'].str.contains('totoland'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('totaland'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('total landcare'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tltc'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/thimotheo'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/sorri'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/seleman'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/samora'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/nyengesa'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/john'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/jenus'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/emmanuel'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/community'), 'funder'] = 'total land care'
df.loc[df['funder'].str.contains('tlc/'), 'funder'] = 'total land care'

#  'Tag Church Ub', 'Tag Church', 'Tag',
df.loc[df['funder'].str.contains('tag'), 'funder'] = 'tag church'

#  'Swiss Tr', 'Swiss If', 'Swisland/mount Meru Flowers', 'Swisland/ Mount Meru Flowers', 'Swifti', 'Swidish', 'Sweeden', 'Swedish', 'Sweden', 'Swash',
df.loc[df['funder'].str.contains('swiss'), 'funder'] = 'switzerland'
df.loc[df['funder'].str.contains('swisland/mount'), 'funder'] = 'switzerland'
df.loc[df['funder'].str.contains('swisland/ '), 'funder'] = 'switzerland'
df.loc[df['funder'].str.contains('swifti'), 'funder'] = 'switzerland'
df.loc[df['funder'].str.contains('swidish'), 'funder'] = 'switzerland'
df.loc[df['funder'].str.contains('sweeden'), 'funder'] = 'switzerland'
df.loc[df['funder'].str.contains('sweden'), 'funder'] = 'switzerland'
df.loc[df['funder'].str.contains('swash'), 'funder'] = 'switzerland'

#  Rural Water Supply And Sanitat','Rural Water Supply And Sanita','Rural Water Supply', 'Rural Water Department', 'Rural Drinking Water Supply', 'Rural',
df.loc[df['funder'].str.contains('rural water supply'), 'funder'] = 'rural water supply'
df.loc[df['funder'].str.contains('rural water'), 'funder'] = 'rural water supply'
df.loc[df['funder'].str.contains('rural drinking'), 'funder'] = 'rural water supply'
df.loc[df['funder'].str.contains('rural'), 'funder'] = 'rural water supply'

#  'Rotte', 'Rotery C', 'Rotaty Club', 'Rotary I', 'Rotary Club Of Usa And Moshi', 'Rotary Club Of Chico And Moshi', 'Rotary Club Kitchener', 'Rotary Club Australia', 'Rotary Club', 'Rotary',
df.loc[df['funder'].str.contains('rotary'), 'funder'] = 'rotary club'
df.loc[df['funder'].str.contains('rotery'), 'funder'] = 'rotary club'
df.loc[df['funder'].str.contains('rotary'), 'funder'] = 'rotary club'
df.loc[df['funder'].str.contains('rotte'), 'funder'] = 'rotary club'

# 'Roman Church', 'Roman Cathoric-same', 'Roman Cathoric Same', 'Roman Cathoric Church', 'Roman Cathoric -kilomeni',
#'Roman Cathoric', 'Roman Catholic Rulenge Diocese', 'Roman Catholic', 'Roman Ca', 'Roman', 'Romam Catholic',
#  'Cathoric', 'Rcchurch/cefa',
#'Rc/mission', 'Rc/dwe', 'Rc Njoro', 'Rc Msufi', 'Rc Mofu', 'Rc Missionary', 'Rc Mission', 'Rc Missi', 'Rc Mi',
# 'Rc Church/centr','Rc Church', 'Rc Churc', 'Rc Ch', 'Rc Cathoric', 'Rc',
df.loc[df['funder'].str.contains('rc '), 'funder'] = 'roman catholic church'
df.loc[df['funder'].str.contains('rc'), 'funder'] = 'roman catholic church'
df.loc[df['funder'].str.contains('rc/mission'), 'funder'] = 'roman catholic church'
df.loc[df['funder'].str.contains('roman'), 'funder'] = 'roman catholic church'
df.loc[df['funder'].str.contains('cathoric'), 'funder'] = 'roman catholic church'
df.loc[df['funder'].str.contains('rcchurch/cefa'), 'funder'] = 'roman catholic church'
df.loc[df['funder'].str.contains('catholic'), 'funder'] = 'roman catholic church'
df.loc[df['funder'].str.contains('cathoric-same'), 'funder'] = 'roman catholic church'


#  'Qwiqwi', 'Qwickwin', 'Qwick Win', 'Qwekwin', 'Qwckwin', 'Quwkwin', 'Quik', 'Quickwins', 'Quickwi', 'Quicklw', 'Quick Wins Scheme', 
#'Quick Wins', 'Quick Wings','Quick Win/halmashauri', 'Quick Win Project /council' 'Quick Win Project', 'Quick Win', 'Quick',
df.loc[df['funder'].str.contains('qwiqwi'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('qwick'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('qwekwin'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('qwckwin'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quwkwin'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quik'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quickwins'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quickwi'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quicklw'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quick wins scheme'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quick win'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quick'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quick wings'), 'funder'] = 'quick win project'
df.loc[df['funder'].str.contains('quick win/halamashauri'), 'funder'] = 'quick win project'


#  'Pentekoste', 'Pentecostal Hagana Sweeden', 'Pentecostal Church', 'Pentecostal', 'Pentecosta Seela', 'Pentecosta Church','Pentecost',
df.loc[df['funder'].str.contains('pentekoste'), 'funder'] = 'pentecostal church'
df.loc[df['funder'].str.contains('pntecostal hagana sweeden'), 'funder'] = 'pentecostal church'
df.loc[df['funder'].str.contains('pentecostal'), 'funder'] = 'pentecostal church'
df.loc[df['funder'].str.contains('pentecosta'), 'funder'] = 'pentecostal church'
df.loc[df['funder'].str.contains('pentecost'), 'funder'] = 'pentecostal church'

# 'Private Person', 'Private Owned', 'Private Institutions', 'Private Individul', 'Private Individual', 'Private Co', 'Private', 'Priva',
df.loc[df['funder'].str.contains('private'), 'funder'] = 'private'
df.loc[df['funder'].str.contains('priva'), 'funder'] = 'private'
df.loc[df['funder'].str.contains('priv'), 'funder'] = 'private'

#  'Oxfarm Gb','Oxfarm','Oxfam Gb', 'Oxfam', 'Ox',
df.loc[df['funder'].str.contains('ox'), 'funder'] = 'oxfam'
df.loc[df['funder'].str.contains('oxfam'), 'funder'] = 'oxfam'
df.loc[df['funder'].str.contains('oxfarm'), 'funder'] = 'oxfam'
df.loc[df['funder'].str.contains('0x'), 'funder'] = 'oxfam'
df.loc[df['funder'].str.contains('0xfam'), 'funder'] = 'oxfam'

#  'Oikos E.Afrika', 'Oikos E.Africa/european Union', 'Oikos E.Africa/ European Union', 'Oikos E .Africa/european Union', 'Oikos',
df.loc[df['funder'].str.contains('oikos'), 'funder'] = 'oikos'

#  'Norad/rudep', 'Norad/japan', 'Norad/government', 'Norad/ Tassaf Ii', 'Norad/ Tassaf', 'Norad/ Kidep', 'Norad /government', 'Norad',
df.loc[df['funder'].str.contains('norad'), 'funder'] = 'norad'
df.loc[df['funder'].str.contains('norad/rudep'), 'funder'] = 'norad'
df.loc[df['funder'].str.contains('norad/japan'), 'funder'] = 'norad'
df.loc[df['funder'].str.contains('norad/government'), 'funder'] = 'norad'
df.loc[df['funder'].str.contains('norad/'), 'funder'] = 'norad'
df.loc[df['funder'].str.contains('norad/ '), 'funder'] = 'norad'
df.loc[df['funder'].str.contains('nora'), 'funder'] = 'norad'

#  'Netherlands', 'Netherland', 'Netherla', 'Nethe', 'Nethalan', 'Nerthlands', 'Natherland',
df.loc[df['funder'].str.contains('netherland'), 'funder'] = 'netherlands'
df.loc[df['funder'].str.contains('netherla'), 'funder'] = 'netherlands'
df.loc[df['funder'].str.contains('nethe'), 'funder'] = 'netherlands'
df.loc[df['funder'].str.contains('nethalan'), 'funder'] = 'netherlands'
df.loc[df['funder'].str.contains('nerthlands'), 'funder'] = 'netherlands'
df.loc[df['funder'].str.contains('natherland'), 'funder'] = 'netherlands'
df.loc[df['funder'].str.contains('nl'), 'funder'] = 'netherlands'

# 'Nazareth Church', 'Nazaleti', 'Nazalet Church',
df.loc[df['funder'].str.contains('nazalet'), 'funder'] = 'nazareth church'
df.loc[df['funder'].str.contains('nazaleti'), 'funder'] = 'nazareth church'
df.loc[df['funder'].str.contains('nazareth'), 'funder'] = 'nazareth church'
df.loc[df['funder'].str.contains('nazerene'), 'funder'] = 'nazareth church'

# 'National Rural And Hfa', 'National Rural (wb)', 'National Rural',
df.loc[df['funder'].str.contains('national rural'), 'funder'] = 'national rural'


In [244]:
# Action in A to Action Aid
df.loc[df['funder'].str.contains('action a'), 'funder'] = 'action aid'

# Missi, Missio, Mission, Missionary
df.loc[df['funder'].str.contains('missi'), 'funder'] = 'missionary'
df.loc[df['funder'].str.contains('mission'), 'funder'] = 'missionary'
df.loc[df['funder'].str.contains('miss'), 'funder'] = 'missionary'

# Mkinga Distric Cou and Coun need to be the same
df.loc[df['funder'].str.contains('mkinga'), 'funder'] = 'mkinga district council'

# Moradi and Morad
df.loc[df['funder'].str.contains('moradi'), 'funder'] = 'morad'

# Milinia, Miliniem Project, Millenium
df.loc[df['funder'].str.contains('milinia'), 'funder'] = 'millennium project'
df.loc[df['funder'].str.contains('miliniem'), 'funder'] = 'millennium project'
df.loc[df['funder'].str.contains('millenium'), 'funder'] = 'millennium project'

# Member of Parliment and Member of Parlement 
df.loc[df['funder'].str.contains('member of parlement'), 'funder'] = 'member of parliament'
df.loc[df['funder'].str.contains('member of parlament'), 'funder'] = 'member of parliament'
df.loc[df['funder'].str.contains('member of parliment'), 'funder'] = 'member of parliament'

# 'Maro' and 'Maro Kyariga',
df.loc[df['funder'].str.contains('maro'), 'funder'] = 'maro kyariga'

# 'Maji Mugumu','Maju Mugumu'
df.loc[df['funder'].str.contains('maji'), 'funder'] = 'maju mugumu'
df.loc[df['funder'].str.contains('mugumu'), 'funder'] = 'maju mugumu'

#  'Magadini Makiwaru Water' and 'Magadini-makiwaru Water',
df.loc[df['funder'].str.contains('magadini-makiwaru'), 'funder'] = 'magadini makiwaru water'

# 'Makondakonde Water Population','Makonde','Makonde Water Population','Makonde Water Supply'
df.loc[df['funder'].str.contains('makonde'), 'funder'] = 'makonde water supply'
df.loc[df['funder'].str.contains('makondakonde'), 'funder'] = 'magadini makiwaru water'

# 'Luthe', 'Lutheran','Lutheran Church',
df.loc[df['funder'].str.contains('lutheran'), 'funder'] = 'lutheran church'
df.loc[df['funder'].str.contains('luthe'), 'funder'] = 'lutheran church'

#  'Lgcbg','Lgcd','Lgcdg','Lgcgd','Lgdbg','Lgdcg','Lc','Lcdg', 'Lcgd', 'Ldcdd','Ldcgd',
df.loc[df['funder'].str.contains('lgcbg'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('lgcd'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('lgcdg'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('lgcgd'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('lgdbg'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('lgdcg'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('lc'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('lcdg'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('lcgd'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('ldcdd'), 'funder'] = 'l'
df.loc[df['funder'].str.contains('ldcgd'), 'funder'] = 'l'


# 'Lotary Club','Lotary International','Lottery','Lottery Club',
df.loc[df['funder'].str.contains('lottery'), 'funder'] = 'lottery club'
df.loc[df['funder'].str.contains('lotary'), 'funder'] = 'lottery club'
df.loc[df['funder'].str.contains('lotery'), 'funder'] = 'lottery club'

#  'Lawate Fuka Water Suppl','Lawatefuka Water Supply',
df.loc[df['funder'].str.contains('lawate'), 'funder'] = 'lawate fuka water supply'
df.loc[df['funder'].str.contains('lawatefuka'), 'funder'] = 'lawate fuka water supply'
df.loc[df['funder'].str.contains('lawate-fuka'), 'funder'] = 'lawate fuka water supply'
df.loc[df['funder'].str.contains('lawate fuka'), 'funder'] = 'lawate fuka water supply'


# 'Lion Clu', 'Lions','Lions C','Lions Club', 'Lions Club Kilimanjaro',
df.loc[df['funder'].str.contains('lion'), 'funder'] = 'lions club'
df.loc[df['funder'].str.contains('lions'), 'funder'] = 'lions club'

#  'Lake Tanganyika','Lake Tanganyika Basin','Lake Tanganyika Prodap',
df.loc[df['funder'].str.contains('lake tanganyika'), 'funder'] = 'lake anganyika basin'

#  'Koico', should be Koica
df.loc[df['funder'].str.contains('koico'), 'funder'] = 'koica'

# 'Kanis', 'Kanisa',
df.loc[df['funder'].str.contains('kanis'), 'funder'] = 'kanisa'

#  'Livin','Living Water International'
df.loc[df['funder'].str.contains('livin'), 'funder'] = 'living water international'
df.loc[df['funder'].str.contains('living water'), 'funder'] = 'living water international'
df.loc[df['funder'].str.contains('livin water'), 'funder'] = 'living water international'

#  'Jeshi La Wokovu', 'Jeshi La Wokovu [cida]','Jeshi Lawokovu',
df.loc[df['funder'].str.contains('jeshi la'), 'funder'] = 'jeshi la wokovu'
df.loc[df['funder'].str.contains('lawokovu'), 'funder'] = 'jeshi la wokovu'

# Japan everything with Japan in it  'Japan', 'Japan  Food Aid Counter Part','Japan Aid', 'Japan Embassy','Japan Food','Japan Food Aid', 'Embasy Of Japan In Tanzania','Japan Government',
df.loc[df['funder'].str.contains('japan'), 'funder'] = 'japan'

#  'Italian', 'Italy','Italy Government',
df.loc[df['funder'].str.contains('italy'), 'funder'] = 'italy'

#  'Islam','Islamic', 'Islamic Agency Tanzania', 'Islamic Community', 'Islamic Found', 'Islamic Society',
df.loc[df['funder'].str.contains('islam'), 'funder'] = 'islamic society'
df.loc[df['funder'].str.contains('islamic'), 'funder'] = 'islamic society'

# Contains 'Isf'
df.loc[df['funder'].str.contains('isf'), 'funder'] = 'isf'

#  'Irish Ai','Irish Government',
df.loc[df['funder'].str.contains('irish'), 'funder'] = 'irish'

# 'Moravian', 'Moroil', 'Morovian', 'Morovian Church', 'Morrovian',
df.loc[df['funder'].str.contains('morovian'), 'funder'] = 'morovian church'
df.loc[df['funder'].str.contains('morrovian'), 'funder'] = 'morovian church'
df.loc[df['funder'].str.contains('moravian'), 'funder'] = 'morovian church'
df.loc[df['funder'].str.contains('moroil'), 'funder'] = 'morovian church'

In [245]:
list(df['funder'].sort_values(ascending = True).unique())

['0',
 'aar',
 'abas ka',
 'abasia',
 'abd',
 'abdala',
 'abddwe',
 'abdul',
 'abood',
 'acra',
 'act',
 'act mara',
 'action in a',
 'adap',
 'adb',
 'adf',
 'adp',
 'adra',
 'af',
 'afdp',
 'africa',
 'afriican reli',
 'ahmadia',
 'aic',
 'alia',
 'amref',
 'angrikana',
 'anjuman e seifee',
 'apm',
 'area',
 'artisan',
 'asb',
 'asdp',
 'asgerali n bharwan',
 'auwasa',
 'awf',
 'b.a.p',
 'ba as',
 'bahresa',
 'bakari chimkube',
 'bakwata',
 'ballo',
 'balo',
 'balyehe',
 'banca reale',
 'bank',
 'bao',
 'baric',
 'bathlomew vicent',
 'belgian government',
 'belgij',
 'bened',
 'benguka',
 'bffs',
 'bfwd',
 'bgm',
 'bgss',
 'bhws',
 'bilila',
 'biore',
 'bkhws',
 'boazi',
 'boazi /o',
 'bobby',
 'bokera w',
 'boma saving',
 'bonite bottles ltd',
 'br',
 'bra',
 'brad',
 'brdp',
 'bread of the world',
 'bridge north',
 'british tanza',
 'brown',
 'bruder',
 'bsf',
 'bukumbi',
 'bumabu',
 'c',
 'cafod',
 'caltas',
 'camartec',
 'camavita',
 'canada',
 'care international',
 'caritas',
 

In [192]:
df.to_pickle('clean_df.pkl')