### Imports

In [5]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction import FeatureHasher
import category_encoders as ce
pd.set_option('display.max_columns', None)

### Loading and viewing the data

In [6]:
train_raw = pd.read_csv('./Data/training_set_values.csv')
test_raw = pd.read_csv('./Data/test_set_values.csv')

train_raw['train'] = 1
test_raw['train'] = 0
data = pd.concat([train_raw, test_raw])

In [None]:
# Overview

#data.info()

### Global variables

In [7]:
data_version = '0.3'

int_var = ['population','gps_height','num_private','construction_year']
float_var = ['amount_tsh','longitude']

features_to_drop = ['extraction_type_group','extraction_type_class','payment','quality_group','source_class','source_type','waterpoint_type_group','management_group','quantity_group','date_recorded','wpt_name','num_private','recorded_by']

null_features = ['longitude','latitude','gps_height','population','construction_year','amount_tsh']
#no calculations for num_private since they are dropped later (too many missing values)

divisions = ['region', 'ward']
divisions_total = ['ward', 'region', 'overall']

# These will be scaled
num_features = ['latitude','longitude','operation_years','amount_tsh', 'gps_height', 'population']

# These will be factorized
cat_features = 'funder installer subvillage public_meeting scheme_management scheme_name permit basin region lga ward extraction_type management payment_type water_quality quantity source waterpoint_type'.split()

### Data preparation

In [8]:
# Identify missing values in numerical data

for var in int_var:
    print('{}:'.format(var))
    display(data[var].min())
    display(len(data[data[var] == 0]))

for var in float_var:
    print('{}:'.format(var))
    display(data[var].min())
    display(len(data[data[var] == 0.0]))

print('latitude:')
display(data['latitude'].min())
display(len(data[(data['latitude'] > -0.001) & (data['latitude'] < 0.001)]))

population:


0

26834

gps_height:


-90

25649

num_private:


0

73299

construction_year:


0

25969

amount_tsh:


0.0

52049

longitude:


0.0

2269

latitude:


-11.64944018

2269

In [9]:
# Replace zeros by NaN

for var in int_var:
    data[var].replace(0, np.nan, inplace=True)
    
for var in float_var:
    data[var].replace(0.0, np.nan, inplace=True)

data['latitude'].where((data['latitude'] < -0.001) | (data['latitude'] > 0.001), other= np.nan, inplace=True,axis=0)

train = data[data['train'] == 1]
test = data[data['train'] == 0]

### Imputation of missing values in numerical features

In [None]:
# Duplicate critical columnns for imputation based on normal distribution and random choice

'''for null_feature in null_features:
    data['_'.join([null_feature, 'imp_mean-median'])] = data[null_feature]
    data['_'.join([null_feature, 'imp_normal'])] = data[null_feature]
    data['_'.join([null_feature, 'imp_random_choice'])] = data[null_feature]'''

In [None]:
'''data.head()'''

#### Imputation of numerical features by normal distribution

In [None]:
# Add columns for mean and standard deviation of critical features based on 'region', 'ward' and 'overall'

'''for null_feature in null_features:
    data['_'.join([null_feature, 'mean', 'overall'])] = train[null_feature].mean()
    data['_'.join([null_feature, 'std', 'overall'])] = train[null_feature].std()
    for division in divisions:
        new_feature_name_mean = '_'.join([null_feature, 'mean', division])
        new_feature_name_std = '_'.join([null_feature, 'std', division])
        
        calcs_mean = train.groupby(division)[null_feature].mean()
        calcs_std = train.groupby(division)[null_feature].std()
        for value in train[division].unique() :
            data.loc[data[division]==value, new_feature_name_mean] = calcs_mean[value]
            data.loc[data[division]==value, new_feature_name_std] = calcs_std[value]'''

In [None]:
# 1st step: Impute missing values with random numbers generated by normal distribution based on mean, std by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with random numbers generated by normal distribution based on mean, std by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with random numbers generated by normal distribution based on mean, std by 'overall'

'''for null_feature in null_features:
    for division in divisions_total:
        data['_'.join([null_feature,'imp_normal'])] = data.apply(lambda row: np.random.normal(loc=row['_'.join([null_feature,'mean',division])], scale=row['_'.join([null_feature,'std',division])]) if math.isnan(row['_'.join([null_feature,'imp_normal'])]) else row['_'.join([null_feature,'imp_normal'])], axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, data['_'.join([null_feature,'imp_normal'])].isnull().sum()))'''

In [None]:
#data.head()

#### Imputation of numerical features by random choice

In [None]:
# Add columns with list of values in corresponding group of 'region' and 'ward', respectively

'''for null_feature in null_features:
    overall_list = list(train[null_feature])
    overall_list = [x for x in overall_list if not math.isnan(x)]
    data['_'.join([null_feature, 'list', 'overall'])] = data.apply(lambda row: overall_list, axis=1)
    display(null_feature, 'overall list done')
    for division in divisions:
        feature_name = '_'.join([null_feature, 'list', division])
        lists = train.groupby(division)[null_feature].apply(list)
        data[feature_name] = data.apply(lambda row: list() if row[division] not in train[division].unique() else lists[row[division]], axis=1)
        data[feature_name] = data[feature_name].apply(lambda lst: [x for x in lst if not math.isnan(x)])
        data[feature_name] = data[feature_name].apply(lambda x: np.nan if not x else x)
        display(null_feature, division)'''

In [None]:
# 1st step: Impute missing values with empirical distribution grouped by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with empirical distribution grouped by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with empirical distribution grouped by 'overall'

'''for null_feature in null_features:
    for division in divisions_total:        
        #data['_'.join([null_feature,'imp_random_choice'])] = data.apply(lambda row: np.random.choice(a=row['_'.join([null_feature,'list',division])]) if math.isnan(row['_'.join([null_feature,'imp_random_choice'])]) else row['_'.join([null_feature,'imp_random_choice'])], axis=1)
        data['_'.join([null_feature,'imp_random_choice'])] = data.apply(lambda row: row['_'.join([null_feature,'imp_random_choice'])] if not np.isnan(row['_'.join([null_feature,'imp_random_choice'])]).any() else (np.random.choice(a=row['_'.join([null_feature,'list',division])]) if not np.isnan(row['_'.join([null_feature,'list',division])]).any() else np.nan), axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, data['_'.join([null_feature,'imp_random_choice'])].isnull().sum()))
 '''       

#### Imputation of numerical features by mean/median

In [None]:
# Add columns for median of critical integer features based on 'region', 'ward', 'overall'

'''float_var.append('latitude')

for var in int_var:
    data['_'.join([var, 'median', 'overall'])] = train[var].median()
    for division in divisions:
        new_feature_name_median = '_'.join([var, 'median', division])
        calcs_median = train.groupby(division)[var].median()
        for value in train[division].unique() :
            data.loc[data[division]==value, new_feature_name_median] = calcs_median[value]'''

In [None]:
# 1st step: Impute missing values with mean and median by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with mean and median by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with overall mean and median

'''for var in float_var:
    for division in divisions_total:
        data['_'.join([var,'imp_mean-median'])] = data.apply(lambda row: row['_'.join([var,'mean',division])] if math.isnan(row['_'.join([var,'imp_mean-median'])]) else row['_'.join([var,'imp_mean-median'])], axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(var, division, data['_'.join([var,'imp_mean-median'])].isnull().sum()))

for var in int_var:
    for division in divisions_total:
        data['_'.join([var,'imp_mean-median'])] = data.apply(lambda row: row['_'.join([var,'median',division])] if math.isnan(row['_'.join([var,'imp_mean-median'])]) else row['_'.join([var,'imp_mean-median'])], axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(var, division, data['_'.join([var,'imp_mean-median'])].isnull().sum()))
 '''       

In [None]:
#data.isnull().sum()

#### Load imputed data from previous data files and add mean/median imputation as additional columns
Can be used if only categorical features will be changed

In [10]:
imputed_train = pd.read_csv('./Data/train_cleaned_v0.1.csv')
imputed_test = pd.read_csv('./Data/test_cleaned_v0.1.csv')

imputed_data = pd.concat([imputed_train, imputed_test])
#imputed_data.head()

In [11]:
keep_columns = 'id longitude latitude gps_height amount_tsh population construction_year'.split()
for col in 'longitude latitude gps_height amount_tsh population construction_year'.split():
    keep_columns.append('_'.join([col, 'imp_normal']))
    keep_columns.append('_'.join([col, 'imp_random_choice']))

In [12]:
imputed_data = imputed_data[keep_columns]
#imputed_data.head()

In [13]:
columns = imputed_data.columns
new_columns = list()
for col in columns:
    if 'imp' in col or col == 'id':
        new_columns.append(col)
    else:
        new_columns.append('_'.join([col, 'imp_mean-median']))

In [14]:
imputed_data.columns = new_columns
imputed_data.head()

Unnamed: 0,id,longitude_imp_mean-median,latitude_imp_mean-median,gps_height_imp_mean-median,amount_tsh_imp_mean-median,population_imp_mean-median,construction_year_imp_mean-median,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice
0,69572,34.938093,-9.856322,1390.0,6000.0,109.0,1999.0,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,6000.0,6000.0,109.0,109.0,1999.0,1999.0
1,8776,34.698766,-2.147466,1399.0,542.857143,280.0,2010.0,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,74.349311,1000.0,280.0,280.0,2010.0,2010.0
2,34310,37.460664,-3.821329,686.0,25.0,250.0,2009.0,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,25.0,25.0,250.0,250.0,2009.0,2009.0
3,67743,38.486161,-11.155298,263.0,525.0,58.0,1986.0,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,-409.294439,1000.0,58.0,58.0,1986.0,1986.0
4,19728,31.130847,-1.825359,1167.0,1062.351942,150.0,2000.0,31.130847,31.130847,-1.825359,-1.825359,1216.846742,1161.0,1500.665788,300.0,-333.803882,150.0,1996.732218,2008.0


In [15]:
data = data.merge(imputed_data, on='id')

In [16]:
data.isnull().sum()

id                                         0
amount_tsh                             52049
date_recorded                              0
funder                                  4504
gps_height                             25649
installer                               4532
longitude                               2269
latitude                                2269
wpt_name                                   0
num_private                            73299
basin                                      0
subvillage                               470
region                                     0
region_code                                0
district_code                              0
lga                                        0
ward                                       0
population                             26834
public_meeting                          4155
recorded_by                                0
scheme_management                       4846
scheme_name                            35258
permit    

### Feature generation

In [17]:
# Create new feature that gives information about operational time

imputation_methods = ['mean-median', 'normal', 'random_choice']
data['date_recorded'] = pd.to_datetime(data['date_recorded'])

for method in imputation_methods:
    data['_'.join(['operation_years_imp', method])] = data.date_recorded.dt.year - data['_'.join(['construction_year_imp', method])]
    data['_'.join(['operation_years_imp', method])] = data['_'.join(['operation_years_imp', method])].astype(int)

In [18]:
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_mean-median,latitude_imp_mean-median,gps_height_imp_mean-median,amount_tsh_imp_mean-median,population_imp_mean-median,construction_year_imp_mean-median,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,operation_years_imp_mean-median,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,6000.0,2011-03-14,Roman,1390.0,Roman,34.938093,-9.856322,none,,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109.0,True,GeoData Consultants Ltd,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1,34.938093,-9.856322,1390.0,6000.0,109.0,1999.0,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,6000.0,6000.0,109.0,109.0,1999.0,1999.0,12,12,12
1,8776,,2013-03-06,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Zahanati,,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280.0,,GeoData Consultants Ltd,Other,,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,34.698766,-2.147466,1399.0,542.857143,280.0,2010.0,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,74.349311,1000.0,280.0,280.0,2010.0,2010.0,3,3,3
2,34310,25.0,2013-02-25,Lottery Club,686.0,World vision,37.460664,-3.821329,Kwa Mahundi,,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250.0,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1,37.460664,-3.821329,686.0,25.0,250.0,2009.0,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,25.0,25.0,250.0,250.0,2009.0,2009.0,4,4,4
3,67743,,2013-01-28,Unicef,263.0,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58.0,True,GeoData Consultants Ltd,VWC,,True,1986.0,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1,38.486161,-11.155298,263.0,525.0,58.0,1986.0,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,-409.294439,1000.0,58.0,58.0,1986.0,1986.0,27,27,27
4,19728,,2011-07-13,Action In A,,Artisan,31.130847,-1.825359,Shuleni,,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,,True,GeoData Consultants Ltd,,,True,,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,31.130847,-1.825359,1167.0,1062.351942,150.0,2000.0,31.130847,31.130847,-1.825359,-1.825359,1216.846742,1161.0,1500.665788,300.0,-333.803882,150.0,1996.732218,2008.0,11,14,3


### Scaling of numerical features

In [None]:
# Scale numerical features
'''
scaler = MinMaxScaler()

for s in split:
    s[num_features] = scaler.fit_transform(s[num_features])
'''

### Drop irrelevant features

In [None]:
# Drop columns used for imputation and generation of random numbers
'''
drop_columns = list()
measures = 'mean std list'.split()
for null_feature in null_features:
    for division in divisions_total:
        for measure in measures:
            drop_columns.append('_'.join([null_feature, measure, division]))
            
for var in int_var:
    for division in divisions_total:
        for measure in measures:
            drop_columns.append('_'.join([var, 'median', division]))
            
data.drop(columns=drop_columns, inplace=True)'''

In [19]:
# Drop redundant features and features that do not seem to have an impact

data.drop(columns=features_to_drop,axis=1,inplace=True)
data.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,train,longitude_imp_mean-median,latitude_imp_mean-median,gps_height_imp_mean-median,amount_tsh_imp_mean-median,population_imp_mean-median,construction_year_imp_mean-median,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,operation_years_imp_mean-median,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,6000.0,Roman,1390.0,Roman,34.938093,-9.856322,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109.0,True,VWC,Roman,False,1999.0,gravity,vwc,annually,soft,enough,spring,communal standpipe,1,34.938093,-9.856322,1390.0,6000.0,109.0,1999.0,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,6000.0,6000.0,109.0,109.0,1999.0,1999.0,12,12,12
1,8776,,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280.0,,Other,,True,2010.0,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,1,34.698766,-2.147466,1399.0,542.857143,280.0,2010.0,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,74.349311,1000.0,280.0,280.0,2010.0,2010.0,3,3,3
2,34310,25.0,Lottery Club,686.0,World vision,37.460664,-3.821329,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250.0,True,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,vwc,per bucket,soft,enough,dam,communal standpipe multiple,1,37.460664,-3.821329,686.0,25.0,250.0,2009.0,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,25.0,25.0,250.0,250.0,2009.0,2009.0,4,4,4
3,67743,,Unicef,263.0,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58.0,True,VWC,,True,1986.0,submersible,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,1,38.486161,-11.155298,263.0,525.0,58.0,1986.0,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,-409.294439,1000.0,58.0,58.0,1986.0,1986.0,27,27,27
4,19728,,Action In A,,Artisan,31.130847,-1.825359,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,,True,,,True,,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,1,31.130847,-1.825359,1167.0,1062.351942,150.0,2000.0,31.130847,31.130847,-1.825359,-1.825359,1216.846742,1161.0,1500.665788,300.0,-333.803882,150.0,1996.732218,2008.0,11,14,3


### Preparation of categorical features

No grouping and factorizing of categorical features in data version 0.1; missing values replaced by 'Missing' only

#### Replace unique values that only appear in the test data set 
(either by 'Others' or by the most frequent value in the corresponding column of the train data set)

Used for data version 0.3

In [20]:
for cat in cat_features:
    display('{}, {}: {}'.format(cat, 'train', train[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'test', test[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'data', data[cat].nunique()))

'funder, train: 1897'

'funder, test: 980'

'funder, data: 2140'

'installer, train: 2145'

'installer, test: 1091'

'installer, data: 2410'

'subvillage, train: 19287'

'subvillage, test: 8443'

'subvillage, data: 21425'

'public_meeting, train: 2'

'public_meeting, test: 2'

'public_meeting, data: 2'

'scheme_management, train: 12'

'scheme_management, test: 11'

'scheme_management, data: 12'

'scheme_name, train: 2696'

'scheme_name, test: 1789'

'scheme_name, data: 2868'

'permit, train: 2'

'permit, test: 2'

'permit, data: 2'

'basin, train: 9'

'basin, test: 9'

'basin, data: 9'

'region, train: 21'

'region, test: 21'

'region, data: 21'

'lga, train: 125'

'lga, test: 125'

'lga, data: 125'

'ward, train: 2092'

'ward, test: 1959'

'ward, data: 2098'

'extraction_type, train: 18'

'extraction_type, test: 17'

'extraction_type, data: 18'

'management, train: 12'

'management, test: 12'

'management, data: 12'

'payment_type, train: 7'

'payment_type, test: 7'

'payment_type, data: 7'

'water_quality, train: 8'

'water_quality, test: 8'

'water_quality, data: 8'

'quantity, train: 5'

'quantity, test: 5'

'quantity, data: 5'

'source, train: 10'

'source, test: 10'

'source, data: 10'

'waterpoint_type, train: 7'

'waterpoint_type, test: 7'

'waterpoint_type, data: 7'

In [21]:
# Get list of features that contain values in the test which don't appear in the training data
unique_test_features = list()
for cat in cat_features:
    if train[cat].nunique() < data[cat].nunique():
        unique_test_features.append(cat)
unique_test_features

['funder', 'installer', 'subvillage', 'scheme_name', 'ward']

In [22]:
# Set unique values that appear in the test dataset only to 'Others' if 'Others' appears in the training dataset, else set it to most frequent value in the corresponding column
for feature in unique_test_features:
    train_values = train[feature].unique().tolist()
    test_values = test[feature].unique().tolist()
    merged_values = pd.DataFrame(train_values).merge(pd.DataFrame(test_values), how='right', indicator=True)
    unique_test_values = list(merged_values[0].loc[merged_values['_merge'] == 'right_only'])
    replace_value = 'Others' if 'Others' in train_values else train[feature].mode()
    for value in unique_test_values:
        data.loc[data[feature] == value, feature] = replace_value  

In [23]:
for cat in cat_features:
    display('{}, {}: {}'.format(cat, 'NaNs', data[cat].isnull().sum()))
    display('{}, {}: {}'.format(cat, 'train', train[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'test', test[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'overall', data[cat].nunique()))

'funder, NaNs: 4504'

'funder, train: 1897'

'funder, test: 980'

'funder, overall: 1897'

'installer, NaNs: 4532'

'installer, train: 2145'

'installer, test: 1091'

'installer, overall: 2145'

'subvillage, NaNs: 2859'

'subvillage, train: 19287'

'subvillage, test: 8443'

'subvillage, overall: 19287'

'public_meeting, NaNs: 4155'

'public_meeting, train: 2'

'public_meeting, test: 2'

'public_meeting, overall: 2'

'scheme_management, NaNs: 4846'

'scheme_management, train: 12'

'scheme_management, test: 11'

'scheme_management, overall: 12'

'scheme_name, NaNs: 35445'

'scheme_name, train: 2696'

'scheme_name, test: 1789'

'scheme_name, overall: 2696'

'permit, NaNs: 3793'

'permit, train: 2'

'permit, test: 2'

'permit, overall: 2'

'basin, NaNs: 0'

'basin, train: 9'

'basin, test: 9'

'basin, overall: 9'

'region, NaNs: 0'

'region, train: 21'

'region, test: 21'

'region, overall: 21'

'lga, NaNs: 0'

'lga, train: 125'

'lga, test: 125'

'lga, overall: 125'

'ward, NaNs: 10'

'ward, train: 2092'

'ward, test: 1959'

'ward, overall: 2092'

'extraction_type, NaNs: 0'

'extraction_type, train: 18'

'extraction_type, test: 17'

'extraction_type, overall: 18'

'management, NaNs: 0'

'management, train: 12'

'management, test: 12'

'management, overall: 12'

'payment_type, NaNs: 0'

'payment_type, train: 7'

'payment_type, test: 7'

'payment_type, overall: 7'

'water_quality, NaNs: 0'

'water_quality, train: 8'

'water_quality, test: 8'

'water_quality, overall: 8'

'quantity, NaNs: 0'

'quantity, train: 5'

'quantity, test: 5'

'quantity, overall: 5'

'source, NaNs: 0'

'source, train: 10'

'source, test: 10'

'source, overall: 10'

'waterpoint_type, NaNs: 0'

'waterpoint_type, train: 7'

'waterpoint_type, test: 7'

'waterpoint_type, overall: 7'

#### Group feature categories other than top categories into 'Others'

In [None]:
# Reduce dimension of categorical variables with high dimensionality 

'''dim_red_features = 'funder installer scheme_name lga ward'.split()
for feature in dim_red_features:
    train = train.assign(count = train.groupby(feature)[feature].transform('count')).sort_values(by = ['count',feature], ascending = [False,True])
    top_values = train.drop_duplicates('count')
    top_values = list(top_values.nlargest(10, 'count')[feature])
    data[feature] = data[feature].apply(lambda x: x if (x in top_values) | (str(x) == 'nan') else 'Others')'''

In [None]:
'''for cat in cat_features:
    display('{}, {}: {}'.format(cat, 'NaNs', data[cat].isnull().sum()))
    display('{}, {}: {}'.format(cat, 'train', train[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'test', test[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'overall', data[cat].nunique()))'''

#### Replace missing values by 'Missing'
Used in data version 0.3

In [25]:
# Fill missing values in categorical features by 'Missing'

for feature in cat_features:
    data[feature].replace(np.nan, 'Missing', inplace=True)

In [None]:
'''data['funder'].value_counts()
data['installer'].value_counts()
data['scheme_name'].value_counts()'''

In [26]:
data.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,train,longitude_imp_mean-median,latitude_imp_mean-median,gps_height_imp_mean-median,amount_tsh_imp_mean-median,population_imp_mean-median,construction_year_imp_mean-median,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,operation_years_imp_mean-median,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,6000.0,Roman,1390.0,Roman,34.938093,-9.856322,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109.0,True,VWC,Roman,False,1999.0,gravity,vwc,annually,soft,enough,spring,communal standpipe,1,34.938093,-9.856322,1390.0,6000.0,109.0,1999.0,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,6000.0,6000.0,109.0,109.0,1999.0,1999.0,12,12,12
1,8776,,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280.0,Missing,Other,Missing,True,2010.0,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,1,34.698766,-2.147466,1399.0,542.857143,280.0,2010.0,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,74.349311,1000.0,280.0,280.0,2010.0,2010.0,3,3,3
2,34310,25.0,Lottery Club,686.0,World vision,37.460664,-3.821329,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250.0,True,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,vwc,per bucket,soft,enough,dam,communal standpipe multiple,1,37.460664,-3.821329,686.0,25.0,250.0,2009.0,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,25.0,25.0,250.0,250.0,2009.0,2009.0,4,4,4
3,67743,,Unicef,263.0,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58.0,True,VWC,Missing,True,1986.0,submersible,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,1,38.486161,-11.155298,263.0,525.0,58.0,1986.0,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,-409.294439,1000.0,58.0,58.0,1986.0,1986.0,27,27,27
4,19728,,Action In A,,Artisan,31.130847,-1.825359,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,,True,Missing,Missing,True,,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,1,31.130847,-1.825359,1167.0,1062.351942,150.0,2000.0,31.130847,31.130847,-1.825359,-1.825359,1216.846742,1161.0,1500.665788,300.0,-333.803882,150.0,1996.732218,2008.0,11,14,3


#### Convert categorical features into numerical features by adding a column with their probability for each target class

In [None]:
'''train_labels = pd.read_csv('./Data/training_set_labels.csv')
train = train.merge(train_labels, on="id")
train.head()'''

In [None]:
'''for feature in cat_features:
    train['count'] = train.groupby(feature)[feature].transform('count')
train.head()'''

In [None]:
'''def get_percentage(groups, row, status_group, feature):
    try:
        sg_count = groups['count'].loc[(groups[feature] == row[feature]) & (groups['status_group'] == status_group)].item()
        total_count = train['count'].loc[train['id'] == row['id']].item()
        return sg_count / total_count
    except ValueError:
        return 0'''

In [None]:
'''status_groups = 'functional,non functional,functional needs repair'.split(',')
for feature in cat_features:
    groups = pd.DataFrame({'count': train.groupby([feature, 'status_group']).size()}).reset_index()
    for status_group in status_groups:
        data['_'.join(['pct', feature, status_group])] = data.apply(lambda row: get_percentage(groups=groups, row=row, status_group=status_group, feature=feature), axis=1)
    display(feature + ' done')'''
        

In [None]:
#data.head()


#### Factorize categorical features

In [None]:
# Factorize features for evaluations

'''data['funder'] = pd.factorize(data['funder'])[0]
data['installer'] = pd.factorize(data['installer'])[0]
data['basin'] = pd.factorize(data['basin'])[0]
data['subvillage'] = pd.factorize(data['subvillage'])[0]
data['region'] = pd.factorize(data['region'])[0]
data['lga'] = pd.factorize(data['lga'])[0]
data['ward'] = pd.factorize(data['ward'])[0]
data['scheme_management'] = pd.factorize(data['scheme_management'])[0]
data['scheme_name'] = pd.factorize(data['scheme_name'])[0]
data['extraction_type'] = pd.factorize(data['extraction_type'])[0]
data['management'] = pd.factorize(data['management'])[0]
data['payment_type'] = pd.factorize(data['payment_type'])[0]
data['water_quality'] = pd.factorize(data['water_quality'])[0]
data['quantity'] = pd.factorize(data['quantity'])[0]
data['waterpoint_type'] = pd.factorize(data['waterpoint_type'])[0]
data['permit'] = pd.factorize(data['permit'])[0]
data['source'] = pd.factorize(data['source'])[0]'''

#### OneHotEncoding

In [None]:
# Remove 'subvillage' from cat_features its number of unique values leads to too many dummies
#cat_features.remove('subvillage')
# Get dummies for categorical features and add them to dataframe
#data = pd.concat([data, pd.get_dummies(data[cat_features], dummy_na=True)], axis=1)

In [None]:
#data.shape

In [None]:
#data.head()

#### Binary Encoding
Used in data version 0.3

In [28]:
ce_bin = ce.BinaryEncoder(cols=cat_features)
data = ce_bin.fit_transform(data)

In [30]:
data.shape

(74250, 160)

In [31]:
data.head()

Unnamed: 0,id,amount_tsh,funder_0,funder_1,funder_2,funder_3,funder_4,funder_5,funder_6,funder_7,funder_8,funder_9,funder_10,funder_11,gps_height,installer_0,installer_1,installer_2,installer_3,installer_4,installer_5,installer_6,installer_7,installer_8,installer_9,installer_10,installer_11,installer_12,longitude,latitude,basin_0,basin_1,basin_2,basin_3,basin_4,subvillage_0,subvillage_1,subvillage_2,subvillage_3,subvillage_4,subvillage_5,subvillage_6,subvillage_7,subvillage_8,subvillage_9,subvillage_10,subvillage_11,subvillage_12,subvillage_13,subvillage_14,subvillage_15,region_0,region_1,region_2,region_3,region_4,region_5,region_code,district_code,lga_0,lga_1,lga_2,lga_3,lga_4,lga_5,lga_6,lga_7,ward_0,ward_1,ward_2,ward_3,ward_4,ward_5,ward_6,ward_7,ward_8,ward_9,ward_10,ward_11,ward_12,population,public_meeting_0,public_meeting_1,public_meeting_2,scheme_management_0,scheme_management_1,scheme_management_2,scheme_management_3,scheme_management_4,scheme_name_0,scheme_name_1,scheme_name_2,scheme_name_3,scheme_name_4,scheme_name_5,scheme_name_6,scheme_name_7,scheme_name_8,scheme_name_9,scheme_name_10,scheme_name_11,scheme_name_12,permit_0,permit_1,permit_2,construction_year,extraction_type_0,extraction_type_1,extraction_type_2,extraction_type_3,extraction_type_4,extraction_type_5,management_0,management_1,management_2,management_3,management_4,payment_type_0,payment_type_1,payment_type_2,payment_type_3,water_quality_0,water_quality_1,water_quality_2,water_quality_3,quantity_0,quantity_1,quantity_2,quantity_3,source_0,source_1,source_2,source_3,source_4,waterpoint_type_0,waterpoint_type_1,waterpoint_type_2,waterpoint_type_3,train,longitude_imp_mean-median,latitude_imp_mean-median,gps_height_imp_mean-median,amount_tsh_imp_mean-median,population_imp_mean-median,construction_year_imp_mean-median,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,operation_years_imp_mean-median,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,6000.0,0,0,0,0,0,0,0,0,0,0,0,1,1390.0,0,0,0,0,0,0,0,0,0,0,0,0,1,34.938093,-9.856322,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,11,5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,109.0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1999.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,1,34.938093,-9.856322,1390.0,6000.0,109.0,1999.0,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,6000.0,6000.0,109.0,109.0,1999.0,1999.0,12,12,12
1,8776,,0,0,0,0,0,0,0,0,0,0,1,0,1399.0,0,0,0,0,0,0,0,0,0,0,0,1,0,34.698766,-2.147466,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,20,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,280.0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2010.0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,34.698766,-2.147466,1399.0,542.857143,280.0,2010.0,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,74.349311,1000.0,280.0,280.0,2010.0,2010.0,3,3,3
2,34310,25.0,0,0,0,0,0,0,0,0,0,0,1,1,686.0,0,0,0,0,0,0,0,0,0,0,0,1,1,37.460664,-3.821329,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,21,4,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,250.0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,2009.0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,1,37.460664,-3.821329,686.0,25.0,250.0,2009.0,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,25.0,25.0,250.0,250.0,2009.0,2009.0,4,4,4
3,67743,,0,0,0,0,0,0,0,0,0,1,0,0,263.0,0,0,0,0,0,0,0,0,0,0,1,0,0,38.486161,-11.155298,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,90,63,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,58.0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1986.0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,1,38.486161,-11.155298,263.0,525.0,58.0,1986.0,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,-409.294439,1000.0,58.0,58.0,1986.0,1986.0,27,27,27
4,19728,,0,0,0,0,0,0,0,0,0,1,0,1,,0,0,0,0,0,0,0,0,0,0,1,0,1,31.130847,-1.825359,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,18,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,31.130847,-1.825359,1167.0,1062.351942,150.0,2000.0,31.130847,31.130847,-1.825359,-1.825359,1216.846742,1161.0,1500.665788,300.0,-333.803882,150.0,1996.732218,2008.0,11,14,3


#### Feature Hashing

In [None]:
#ce_hash = ce.HashingEncoder(cols=cat_features)
#data = ce_hash.fit_transform(data)

In [None]:
#data.head()

### Split data into train and test data set

In [32]:
train_df = data[data["train"] == 1]
test_df = data[data["train"] == 0]

train_df.drop(["train"], axis=1, inplace=True)
test_df.drop(["train"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


### Save data to csv files

In [33]:
pd.DataFrame(train_df).to_csv("./Data/train_cleaned_v" + data_version + ".csv", index=False)
pd.DataFrame(test_df).to_csv("./Data/test_cleaned_v" + data_version + ".csv", index=False)