In [2]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)

In [3]:
train_raw = pd.read_csv('./Data/training_set_values.csv')
test_raw = pd.read_csv('./Data/test_set_values.csv')

train_raw['train'] = 1
test_raw['train'] = 0
data = pd.concat([train_raw, test_raw])

In [4]:
# Overview

#data.info()

In [5]:
# Identify missing values in numerical data

int_var = ['population','gps_height','num_private','construction_year']
float_var = ['amount_tsh','longitude']

for var in int_var:
    print('{}:'.format(var))
    display(data[var].min())
    display(len(data[data[var] == 0]))

for var in float_var:
    print('{}:'.format(var))
    display(data[var].min())
    display(len(data[data[var] == 0.0]))

print('latitude:')
display(data['latitude'].min())
display(len(data[(data['latitude'] > -0.001) & (data['latitude'] < 0.001)]))

population:


0

26834

gps_height:


-90

25649

num_private:


0

73299

construction_year:


0

25969

amount_tsh:


0.0

52049

longitude:


0.0

2269

latitude:


-11.64944018

2269

In [6]:
# Replace zeros by NaN

for var in int_var:
    data[var].replace(0, np.nan, inplace=True)
    
for var in float_var:
    data[var].replace(0.0, np.nan, inplace=True)

data['latitude'].where((data['latitude'] < -0.001) | (data['latitude'] > 0.001), other= np.nan, inplace=True,axis=0)

train = data[data['train'] == 1]
test = data[data['train'] == 0]

In [7]:
# Duplicate critical columnns for imputation based on normal distribution and random choice

null_features = ['longitude','latitude','gps_height','population','construction_year','amount_tsh']
#no calculations for amount_tsh & num_private since they are dropped later (too many missing values)

for null_feature in null_features:
    data['_'.join([null_feature, 'imp_normal'])] = data[null_feature]
    data['_'.join([null_feature, 'imp_random_choice'])] = data[null_feature]

In [8]:
# Add columns for mean and standard deviation of critical features based on region, ward, overall

divisions = ['region', 'ward']

for null_feature in null_features:
    data['_'.join([null_feature, 'mean', 'overall'])] = train[null_feature].mean()
    data['_'.join([null_feature, 'std', 'overall'])] = train[null_feature].std()
    for division in divisions:
        new_feature_name_mean = '_'.join([null_feature, 'mean', division])
        new_feature_name_std = '_'.join([null_feature, 'std', division])
        
        calcs_mean = train.groupby(division)[null_feature].mean()
        calcs_std = train.groupby(division)[null_feature].std()
        for value in train[division].unique() :
            #data[new_feature_name_mean] = data.apply(lambda row: calcs_mean[row[division]], axis=1)
            #data[new_feature_name_std] = data.apply(lambda row: calcs_std[row[division]], axis=1)
            data.loc[data[division]==value, new_feature_name_mean] = calcs_mean[value]
            data.loc[data[division]==value, new_feature_name_std] = calcs_std[value]

In [9]:
# 1st step: Impute missing values with random numbers generated by normal distribution based on mean, std by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with random numbers generated by normal distribution based on mean, std by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with random numbers generated by normal distribution based on mean, std by 'overall'

divisions_total = ['ward', 'region', 'overall']

for null_feature in null_features:
    for division in divisions_total:
        data['_'.join([null_feature,'imp_normal'])] = data.apply(lambda row: np.random.normal(loc=row['_'.join([null_feature,'mean',division])], scale=row['_'.join([null_feature,'std',division])]) if math.isnan(row['_'.join([null_feature,'imp_normal'])]) else row['_'.join([null_feature,'imp_normal'])], axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, data['_'.join([null_feature,'imp_normal'])].isnull().sum()))

'Missing values after imputation in longitude by ward: 1812'

'Missing values after imputation in longitude by region: 0'

'Missing values after imputation in longitude by overall: 0'

'Missing values after imputation in latitude by ward: 1812'

'Missing values after imputation in latitude by region: 0'

'Missing values after imputation in latitude by overall: 0'

'Missing values after imputation in gps_height by ward: 24251'

'Missing values after imputation in gps_height by region: 14283'

'Missing values after imputation in gps_height by overall: 0'

'Missing values after imputation in population by ward: 24768'

'Missing values after imputation in population by region: 14283'

'Missing values after imputation in population by overall: 0'

'Missing values after imputation in construction_year by ward: 23617'

'Missing values after imputation in construction_year by region: 14283'

'Missing values after imputation in construction_year by overall: 0'

'Missing values after imputation in amount_tsh by ward: 33824'

'Missing values after imputation in amount_tsh by region: 14385'

'Missing values after imputation in amount_tsh by overall: 0'

In [10]:
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,longitude_mean_overall,longitude_std_overall,longitude_mean_region,longitude_std_region,longitude_mean_ward,longitude_std_ward,latitude_mean_overall,latitude_std_overall,latitude_mean_region,latitude_std_region,latitude_mean_ward,latitude_std_ward,gps_height_mean_overall,gps_height_std_overall,gps_height_mean_region,gps_height_std_region,gps_height_mean_ward,gps_height_std_ward,population_mean_overall,population_std_overall,population_mean_region,population_std_region,population_mean_ward,population_std_ward,construction_year_mean_overall,construction_year_std_overall,construction_year_mean_region,construction_year_std_region,construction_year_mean_ward,construction_year_std_ward,amount_tsh_mean_overall,amount_tsh_std_overall,amount_tsh_mean_region,amount_tsh_std_region,amount_tsh_mean_ward,amount_tsh_std_ward
0,69572,6000.0,2011-03-14,Roman,1390.0,Roman,34.938093,-9.856322,none,,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109.0,True,GeoData Consultants Ltd,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,109.0,109.0,1999.0,1999.0,6000.0,6000.0,35.149669,2.607428,34.895989,0.507484,34.93686,0.018817,-5.885572,2.809876,-8.9077,0.695036,-9.849742,0.057023,1018.860839,612.566092,1697.44201,357.935649,1429.972222,85.347185,281.087167,564.68766,120.883051,207.99429,131.5,85.804262,1996.814686,12.472045,1997.441284,11.488838,1997.527778,3.037726,1062.351942,5409.34494,1855.43434,5829.13105,4500.0,2350.81173
1,8776,,2013-03-06,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Zahanati,,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280.0,,GeoData Consultants Ltd,Other,,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,280.0,280.0,2010.0,2010.0,484.604349,,35.149669,2.607428,34.15394,0.392713,34.560169,0.1111,-5.885572,2.809876,-1.739903,0.270569,-2.069085,0.074345,1018.860839,612.566092,1341.62519,157.456723,1367.591549,54.440683,281.087167,564.68766,538.794312,831.30018,305.169014,322.052302,1996.814686,12.472045,1997.454918,11.537118,1998.057143,14.160453,1062.351942,5409.34494,584.158038,3256.133649,542.857143,450.396651
2,34310,25.0,2013-02-25,Lottery Club,686.0,World vision,37.460664,-3.821329,Kwa Mahundi,,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250.0,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,250.0,250.0,2009.0,2009.0,25.0,25.0,35.149669,2.607428,35.932915,0.696129,37.453988,0.013616,-5.885572,2.809876,-4.286211,0.520436,-3.812915,0.015699,1018.860839,612.566092,1426.077701,352.826622,683.5,2.718251,281.087167,564.68766,317.778269,582.243701,285.0,62.583278,1996.814686,12.472045,2002.114013,9.803441,2009.0,0.0,1062.351942,5409.34494,725.095986,2725.650147,25.0,0.0
3,67743,,2013-01-28,Unicef,263.0,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58.0,True,GeoData Consultants Ltd,VWC,,True,1986.0,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,58.0,58.0,1986.0,1986.0,1278.812337,,35.149669,2.607428,39.397674,0.551495,38.494923,0.036091,-5.885572,2.809876,-10.680456,0.243809,-11.16116,0.045133,1018.860839,612.566092,258.410301,182.988424,275.03125,42.647455,281.087167,564.68766,267.441618,443.022905,83.40625,36.331679,1996.814686,12.472045,1992.373707,13.297554,1993.6875,13.959475,1062.351942,5409.34494,148.481268,619.663404,525.0,548.482756
4,19728,,2011-07-13,Action In A,,Artisan,31.130847,-1.825359,Shuleni,,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,,True,GeoData Consultants Ltd,,,True,,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,31.130847,31.130847,-1.825359,-1.825359,1729.82688,,-13.356147,,1992.498578,,-1304.203362,,35.149669,2.607428,31.233262,0.425853,31.126863,0.015989,-5.885572,2.809876,-1.961466,0.636347,-1.852282,0.03653,1018.860839,612.566092,,,,,281.087167,564.68766,,,,,1996.814686,12.472045,,,,,1062.351942,5409.34494,,,,


In [11]:
# Add columns with list of values in corresponding group of region and ward, respectively
for null_feature in null_features:
    overall_list = list(train[null_feature])
    overall_list = [x for x in overall_list if not math.isnan(x)]
    data['_'.join([null_feature, 'list', 'overall'])] = data.apply(lambda row: overall_list, axis=1)
    display('Overall_list done')
    for division in divisions:
        feature_name = '_'.join([null_feature, 'list', division])
        lists = train.groupby(division)[null_feature].apply(list)
        #data[feature_name] = data.apply(lambda row: lists[row[division]], axis=1)
        #data[feature_name] = data[feature_name].apply(lambda lst: [x for x in lst if not math.isnan(x)])
        #data[feature_name] = data[feature_name].apply(lambda x: 1 if not x else x)
        #display(null_feature, division)
        #data.loc[data[division]==value, feature_name] = lists[value]
        data[feature_name] = data.apply(lambda row: list() if row[division] not in train[division].unique() else lists[row[division]], axis=1)
        data[feature_name] = data[feature_name].apply(lambda lst: [x for x in lst if not math.isnan(x)])
        data[feature_name] = data[feature_name].apply(lambda x: np.nan if not x else x)
        display(null_feature, division)

'Overall_list done'

'longitude'

'region'

'longitude'

'ward'

'Overall_list done'

'latitude'

'region'

'latitude'

'ward'

'Overall_list done'

'gps_height'

'region'

'gps_height'

'ward'

'Overall_list done'

'population'

'region'

'population'

'ward'

'Overall_list done'

'construction_year'

'region'

'construction_year'

'ward'

'Overall_list done'

'amount_tsh'

'region'

'amount_tsh'

'ward'

In [15]:
#for null_feature in null_features:
#    for division in divisions:
#        feature_name = '_'.join([null_feature, 'list', division])
#        data[feature_name] = data[feature_name].apply(lambda x: np.nan if not x else x)

In [42]:
data[(data['id']==72678) | (data['id']==56725)]
#pd.DataFrame(data).to_csv("./Data/data_zwischenstand.csv", index=False)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,longitude_mean_overall,longitude_std_overall,longitude_mean_region,longitude_std_region,longitude_mean_ward,longitude_std_ward,latitude_mean_overall,latitude_std_overall,latitude_mean_region,latitude_std_region,latitude_mean_ward,latitude_std_ward,gps_height_mean_overall,gps_height_std_overall,gps_height_mean_region,gps_height_std_region,gps_height_mean_ward,gps_height_std_ward,population_mean_overall,population_std_overall,population_mean_region,population_std_region,population_mean_ward,population_std_ward,construction_year_mean_overall,construction_year_std_overall,construction_year_mean_region,construction_year_std_region,construction_year_mean_ward,construction_year_std_ward,amount_tsh_mean_overall,amount_tsh_std_overall,amount_tsh_mean_region,amount_tsh_std_region,amount_tsh_mean_ward,amount_tsh_std_ward,longitude_list_overall,longitude_list_region,longitude_list_ward,latitude_list_overall,latitude_list_region,latitude_list_ward,gps_height_list_overall,gps_height_list_region,gps_height_list_ward,population_list_overall,population_list_region,population_list_ward,construction_year_list_overall,construction_year_list_region,construction_year_list_ward,amount_tsh_list_overall,amount_tsh_list_region,amount_tsh_list_ward
168,72678,,2013-01-30,Wvt,,WVT,,,Wvt Tanzania,,Lake Victoria,Ilula,Shinyanga,17,1,Bariadi,Chinamili,,False,GeoData Consultants Ltd,Parastatal,,False,,gravity,gravity,gravity,parastatal,parastatal,other,other,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,31.616211,,-3.455957,,1334.566041,,618.34836,,2001.797364,,5072.950983,,35.149669,2.607428,33.240121,0.765545,,,-5.885572,2.809876,-3.495696,0.328573,,,1018.860839,612.566092,1350.981707,27.83573,,,281.087167,564.68766,428.359756,224.024946,,,1996.814686,12.472045,2002.621951,5.762456,,,1062.351942,5409.34494,3746.666667,12799.769343,,,"[34.93809275, 34.6987661, 37.46066446, 38.4861...","[33.36240982, 32.62061707, 33.79810612, 34.364...",,"[-9.85632177, -2.14746569, -3.82132853, -11.15...","[-3.76636472, -4.22619802, -3.2901937999999995...",,"[1390.0, 1399.0, 686.0, 263.0, 62.0, 1062.0, 1...","[1362.0, 1356.0, 1328.0, 1340.0, 1354.0, 1357....",,"[109.0, 280.0, 250.0, 58.0, 1.0, 345.0, 250.0,...","[450.0, 500.0, 300.0, 250.0, 500.0, 500.0, 500...",,"[1999.0, 2010.0, 2009.0, 1986.0, 2009.0, 2011....","[2008.0, 1996.0, 2005.0, 1998.0, 1997.0, 2008....",,"[6000.0, 25.0, 20.0, 200.0, 500.0, 500.0, 200....","[1000.0, 100.0, 200.0, 100.0, 300.0, 1000.0, 2...",
177,56725,,2013-01-17,Netherlands,,DWE,,,Kikundi Cha Wakina Mama,,Lake Victoria,Mahaha,Shinyanga,17,1,Bariadi,Bunamhala,,,GeoData Consultants Ltd,WUG,,False,,other,other,other,wug,user-group,unknown,unknown,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other,1,34.076974,,-2.891981,,1335.825505,,690.246416,,2003.580018,,4862.008814,,35.149669,2.607428,33.240121,0.765545,34.094471,0.008208,-5.885572,2.809876,-3.495696,0.328573,-2.889424,0.015835,1018.860839,612.566092,1350.981707,27.83573,1335.833333,7.183736,281.087167,564.68766,428.359756,224.024946,404.333333,223.744308,1996.814686,12.472045,2002.621951,5.762456,2000.416667,4.209477,1062.351942,5409.34494,3746.666667,12799.769343,,,"[34.93809275, 34.6987661, 37.46066446, 38.4861...","[33.36240982, 32.62061707, 33.79810612, 34.364...","[34.09904301, 34.09120063, 34.09635422, 34.098...","[-9.85632177, -2.14746569, -3.82132853, -11.15...","[-3.76636472, -4.22619802, -3.2901937999999995...","[-2.87837578, -2.89379028, -2.87099139, -2.885...","[1390.0, 1399.0, 686.0, 263.0, 62.0, 1062.0, 1...","[1362.0, 1356.0, 1328.0, 1340.0, 1354.0, 1357....","[1329.0, 1334.0, 1334.0, 1339.0, 1342.0, 1333....","[109.0, 280.0, 250.0, 58.0, 1.0, 345.0, 250.0,...","[450.0, 500.0, 300.0, 250.0, 500.0, 500.0, 500...","[500.0, 500.0, 500.0, 150.0, 1.0, 500.0, 1.0, ...","[1999.0, 2010.0, 2009.0, 1986.0, 2009.0, 2011....","[2008.0, 1996.0, 2005.0, 1998.0, 1997.0, 2008....","[2009.0, 1997.0, 2000.0, 1997.0, 2001.0, 1996....","[6000.0, 25.0, 20.0, 200.0, 500.0, 500.0, 200....","[1000.0, 100.0, 200.0, 100.0, 300.0, 1000.0, 2...",


In [40]:
# 1st step: Impute missing values with empirical distribution grouped by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with empirical distribution grouped by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with empirical distribution grouped by 'overall'

for null_feature in null_features:
    for division in divisions_total:        
        #data['_'.join([null_feature,'imp_random_choice'])] = data.apply(lambda row: np.random.choice(a=row['_'.join([null_feature,'list',division])]) if math.isnan(row['_'.join([null_feature,'imp_random_choice'])]) else row['_'.join([null_feature,'imp_random_choice'])], axis=1)
        data['_'.join([null_feature,'imp_random_choice'])] = data.apply(lambda row: row['_'.join([null_feature,'imp_random_choice'])] if not math.isnan(row['_'.join([null_feature,'imp_random_choice'])]) else (np.random.choice(a=row['_'.join([null_feature,'list',division])]) if not math.isnan(row['_'.join([null_feature,'list',division])]) else np.nan), axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, data['_'.join([null_feature,'imp_normal'])].isnull().sum()))

# 1st step: Impute missing values with random numbers generated by random choice on list of not-nan values per ward
#data['longitude_imp_rand_choice'] = data.apply(lambda row: np.random.choice(a=row['longitude_list_ward']) if math.isnan(row['longitude_imp_rand_choice']) else row['longitude_imp_rand_choice'], axis=1)
#data['latitude_imp_rand_choice'] = data.apply(lambda row: np.random.choice(a=row['latitude_list_ward']) if math.isnan(row['latitude_imp_rand_choice']) else row['latitude_imp_rand_choice'], axis=1)

#display('Missing values after imputation by ward: {}'.format(len(data.loc[data['longitude_imp_rand_choice'] == 0])))

# 2nd step: Impute missing values with random numbers generated by random choice on list of not-nan values per region
#data['longitude_imp_rand_choice'] = data.apply(lambda row: np.random.choice(a=row['longitude_list_region']) if row['longitude_imp_rand_choice'] == 0 else row['longitude_imp_rand_choice'], axis=1)
#data['latitude_imp_rand_choice'] = data.apply(lambda row: np.random.choice(a=row['latitude_list_region']) if row['latitude_imp_rand_choice'] == 0 else row['latitude_imp_rand_choice'], axis=1)


TypeError: ('must be real number, not list', 'occurred at index 177')

In [None]:
# Drop columns used for generation of random numbers
drop_columns = list()
measures.append('list')
for gps_feature in gps_features:
    for division in divisions:
        for measure in measures:
            drop_columns.append('_'.join([gps_feature, measure, division]))
data.drop(columns=drop_columns, inplace=True)

In [None]:
# fill numerical null-values by mean/median
# use information from training data to fill null values in new data (represented by test data)

int_var.append('latitude')

# 1st step: group by region, ward (-> null-values remain if there does not exist a single non-null value in a tuple)
for var in int_var:
    train[var].fillna(train.groupby(['region', 'ward'])[var].transform("median"), inplace=True)

for var in int_var:
    test[var].fillna(train.groupby(['region', 'ward'])[var].transform("median"), inplace=True)

for var in float_var:
    train[var].fillna(train.groupby(['region', 'ward'])[var].transform("mean"), inplace=True)
    
for var in float_var:
    test[var].fillna(train.groupby(['region', 'ward'])[var].transform("mean"), inplace=True)
    
# 2nd step: rougher filter
for var in int_var:
    train[var].fillna(train.groupby(['region'])[var].transform("median"), inplace=True)
    
for var in int_var:
    test[var].fillna(train.groupby(['region'])[var].transform("median"), inplace=True)

for var in float_var:
    train[var].fillna(train.groupby(['region'])[var].transform("mean"), inplace=True)
    
for var in float_var:
    test[var].fillna(train.groupby(['region'])[var].transform("mean"), inplace=True)

# 3rd step: rougher filter
for var in int_var:
    train[var].fillna(train[var].median(), inplace=True)
    
for var in int_var:
    test[var].fillna(train[var].median(), inplace=True)

for var in float_var:
    train[var].fillna(train[var].mean(), inplace=True)
    
for var in float_var:
    test[var].fillna(train[var].mean(), inplace=True)

In [None]:
data.isnull().sum()

In [None]:
# create new feature that gives information about operational time
    
data['date_recorded'] = pd.to_datetime(data['date_recorded'])
data['operation_years'] = data.date_recorded.dt.year - data.construction_year
data['operation_years'] = data['operation_years'].astype(int)

In [None]:
# scale numeric features

#num_features=['latitude','longitude','operation_years','amount_tsh', 'gps_height', 'population']
#scaler = MinMaxScaler()

#for s in split:
    #s[num_features] = scaler.fit_transform(s[num_features])

In [None]:
# drop redundant features and features that do not seem to have an impact

data.drop(['extraction_type_group','extraction_type_class','payment','quality_group','source_class','source_type','waterpoint_type_group','management_group','quantity_group','date_recorded','wpt_name','num_private','recorded_by'],axis=1,inplace=True)

In [None]:
data['funder'].value_counts()
data['installer'].value_counts()
data['scheme_name'].value_counts()

In [None]:
# reduce dimension of categorical variables

# funder
data = data.assign(count = data.groupby('funder')['funder'].transform('count'))\
.sort_values(by = ['count','funder'], ascending = [False,True])

data.loc[data['count'] < 1050, 'funder'] = 'Others'
data['funder'].replace(np.nan, 'Others', inplace=True)
del data['count']

# installer
data = data.assign(count = data.groupby('installer')['installer'].transform('count'))\
.sort_values(by = ['count','installer'], ascending = [False,True])

data.loc[data['count'] < 765, 'installer'] = 'Others'
data.loc[data['installer'] == '0', 'installer'] = 'Others'
data['installer'].replace(np.nan, 'Others', inplace=True)
del data['count']

# scheme_name
data = data.assign(count = data.groupby('scheme_name')['scheme_name'].transform('count'))\
.sort_values(by = ['count','scheme_name'], ascending = [False,True])

data.loc[data['count'] < 296, 'scheme_name'] = 'Others'
data.loc[data['scheme_name'] == '0', 'scheme_name'] = 'Others'
data['scheme_name'].replace(np.nan, 'Others', inplace=True)
del data['count']

In [None]:
# factorize features for evaluations

#data['funder'] = pd.factorize(data['funder'])[0]
#data['installer'] = pd.factorize(data['installer'])[0]
#data['basin'] = pd.factorize(data['basin'])[0]
#data['subvillage'] = pd.factorize(data['subvillage'])[0]
#data['region'] = pd.factorize(data['region'])[0]
#data['lga'] = pd.factorize(data['lga'])[0]
#data['ward'] = pd.factorize(data['ward'])[0]
#data['scheme_management'] = pd.factorize(data['scheme_management'])[0]
#data['scheme_name'] = pd.factorize(data['scheme_name'])[0]
#data['extraction_type'] = pd.factorize(data['extraction_type'])[0]
#data['management'] = pd.factorize(data['management'])[0]
#data['payment_type'] = pd.factorize(data['payment_type'])[0]
#data['water_quality'] = pd.factorize(data['water_quality'])[0]
#data['quantity'] = pd.factorize(data['quantity'])[0]
#data['waterpoint_type'] = pd.factorize(data['waterpoint_type'])[0]
#data['permit'] = pd.factorize(data['permit'])[0]
#data['source'] = pd.factorize(data['source'])[0]

In [None]:
data.head()

In [None]:
train_df = data[data["train"] == 1]
test_df = data[data["train"] == 0]

train_df.drop(["train"], axis=1, inplace=True)
test_df.drop(["train"], axis=1, inplace=True)

In [None]:
pd.DataFrame(train_df).to_csv("./Data/train_cleaned_distr-imp.csv", index=False)
pd.DataFrame(test_df).to_csv("./Data/test_cleaned_distr_imp.csv", index=False)