In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [39]:
# Load the two training CSVs and join them on the shared `id` column.
# NOTE: the variable names are swapped relative to the files they hold:
# `data_labels` is read from the *values* file and `data_values` from the
# *labels* file. The names are kept as-is in case other cells rely on them.
data_labels = pd.read_csv('./Data/training_set_values.csv')
data_values = pd.read_csv('./Data/training_set_labels.csv')
data = pd.merge(data_values, data_labels, on='id')

In [40]:
# Preview the first five rows of the merged frame.
data.head(n=5)

Unnamed: 0,id,status_group,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,functional,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,functional,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,functional,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,non functional,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,functional,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [41]:
# overview: column names, dtypes and non-null counts of the merged frame

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
id                       59400 non-null int64
status_group             59400 non-null object
amount_tsh               59400 non-null float64
date_recorded            59400 non-null object
funder                   55765 non-null object
gps_height               59400 non-null int64
installer                55745 non-null object
longitude                59400 non-null float64
latitude                 59400 non-null float64
wpt_name                 59400 non-null object
num_private              59400 non-null int64
basin                    59400 non-null object
subvillage               59029 non-null object
region                   59400 non-null object
region_code              59400 non-null int64
district_code            59400 non-null int64
lga                      59400 non-null object
ward                     59400 non-null object
population               59400 non-null int64
p

In [42]:
# identify missing values in numerical data

# NOTE: this is not the last expression of the cell, so its result is computed
# but never displayed.
data.isnull().sum()

# For each numeric column, print a heading, optionally the column minimum, and
# the number of rows holding the suspected "missing" placeholder: zero, or for
# latitude any value strictly within 0.001 of zero.
# Entry layout: (heading, column, show the minimum?, boolean mask of placeholder rows)
placeholder_checks = [
    ('population:', 'population', True, data['population'] == 0),
    ('gps_height:', 'gps_height', False, data['gps_height'] == 0.0),
    ('amount_tsh:', 'amount_tsh', True, data['amount_tsh'] == 0.0),
    ('longitude:', 'longitude', True, data['longitude'] == 0.0),
    ('latitude:', 'latitude', True, data['latitude'].abs() < 0.001),
    ('contruction_year', 'construction_year', True, data['construction_year'] == 0),
    ('num_private:', 'num_private', True, data['num_private'] == 0),
    ('district_code', 'district_code', True, data['district_code'] == 0),
]

for heading, column, show_minimum, is_placeholder in placeholder_checks:
    print(heading)
    if show_minimum:
        display(data[column].min())
    display(len(data[is_placeholder]))

population:


0

21381

gps_height:


20438

amount_tsh:


0.0

41639

longitude:


0.0

1812

latitude:


-11.64944018

1812

contruction_year


0

20709

num_private:


0

58643

district_code


0

23

In [43]:
# replace zeros by NaN
#
# In these columns a zero is treated as "value not recorded", so it is turned
# into NaN to make it visible to `isnull()` and to the imputation that follows.
#
# The result is assigned back to the column instead of calling
# `data[col].replace(..., inplace=True)`: an in-place method on a column
# selection is chained assignment, which warns in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is active.
zero_as_missing = [
    'population',
    'gps_height',
    'amount_tsh',
    'longitude',
    'construction_year',
    'num_private',
]
for column in zero_as_missing:
    data[column] = data[column].replace(0, np.nan)

# Latitude placeholders are stored as tiny non-zero values, so keep only the
# readings that lie outside the +/-0.001 band around zero.
latitude_recorded = (data['latitude'] < -0.001) | (data['latitude'] > 0.001)
data['latitude'] = data['latitude'].where(latitude_recorded, other=np.nan)

In [44]:
# Per-column count of missing values after the zero -> NaN replacement.
data.isnull().sum(axis=0)

id                           0
status_group                 0
amount_tsh               41639
date_recorded                0
funder                    3635
gps_height               20438
installer                 3655
longitude                 1812
latitude                  1812
wpt_name                     0
num_private              58643
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population               21381
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year        20709
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_qu

In [45]:
# Impute the numeric features hierarchically, from the finest to the coarsest
# grouping. Each step only fills values that are still missing:
#   1st step: statistic of the (region, ward) group
#             (-> null-values remain if there does not exist a single non-null
#             value in a tuple)
#   2nd step: rougher filter - statistic of the region
#   3rd step: rougher filter - statistic of the whole column
#
# The result is assigned back to the column instead of calling
# `data[col].fillna(..., inplace=True)`: an in-place method on a column
# selection is chained assignment, which warns in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is active.

# Statistic used to fill each column.
impute_statistic = {
    'population': 'median',
    'gps_height': 'median',
    'amount_tsh': 'mean',
    'longitude': 'mean',
    'latitude': 'mean',
    'construction_year': 'median',
}

for column, statistic in impute_statistic.items():
    # Steps 1 and 2: group-wise fill. The grouping keys are never imputed
    # themselves, so handling one column at a time gives the same result as
    # running each step over all columns in turn.
    for group_keys in (['region', 'ward'], ['region']):
        group_fill = data.groupby(group_keys)[column].transform(statistic)
        data[column] = data[column].fillna(group_fill)

    # Step 3: global fallback, computed after the group-wise fills.
    data[column] = data[column].fillna(data[column].agg(statistic))

In [46]:
# Per-column count of missing values after the imputation.
data.isnull().sum(axis=0)

id                           0
status_group                 0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private              58643
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_qu

In [47]:
# create new feature that gives information about operational time

# Parse the recording date so its year can be extracted.
data['date_recorded'] = pd.to_datetime(data['date_recorded'])

# Years between construction and the recording date, cast to int
# (any fractional part of the difference is truncated by the cast).
recorded_year = data['date_recorded'].dt.year
data['operation_years'] = (recorded_year - data['construction_year']).astype(int)

In [48]:
# scale numeric features

# Columns rescaled by MinMaxScaler; the scaler is fitted on this same frame.
num_features = [
    'latitude',
    'longitude',
    'operation_years',
    'amount_tsh',
    'gps_height',
    'population',
]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[num_features])
data[num_features] = scaled_values

  return self.partial_fit(X, y)


In [49]:
# Preview the first five rows after scaling.
data.head(n=5)

Unnamed: 0,id,status_group,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,operation_years
0,69572,functional,0.017142,2011-03-14,Roman,0.517483,Roman,0.496455,0.168353,none,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,0.316667
1,8776,functional,0.00155,2013-03-06,Grumeti,0.520629,GRUMETI,0.474167,0.892122,Zahanati,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,0.166667
2,34310,functional,7.1e-05,2013-02-25,Lottery Club,0.271329,World vision,0.731374,0.734967,Kwa Mahundi,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,0.183333
3,67743,non functional,0.001499,2013-01-28,Unicef,0.123427,UNICEF,0.826875,0.046394,Zahanati Ya Nanyumbu,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,0.566667
4,19728,functional,0.003743,2011-07-13,Action In A,0.448252,Artisan,0.141899,0.922364,Shuleni,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,0.3


In [50]:
# drop redundant features

# Remove the listed columns from `data` in place.
data.drop(
    columns=[
        'extraction_type_group',
        'extraction_type_class',
        'payment',
        'quality_group',
        'source_class',
        'source_type',
        'waterpoint_type_group',
        'management_group',
        'quantity_group',
        'date_recorded',
        'wpt_name',
        'num_private',
        'recorded_by',
    ],
    inplace=True,
)

In [51]:
# factorize features for evaluations

# Each listed column is replaced by the integer codes from `pd.factorize`
# (element [0] of its return value).
categorical_features = [
    'funder',
    'installer',
    'basin',
    'subvillage',
    'region',
    'lga',
    'ward',
    'scheme_management',
    'scheme_name',
    'extraction_type',
    'management',
    'payment_type',
    'water_quality',
    'quantity',
    'waterpoint_type',
    'permit',
    'source',
]
for feature in categorical_features:
    codes = pd.factorize(data[feature])[0]
    data[feature] = codes

In [52]:
# Preview the first five rows after factorizing the categorical columns.
data.head(n=5)

Unnamed: 0,id,status_group,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,...,permit,construction_year,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,operation_years
0,69572,functional,0.017142,0,0.517483,0,0.496455,0.168353,0,0,...,0,1999.0,0,0,0,0,0,0,0,0.316667
1,8776,functional,0.00155,1,0.520629,1,0.474167,0.892122,1,1,...,1,2010.0,0,1,1,0,1,1,0,0.166667
2,34310,functional,7.1e-05,2,0.271329,2,0.731374,0.734967,2,2,...,1,2009.0,0,0,2,0,0,2,1,0.183333
3,67743,non functional,0.001499,3,0.123427,3,0.826875,0.046394,3,3,...,1,1986.0,1,0,1,0,2,3,1,0.566667
4,19728,functional,0.003743,4,0.448252,4,0.141899,0.922364,1,4,...,1,1999.5,0,2,1,0,3,1,0,0.3


In [53]:
# Write the preprocessed frame to disk without the index column.
data.to_csv('./Data/preprocessed_training_data.csv', index=False)

In [54]:
# TODO: reduce feature dimension for categorical variables (-> Sang)