In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
train = pd.read_csv('PumpItUp_Training.csv')
test = pd.read_csv('PumpItUp_Test.csv')

train['train'] = 1
test['train'] = 0
data = pd.concat([train, test])

In [4]:
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1


In [5]:
# overview

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74250 entries, 0 to 14849
Data columns (total 41 columns):
id                       74250 non-null int64
amount_tsh               74250 non-null float64
date_recorded            74250 non-null object
funder                   69746 non-null object
gps_height               74250 non-null int64
installer                69718 non-null object
longitude                74250 non-null float64
latitude                 74250 non-null float64
wpt_name                 74250 non-null object
num_private              74250 non-null int64
basin                    74250 non-null object
subvillage               73780 non-null object
region                   74250 non-null object
region_code              74250 non-null int64
district_code            74250 non-null int64
lga                      74250 non-null object
ward                     74250 non-null object
population               74250 non-null int64
public_meeting           70095 non-null object
r

In [6]:
# identify missing values in numerical data

int_var = ['population','gps_height','num_private','construction_year']
float_var = ['amount_tsh','longitude']

for var in int_var:
    print('{}:'.format(var))
    display(data[var].min())
    display(len(data[data[var] == 0]))

for var in float_var:
    print('{}:'.format(var))
    display(data[var].min())
    display(len(data[data[var] == 0.0]))

print('latitude:')
display(data['latitude'].min())
display(len(data[(data['latitude'] > -0.001) & (data['latitude'] < 0.001)]))


population:


0

26834

gps_height:


-90

25649

num_private:


0

73299

construction_year:


0

25969

amount_tsh:


0.0

52049

longitude:


0.0

2269

latitude:


-11.64944018

2269

In [7]:
# replace zeros by NaN

for var in int_var:
    data[var].replace(0, np.nan, inplace=True)
    
for var in float_var:
    data[var].replace(0.0, np.nan, inplace=True)

data['latitude'].where((data['latitude'] < -0.001) | (data['latitude'] > 0.001), other= np.nan, inplace=True,axis=0)

In [8]:
data.isnull().sum()

id                           0
amount_tsh               52049
date_recorded                0
funder                    4504
gps_height               25649
installer                 4532
longitude                 2269
latitude                  2269
wpt_name                     0
num_private              73299
basin                        0
subvillage                 470
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population               26834
public_meeting            4155
recorded_by                  0
scheme_management         4846
scheme_name              35258
permit                    3793
construction_year        25969
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [9]:
# fill numerical null-values by mean/median

int_var.append('latitude')

# 1st step: group by region, ward (-> null-values remain if there does not exist a single non-null value in a tuple)
for var in int_var:
    data[var].fillna(data.groupby(['region', 'ward'])[var].transform("median"), inplace=True)

for var in float_var:
    data[var].fillna(data.groupby(['region', 'ward'])[var].transform("mean"), inplace=True)
    
# 2nd step: rougher filter
for var in int_var:
    data[var].fillna(data.groupby(['region'])[var].transform("median"), inplace=True)

for var in float_var:
    data[var].fillna(data.groupby(['region'])[var].transform("mean"), inplace=True)

# 3rd step: rougher filter
for var in int_var:
    data[var].fillna(data[var].median(), inplace=True)

for var in float_var:
    data[var].fillna(data[var].mean(), inplace=True)

In [10]:
data.isnull().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    4504
gps_height                   0
installer                 4532
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 470
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            4155
recorded_by                  0
scheme_management         4846
scheme_name              35258
permit                    3793
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [11]:
# create new feature that gives information about operational time
    
data['date_recorded'] = pd.to_datetime(data['date_recorded'])
data['operation_years'] = data.date_recorded.dt.year - data.construction_year
data['operation_years'] = data['operation_years'].astype(int)

In [12]:
# scale numeric features

#num_features=['latitude','longitude','operation_years','amount_tsh', 'gps_height', 'population']
#scaler = MinMaxScaler()

#for s in split:
    #s[num_features] = scaler.fit_transform(s[num_features])

In [13]:
# drop redundant features and features that do not seem to have an impact

data.drop(['extraction_type_group','extraction_type_class','quality_group','source_class','source_type','waterpoint_type_group','quantity_group','date_recorded','wpt_name','num_private','recorded_by'],axis=1,inplace=True)

In [14]:
data['funder'].value_counts()
data['installer'].value_counts()
data['scheme_name'].value_counts()

K                                        858
None                                     794
Borehole                                 704
Chalinze wate                            501
M                                        490
DANIDA                                   483
Government                               395
Ngana water supplied scheme              335
wanging'ombe water supply s              323
Bagamoyo wate                            296
wanging'ombe supply scheme               284
I                                        281
Uroki-Bomang'ombe water sup              266
N                                        258
Kirua kahe gravity water supply trust    237
Machumba estate pipe line                225
Makwale water supplied sche              209
Kijiji                                   205
Handeni Trunk Main(H                     204
S                                        188
Losaa-Kia water supply                   184
mtwango water supply scheme              183
Mkongoro T

In [15]:
train_df = data[data["train"] == 1]
test_df = data[data["train"] == 0]

train_df.drop(["train"], axis=1, inplace=True)
test_df.drop(["train"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [16]:
pd.DataFrame(train_df).to_csv("train_cleaned.csv")
pd.DataFrame(test_df).to_csv("test_cleaned.csv")