In [579]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

In [580]:
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
labels = pd.read_csv('PumpItUp_Labels.csv')
train = train.merge(labels, on='id')
target=train.pop('status_group')

train['train'] = 1
test['train'] = 0

train.drop('Unnamed: 0',inplace=True,axis=1)
test.drop('Unnamed: 0',inplace=True,axis=1)

total = pd.concat([train,test])

In [581]:
# advanced feature selection

total.drop(['subvillage','region_code','lga','ward','district_code','scheme_name'],axis=1,inplace=True)

In [582]:
total.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,region,population,...,construction_year,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,operation_years,train
0,21669,777.777778,Others,1715.0,DWE,31.366025,-7.97201,Lake Tanganyika,Rukwa,1.0,...,1974.0,ksb,vwc,never pay,soft,enough,spring,communal standpipe,37,1
1,60108,1352.941176,Others,1906.0,DWE,31.431333,-7.850336,Lake Tanganyika,Rukwa,1.0,...,1976.0,gravity,vwc,never pay,soft,enough,river,communal standpipe,35,1
2,44820,777.777778,Others,1741.0,DWE,31.368317,-7.971081,Lake Tanganyika,Rukwa,1.0,...,1974.0,ksb,vwc,never pay,soft,enough,spring,communal standpipe,37,1
3,53307,777.777778,Others,1730.0,DWE,31.36873,-7.972788,Lake Tanganyika,Rukwa,1.0,...,1974.0,ksb,vwc,never pay,soft,enough,spring,communal standpipe,37,1
4,5701,777.777778,Others,1731.0,DWE,31.365461,-7.970985,Lake Tanganyika,Rukwa,1.0,...,1974.0,ksb,vwc,annually,soft,enough,spring,communal standpipe,37,1


In [583]:
# get information about numerical data

total.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,construction_year,operation_years,train
count,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0
mean,37124.5,1241.682018,1078.416653,35.091203,-5.79547,288.755872,1997.635111,14.140929,0.8
std,21434.273081,3508.83224,513.919946,2.58926,2.8085,466.182953,10.178555,10.278036,0.400003
min,0.0,0.2,-90.0,29.607122,-11.64944,1.0,1960.0,-7.0,0.0
25%,18562.25,200.0,833.0,33.23447,-8.525675,80.0,1995.0,7.0,1.0
50%,37124.5,1000.0,1192.0,34.907475,-5.02654,200.0,1999.5,11.0,1.0
75%,55686.75,1241.682018,1337.0,37.181685,-3.352929,400.0,2004.0,16.0,1.0
max,74249.0,350000.0,2777.0,40.345193,-0.998464,30500.0,2013.0,53.0,1.0


In [584]:
# discretize numerical features

# bin_amounttsh
bin_amounttsh = [-0.1,200,1000,1241,350000]
category = pd.cut(total['amount_tsh'],bin_amounttsh)
total['amount_tsh'] = category

# gps_height
bin_gpsheight = [-90.1,833,1192,1337,2777]
category = pd.cut(total['gps_height'],bin_gpsheight)
total['gps_height'] = category

# longitude
bin_longitude = [29.5,33.2,34.9,37.1,40.4]
category = pd.cut(total['longitude'],bin_longitude)
total['longitude'] = category

# latitude
bin_latitude = [-11.7,-8.6,-5.1,-3.4,-0.9]
category = pd.cut(total['latitude'],bin_latitude)
total['latitude'] = category

# population
bin_population = [0,80,200,400,30500]
category = pd.cut(total['population'],bin_population)
total['population'] = category

# construction_year
bin_construction = [1959,1995,1999,2004,2013]
category = pd.cut(total['construction_year'],bin_construction)
total['construction_year'] = category

# operation_years
bin_operation = [-7.1,7,11,16,53]
category = pd.cut(total['operation_years'],bin_operation)
total['operation_years'] = category

In [585]:
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74250 entries, 0 to 14849
Data columns (total 23 columns):
id                   74250 non-null int64
amount_tsh           74250 non-null category
funder               74250 non-null object
gps_height           74250 non-null category
installer            74250 non-null object
longitude            74250 non-null category
latitude             74250 non-null category
basin                74250 non-null object
region               74250 non-null object
population           74250 non-null category
public_meeting       70095 non-null object
scheme_management    69404 non-null object
permit               70457 non-null object
construction_year    74250 non-null category
extraction_type      74250 non-null object
management           74250 non-null object
payment_type         74250 non-null object
water_quality        74250 non-null object
quantity             74250 non-null object
source               74250 non-null object
waterpoint_type     

In [586]:
# fill missing categorical data

total['public_meeting'].replace(np.nan, 'Missing', inplace=True)
total['scheme_management'].replace(np.nan, 'Missing', inplace=True)
total['permit'].replace(np.nan, 'Missing', inplace=True)

# rename feature values that do not exist in test dataset

total['scheme_management'].replace('None', 'Missing', inplace=True)
total['extraction_type'].replace('other - mkulima/shinyanga', 'other', inplace=True)
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74250 entries, 0 to 14849
Data columns (total 23 columns):
id                   74250 non-null int64
amount_tsh           74250 non-null category
funder               74250 non-null object
gps_height           74250 non-null category
installer            74250 non-null object
longitude            74250 non-null category
latitude             74250 non-null category
basin                74250 non-null object
region               74250 non-null object
population           74250 non-null category
public_meeting       74250 non-null object
scheme_management    74250 non-null object
permit               74250 non-null object
construction_year    74250 non-null category
extraction_type      74250 non-null object
management           74250 non-null object
payment_type         74250 non-null object
water_quality        74250 non-null object
quantity             74250 non-null object
source               74250 non-null object
waterpoint_type     

In [587]:
# test/train split

train_df = total[total["train"] == 1]
test_df = total[total["train"] == 0]
train_df.drop(["train"], axis=1, inplace=True)
train_df.drop(['id'],axis=1, inplace=True)
test_df.drop(["train"], axis=1, inplace=True)

id_test = test_df['id']
test_df.drop(['id'],axis=1, inplace=True)

train_df = train_df.astype(str)
test_df = test_df.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [588]:
# OneHotEncoding to get multivariate Bernoulli model

encoder = preprocessing.OneHotEncoder()
encoded_train = pd.DataFrame(encoder.fit_transform(train_df).toarray(), columns=encoder.get_feature_names(train_df.columns))
encoded_test = pd.DataFrame(encoder.fit_transform(test_df).toarray(), columns=encoder.get_feature_names(test_df.columns))

In [589]:
# perform Bernoulli Naive Bayes model

from sklearn.naive_bayes import BernoulliNB
model_nb = BernoulliNB()
model_nb.fit(encoded_train, target)

y_pred = model_nb.predict(encoded_test)

In [590]:
# transform into transmission format

y_pred = pd.DataFrame(y_pred)
y_pred['id'] = id_test
y_pred.columns = ['status_group','id']
y_pred = y_pred[['id','status_group']]

In [591]:
y_pred.head()

Unnamed: 0,id,status_group
0,58078,non functional
1,31089,functional
2,41075,functional
3,10188,functional
4,31352,functional


In [592]:
pd.DataFrame(y_pred).to_csv("submission_naivebayes.csv")