In [10]:
import pandas as pd
import numpy as np

In [11]:
x_train = pd.read_csv('WaterPumpData/features_train.csv')
y_train = pd.read_csv('WaterPumpData/labels_train.csv')

In [19]:
y_train['label'] = y_train.status_group.apply(lambda x: 1 if x == 'functional' else 0)

In [20]:
funder = x_train.funder.str.lower().value_counts()
funder = funder[funder > 250]
x_train['funder_mod'] = [i if i in funder else 'Unknown' for i in x_train.funder]

In [21]:
import random
from random import sample

validation_set = sample(list(x_train.id), len(x_train)//5)

In [22]:
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
def create_randomforest_model(feature_set, cat_var):
    categorical_bool = [True if i in cat_var else False for i in feature_set]
    le = preprocessing.LabelEncoder()
    xx_train = pd.get_dummies(x_train[feature_set], columns=cat_var)
    xxx_train = xx_train.loc[~x_train.id.isin(validation_set)]
    xxx_validate = xx_train.loc[x_train.id.isin(validation_set)]
    yy_train = y_train.loc[~y_train.id.isin(validation_set)].label
    yy_validate = y_train.loc[y_train.id.isin(validation_set)].label
    clf = RandomForestClassifier()
    clf.fit(xxx_train, yy_train)
    
    return clf, xxx_validate, pd.DataFrame(yy_validate)

In [23]:
feature_set_4 = ['amount_tsh', 'population', 'funder_mod', 'district_code', 'basin', 'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type', 'management', 'payment', 'water_quality', 'quantity', 'source', 'waterpoint_type']
categorical_variables = ['funder_mod', 'district_code', 'basin', 'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type', 'management', 'payment', 'water_quality', 'quantity', 'source', 'waterpoint_type']

model, xxx_validate, yy_validate = create_randomforest_model(feature_set_4, categorical_variables)

pred = model.predict(xxx_validate)
yy_validate['pred'] = pred
(yy_validate.pred == yy_validate.label).value_counts()


True     9389
False    2491
dtype: int64

In [26]:
feature_set_5 = ['amount_tsh', 'population', 'funder_mod', 'district_code', 'gps_height', 'region', 'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type_group', 'management_group', 'payment_type', 'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group']
categorical_variables = ['funder_mod', 'district_code', 'region', 'public_meeting', 'scheme_management', 'permit', 'extraction_type_group', 'management_group', 'payment_type', 'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group']

model, xxx_validate, yy_validate = create_randomforest_model(feature_set_5, categorical_variables)

pred = model.predict(xxx_validate)
yy_validate['pred'] = pred
(yy_validate.pred == yy_validate.label).value_counts()

True     9602
False    2278
dtype: int64

In [210]:
installer = x_train.funder.str.lower().value_counts()
installer = funder[funder > 250]
installer
x_train['installer_mod'] = [i if i in installer else 'Unknown' for i in x_train.installer]

In [213]:
# Test to see if additional features are having an impact

feature_set_6 = ['amount_tsh', 'population', 'waterpoint_type_group']
categorical_variables = [ 'waterpoint_type_group']

model, xxx_validate, yy_validate = create_randomforest_model(feature_set_6, categorical_variables)

pred = model.predict(xxx_validate)
yy_validate['pred'] = pred
(yy_validate.pred == yy_validate.label).value_counts()

True     7671
False    4209
dtype: int64

In [328]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

feature_set_7 = ['amount_tsh', 'population', 'funder_mod', 'district_code', 'gps_height', 'region', 'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type_group', 'management_group', 'payment_type', 'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group']
numeric_features = ['amount_tsh', 'population', 'gps_height', 'construction_year']
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])

categorical_features = ['funder_mod', 'district_code', 'region', 'public_meeting', 'scheme_management', 'permit', 'extraction_type_group', 'management_group', 'payment_type', 'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)
    ])

clf = Pipeline(steps=[('preprocessor', preprocessor)
                     ])

xx_train, xx_test, yy_train, yy_test = train_test_split(x_train[feature_set_7], y_train, test_size=0.2,
                                                    random_state=0)

clf.fit(xx_train[feature_set_7])


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['funder_mod',
                                                   'district_code', 'region',
                                                   'public_meeting',
                                                   'scheme_management',
                                                   'permit',
                                                   'extraction_type_group',
                                                   'management_group',
                                                   'payment_type',
                                                   'quality_group',
                                                   'quantity_group',
                           

In [329]:
xxx_train = pd.DataFrame(clf.transform(xx_train).todense())
xxx_test = pd.DataFrame(clf.transform(xx_test).todense())
yyy_train = pd.DataFrame(yy_train['label'])
yyy_test = pd.DataFrame(yy_test['label'])

In [310]:
from keras.models import Sequential, Model
from keras.layers import Dense, concatenate, Input
from tensorflow.keras.optimizers import Adagrad, Adam
from keras.callbacks import EarlyStopping

In [314]:
ip = Input(shape=xxx_train.shape[1], name='ip')
x1 = Dense(250, activation='relu')(ip)
x2 = Dense(100, activation='relu')(x1)
x3 = Dense(50, activation='relu')(x2)
x4 = Dense(10, activation='relu')(x3)
output = Dense(1, activation='sigmoid')(x4)

model = Model(inputs = ip, outputs=output)
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [338]:
model.fit(xxx_train, yyy_train, epochs=25, batch_size = 16)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fcb7a2cb890>

In [344]:
pred_raw = model.predict(xxx_test).flatten()
pred = [round(i) for i in pred_raw]

In [346]:
yyy_test['pred'] = pred
yyy_test['correct'] = (yyy_test.label == yyy_test.pred)

In [348]:
yyy_test.correct.value_counts()

True     9450
False    2430
Name: correct, dtype: int64

In [377]:
validation_set = sample(list(x_train.id), len(x_train)//5)

In [378]:
feature_set_8 = ['amount_tsh', 'population', 'funder_mod', 'district_code', 'gps_height', 'region', 'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type_group', 'management_group', 'payment_type', 'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group']
numeric_features = ['amount_tsh', 'population', 'gps_height', 'construction_year']
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])

categorical_features = ['funder_mod', 'district_code', 'region', 'public_meeting', 'scheme_management', 'permit', 'extraction_type_group', 'management_group', 'payment_type', 'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)
    ])

clf = Pipeline(steps=[('preprocessor', preprocessor)
                     ])
xx_train, xx_test, yy_train, yy_test = train_test_split(x_train[feature_set_7], y_train, test_size=0.2,
                                                    random_state=0)

clf.fit(xx_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['funder_mod',
                                                   'district_code', 'region',
                                                   'public_meeting',
                                                   'scheme_management',
                                                   'permit',
                                                   'extraction_type_group',
                                                   'management_group',
                                                   'payment_type',
                                                   'quality_group',
                                                   'quantity_group',
                           

In [379]:
xxx_train = pd.DataFrame(clf.transform(xx_train).todense())
xxx_test = pd.DataFrame(clf.transform(xx_test).todense())
yyy_train = pd.DataFrame(yy_train['label'])
yyy_test = pd.DataFrame(yy_test['label'])

In [380]:
grad_boost_model = HistGradientBoostingClassifier()
grad_boost_model.fit(xxx_train, yyy_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



HistGradientBoostingClassifier()

In [381]:
random_forest_model = RandomForestClassifier()
random_forest_model.fit(xxx_train, yyy_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



RandomForestClassifier()

In [382]:
ip = Input(shape=xxx_train.shape[1], name='ip')
x1 = Dense(250, activation='relu')(ip)
x2 = Dense(100, activation='relu')(x1)
x3 = Dense(50, activation='relu')(x2)
x4 = Dense(10, activation='relu')(x3)
output = Dense(1, activation='sigmoid')(x4)

nn_model = Model(inputs = ip, outputs=output)
nn_model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])
nn_model.fit(xxx_train, yyy_train, epochs=25, batch_size = 16)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fcb79517290>

In [383]:
grad_boost_pred = grad_boost_model.predict(xxx_test)
random_forest_pred = random_forest_model.predict(xxx_test)
nn_pred = nn_model.predict(xxx_test)

In [384]:
yyy_test['gb_pred'] = grad_boost_pred
yyy_test['rf_pred'] = random_forest_pred
yyy_test['nn_pred_raw'] = nn_pred

In [385]:
yyy_test['nn_pred'] = round(yyy_test.nn_pred_raw)

In [386]:
yyy_test

Unnamed: 0,label,gb_pred,rf_pred,nn_pred_raw,nn_pred
11524,1,1,1,0.568859,1.0
16731,0,0,0,0.039152,0.0
48776,0,1,1,0.526282,1.0
23300,1,1,1,0.901758,1.0
25270,0,1,0,0.130993,0.0
...,...,...,...,...,...
1572,1,1,1,0.777077,1.0
37939,0,1,1,0.636074,1.0
35407,0,0,0,0.185051,0.0
50590,0,0,0,0.017304,0.0


In [387]:
yyy_test["sum"] = yyy_test.gb_pred + yyy_test.rf_pred + yyy_test.nn_pred

In [388]:
yyy_test["consensus"] = yyy_test['sum'].apply(lambda x: 1 if x in [2, 3] else 0)

In [389]:
(yyy_test.consensus == yyy_test.label).value_counts()

True     9604
False    2276
dtype: int64

In [392]:
(yyy_test.rf_pred == yyy_test.label).value_counts()

True     9620
False    2260
dtype: int64