## Contest Submission

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, f1_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [2]:
# Import preprocessed training data
train_df = pd.read_csv('train_for_models.csv')
train_df.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,population,public_meeting,scheme_management,permit,construction_year,extraction_type_class,payment_type,water_quality,quantity,source,waterpoint_type,status_group
0,6000.0,995,other,1390,other,Lake Nyasa,109,True,VWC,False,1990s,gravity,annually,soft,enough,spring,communal standpipe,functional
1,0.0,272,other,1399,other,Lake Victoria,280,unknown,other,True,2010s,gravity,never pay,soft,insufficient,rainwater harvesting,communal standpipe,functional
2,25.0,281,other,686,other,Pangani,250,True,VWC,True,2000s,gravity,per bucket,soft,enough,dam,communal standpipe multiple,functional
3,0.0,309,unicef,263,other,Ruvuma / Southern Coast,58,True,VWC,True,1980s,submersible,never pay,soft,dry,machine dbh,communal standpipe multiple,non functional
4,0.0,874,other,0,other,Lake Victoria,0,True,other,True,unknown,gravity,never pay,soft,seasonal,rainwater harvesting,communal standpipe,functional


In [3]:
# Splitting the outcome and the predictor variables
target = train_df['status_group']
train_df = train_df.drop('status_group', axis=1)

In [4]:
# Create dummy variables
train_df = pd.get_dummies(train_df)
train_df.head()

Unnamed: 0,amount_tsh,days_since_recorded,gps_height,population,funder_danida,funder_gov,funder_hesawa,funder_kkkt,funder_other,funder_rwssp,...,source_shallow well,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
0,6000.0,995,1390,109,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
1,0.0,272,1399,280,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,25.0,281,686,250,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0.0,309,263,58,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0.0,874,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [5]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.25, random_state=42)

In [6]:
# Creating a Random Forest model using the best parameters I found
best_forest=RandomForestClassifier(criterion='entropy', max_depth=None, min_samples_leaf=3, min_samples_split=5, n_estimators=100)
best_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [7]:
best_forest.score(X_train, y_train)

0.8426487093153759

In [8]:
best_forest.score(X_test, y_test)

0.7952188552188553

In [9]:
best_forest_preds = best_forest.predict(X_test)
print(classification_report(y_test, best_forest_preds))

                         precision    recall  f1-score   support

             functional       0.77      0.92      0.84      8098
functional needs repair       0.65      0.22      0.33      1074
         non functional       0.85      0.72      0.78      5678

               accuracy                           0.80     14850
              macro avg       0.76      0.62      0.65     14850
           weighted avg       0.79      0.80      0.78     14850



In [10]:
# import test set and submission form
test_set = pd.read_csv('test_for_models.csv')
test_set.head()

Unnamed: 0,id,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,population,public_meeting,scheme_management,permit,construction_year,extraction_type_class,payment_type,water_quality,quantity,source,waterpoint_type
0,50785,0.0,302,other,1996,other,Internal,321,True,parastatal,True,2010s,other,never pay,soft,seasonal,rainwater harvesting,other
1,51630,0.0,302,gov,1569,dwe,Pangani,300,True,VWC,True,2000s,gravity,never pay,soft,insufficient,spring,communal standpipe
2,17168,0.0,305,other,1567,other,Internal,500,True,VWC,unknown,2010s,other,never pay,soft,insufficient,rainwater harvesting,other
3,45559,0.0,315,other,267,other,Ruvuma / Southern Coast,250,unknown,VWC,True,1980s,other,unknown,soft,dry,shallow well,other
4,49871,500.0,251,other,1260,other,Ruvuma / Southern Coast,60,unknown,water_board,True,2000s,gravity,monthly,soft,enough,spring,communal standpipe


In [11]:
test_set = pd.get_dummies(test_set)
test_set.head()

Unnamed: 0,id,amount_tsh,days_since_recorded,gps_height,population,funder_danida,funder_gov,funder_hesawa,funder_kkkt,funder_other,...,source_shallow well,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
0,50785,0.0,302,1996,321,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,51630,0.0,302,1569,300,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,17168,0.0,305,1567,500,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,45559,0.0,315,267,250,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,49871,500.0,251,1260,60,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0


In [12]:
test_set_1 = test_set.drop(['id', 'installer_danida'], axis=1)

In [13]:
test_set_predict = best_forest.predict(test_set_1)

In [14]:
test_set['status_group'] = test_set_predict

In [15]:
test_set.head()

Unnamed: 0,id,amount_tsh,days_since_recorded,gps_height,population,funder_danida,funder_gov,funder_hesawa,funder_kkkt,funder_other,...,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,status_group
0,50785,0.0,302,1996,321,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,functional
1,51630,0.0,302,1569,300,0,1,0,0,0,...,1,0,0,1,0,0,0,0,0,functional
2,17168,0.0,305,1567,500,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,functional
3,45559,0.0,315,267,250,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,non functional
4,49871,500.0,251,1260,60,0,0,0,0,1,...,1,0,0,1,0,0,0,0,0,functional


In [18]:
submission_form = test_set[['id', 'status_group']].copy()
submission_form.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [19]:
submission_form.to_csv('submission_form.csv', index=False)