## Imports

In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
pd.set_option('display.max_columns', None)

## Load training data

In [64]:
train = pd.read_csv('./Data/train_cleaned_v0.1.csv')

labels = pd.read_csv('./Data/training_set_labels.csv')
train = train.merge(labels, on="id")

target = train.pop("status_group")

train.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,operation_years
0,69572,6000.0,Roman,1390.0,Roman,34.938093,-9.856322,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109.0,True,VWC,Roman,False,1999.0,gravity,vwc,annually,soft,enough,spring,communal standpipe,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,109.0,109.0,1999.0,1999.0,6000.0,6000.0,12
1,8776,542.857143,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280.0,Missing,Other,Missing,True,2010.0,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,280.0,280.0,2010.0,2010.0,74.349311,1000.0,3
2,34310,25.0,Lottery Club,686.0,World vision,37.460664,-3.821329,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250.0,True,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,vwc,per bucket,soft,enough,dam,communal standpipe multiple,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,250.0,250.0,2009.0,2009.0,25.0,25.0,4
3,67743,525.0,Unicef,263.0,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58.0,True,VWC,Missing,True,1986.0,submersible,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,58.0,58.0,1986.0,1986.0,-409.294439,1000.0,27
4,19728,1062.351942,Action In A,1167.0,Artisan,31.130847,-1.825359,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,150.0,True,Missing,Missing,True,2000.0,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,31.130847,31.130847,-1.825359,-1.825359,1216.846742,1161.0,-333.803882,150.0,1996.732218,2008.0,1500.665788,300.0,11


## Advanced feature reduction and engineering

In [65]:
engineered_features = ['amount_tsh','longitude','latitude','population','construction_year','gps_height']

# drop unnecessary features
# drop 2 out of 3 from (mean/median, normal distribution, random choice)
for feature in engineered_features:
    train[feature] = train['_'.join([feature,'imp_random_choice'])]
    #train[feature] = train['_'.join([feature,'imp_normal'])]
    train.drop(['_'.join([feature,'imp_normal'])], axis=1, inplace=True)
    train.drop(['_'.join([feature,'imp_random_choice'])], axis=1, inplace=True)
#train.drop(['region_code','lga','district_code','scheme_name'],axis=1,inplace=True)
train.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,operation_years
0,69572,6000.0,Roman,1390.0,Roman,34.938093,-9.856322,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109.0,True,VWC,Roman,False,1999.0,gravity,vwc,annually,soft,enough,spring,communal standpipe,12
1,8776,1000.0,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280.0,Missing,Other,Missing,True,2010.0,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,3
2,34310,25.0,Lottery Club,686.0,World vision,37.460664,-3.821329,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250.0,True,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,vwc,per bucket,soft,enough,dam,communal standpipe multiple,4
3,67743,1000.0,Unicef,263.0,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58.0,True,VWC,Missing,True,1986.0,submersible,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,27
4,19728,300.0,Action In A,1161.0,Artisan,31.130847,-1.825359,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,150.0,True,Missing,Missing,True,2008.0,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,11


In [66]:
# scale numeric features

num_features=['latitude','longitude','operation_years', 'gps_height', 'population','amount_tsh']
scaler = MinMaxScaler()

train[num_features] = scaler.fit_transform(train[num_features])
train.head()

  return self.partial_fit(X, y)


Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,operation_years
0,69572,0.017142,Roman,0.517483,Roman,0.496455,0.168353,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,0.003541,True,VWC,Roman,False,1999.0,gravity,vwc,annually,soft,enough,spring,communal standpipe,0.316667
1,8776,0.002857,Grumeti,0.520629,GRUMETI,0.474167,0.892122,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,0.009148,Missing,Other,Missing,True,2010.0,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,0.166667
2,34310,7.1e-05,Lottery Club,0.271329,World vision,0.731374,0.734967,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,0.008164,True,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,vwc,per bucket,soft,enough,dam,communal standpipe multiple,0.183333
3,67743,0.002857,Unicef,0.123427,UNICEF,0.826875,0.046394,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,0.001869,True,VWC,Missing,True,1986.0,submersible,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,0.566667
4,19728,0.000857,Action In A,0.437413,Artisan,0.141899,0.922364,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0.004885,True,Missing,Missing,True,2008.0,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,0.3


In [67]:
# factorize features for evaluations

cat_features = ['funder','installer','basin','region','public_meeting','scheme_management','permit','extraction_type','management','payment_type','water_quality','payment_type','quantity','source','waterpoint_type','ward','subvillage','lga','scheme_name']

for var in cat_features:
    train[var].replace(np.nan, 'Missing', inplace=True)
    train[var] = pd.factorize(train[var])[0]

train.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,operation_years
0,69572,0.017142,0,0.517483,0,0.496455,0.168353,0,0,0,11,5,0,0,0.003541,0,0,0,0,1999.0,0,0,0,0,0,0,0,0.316667
1,8776,0.002857,1,0.520629,1,0.474167,0.892122,1,1,1,20,2,1,1,0.009148,1,1,1,1,2010.0,0,1,1,0,1,1,0,0.166667
2,34310,7.1e-05,2,0.271329,2,0.731374,0.734967,2,2,2,21,4,2,2,0.008164,0,0,2,1,2009.0,0,0,2,0,0,2,1,0.183333
3,67743,0.002857,3,0.123427,3,0.826875,0.046394,3,3,3,90,63,3,3,0.001869,0,0,1,1,1986.0,1,0,1,0,2,3,1,0.566667
4,19728,0.000857,4,0.437413,4,0.141899,0.922364,1,4,4,18,1,4,4,0.004885,0,2,1,1,2008.0,0,2,1,0,3,1,0,0.3


In [68]:
# delete 'id'
train.drop(['id'],axis=1, inplace=True)

## Train/Test split

In [69]:
# train/test split
from sklearn.model_selection import train_test_split

X = train
Y = target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7, stratify=Y)

## Create random grid

In [70]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


## Random grid to search for best hyperparameters

In [71]:
from sklearn.ensemble import RandomForestClassifier

# Use the random grid to search for best hyperparameters
# Create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=50, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  4.

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=50)

In [72]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 60,
 'bootstrap': False}

## Evaluate random search

In [38]:
def evaluate(model, test_set, test_labels):
    predictions = model.predict(test_set)
    evaluation_df = pd.DataFrame(list())
    evaluation_df['true_values'] = list(test_labels)
    evaluation_df['predicted_values'] = list(predictions)
    correct_predictions = len(evaluation_df[evaluation_df['true_values'] == evaluation_df['predicted_values']])
    classification_rate = correct_predictions / len(predictions)
    
    print('Model Performance')
    print('Accuracy = {:0.4f}%.'.format(classification_rate))
    
    return classification_rate

In [73]:
base_model = RandomForestClassifier(n_estimators = 1000, random_state = 42)
base_model.fit(X_train, Y_train)
base_accuracy = evaluate(base_model, X_test, Y_test)

best_random = rf_random.best_estimator_
best_random.fit(X_train, Y_train)
random_accuracy = evaluate(best_random, X_test, Y_test)

print('Improvement of {:0.4f}%.'.format(random_accuracy - base_accuracy))

Model Performance
Accuracy = 0.8085%.
Model Performance
Accuracy = 0.8109%.
Improvement of 0.0023%.


In [43]:
# Get numerical feature importances
importances = list(best_random.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(list(X_train.columns), importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: quantity             Importance: 0.12
Variable: longitude            Importance: 0.09
Variable: latitude             Importance: 0.09
Variable: subvillage           Importance: 0.06
Variable: waterpoint_type      Importance: 0.06
Variable: gps_height           Importance: 0.05
Variable: extraction_type      Importance: 0.05
Variable: funder               Importance: 0.04
Variable: ward                 Importance: 0.04
Variable: population           Importance: 0.04
Variable: construction_year    Importance: 0.04
Variable: operation_years      Importance: 0.04
Variable: amount_tsh           Importance: 0.03
Variable: installer            Importance: 0.03
Variable: lga                  Importance: 0.03
Variable: scheme_name          Importance: 0.03
Variable: payment_type         Importance: 0.03
Variable: source               Importance: 0.03
Variable: region               Importance: 0.02
Variable: region_code          Importance: 0.02
Variable: district_code        Importanc