# Random Forest Model
* In this notebook we take the pre-selected features from exploratory analysis and do some further feature optimisation through recursive feature elimintation.

* We then proceed to train a random forest model with the selected features and evaluate its performance on the test set.

* In addition, we also try out some minority class compensation techniques (proportional class weighting as well as minority over-sampling).

In [18]:
import sys
sys.path.append('..')
import warnings

from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib notebook
plt.style.use('ggplot')

In [19]:
def eval_model(test_x,test_y, rfc):
    pred = rfc.predict(test_x)
    from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score

    print(confusion_matrix(test_y,pred))
    print('Acc: ',accuracy_score(test_y,pred))
    print('Kappa: ',cohen_kappa_score(test_y,pred))

In [20]:
selected_features = [
    'gps_height',
    'latitude',
    'longitude',
    'population',
    'amount_tsh',
    'age_at_measurement',
    'payment_type',
    'management_group',
    'quality_group',
    'region',
    'basin',
    'extraction_type_class',
    'quantity_group',
    'waterpoint_type_group',
    'source_type',
    'source_class'
]

In [21]:
from data_loading import *
data = load_dataset('../data')
train_df, test_df = (data.pipe(data_cleaning)
            .pipe(numeric_groundtruth)
            .pipe(construction_year_feature)
            .pipe(feature_tsh_per_capita)
            .pipe(categorical_encoding)
            .pipe(feature_removal, selected_features=selected_features)
            .pipe(split_data, test_size=0.25))

Label distribution in training set:  Counter({0: 23519, 2: 16750, 1: 2922})
Label distribution in testing set:  Counter({0: 7870, 2: 5518, 1: 1009})


In [22]:
train_x = train_df[selected_features]
test_x = test_df[selected_features]

train_y = train_df.status_group.as_matrix()
test_y = test_df.status_group.as_matrix()

# Use random forests to optimise the feature set using cross-validated recursive feature elimination.

In [23]:
algo = RandomForestClassifier(n_estimators=100)
selector = RFE(estimator=algo)

selector.fit(train_x, train_y)

print("Optimal number of features : %d" % selector.n_features_)

Optimal number of features : 8


In [24]:
import numpy as np
print('Discarded features by feature selector: ')
print(np.array(selected_features)[[not s for s in selector.get_support()]])
train_x = train_x[train_x.columns[selector.get_support()]]
test_x = test_x[test_x.columns[selector.get_support()]]

Discarded features by feature selector: 
['amount_tsh' 'payment_type' 'management_group' 'quality_group' 'region'
 'basin' 'source_type' 'source_class']


# Fit a large random forest classifier on selected features and evaluate it

In [25]:
np.random.seed(42)
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=4)
rfc.fit(train_x, train_y)
eval_model(test_x,test_y, rfc)

[[6881  232  757]
 [ 510  305  194]
 [1131  111 4276]]
Acc:  0.7961380843231228
Kappa:  0.6183221279712713


# Try balancing for minority class by automatically adjusting  class weights

In [26]:
np.random.seed(42)
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=4,class_weight='balanced_subsample')
rfc.fit(train_x, train_y)
eval_model(test_x,test_y, rfc)

[[6907  216  747]
 [ 534  295  180]
 [1167  102 4249]]
Acc:  0.7953740362575537
Kappa:  0.6156415025337256


# Try some minority class over-sampling to compensate for class imbalance issues

In [27]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(n_jobs=4, k_neighbors=5)
train_x_smote, train_y_smote = smote.fit_sample(train_x,train_y)

np.random.seed(42)
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=4)

rfc.fit(train_x_smote, train_y_smote)
eval_model(test_x,test_y, rfc)

[[6382  626  862]
 [ 381  450  178]
 [ 968  274 4276]]
Acc:  0.7715496283948045
Kappa:  0.5908600300785932


# Undersampling

In [28]:
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_sample(train_x, train_y)

np.random.seed(42)
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=4)

rfc.fit(X_resampled, y_resampled)
eval_model(test_x,test_y, rfc)

[[2011 4766 1093]
 [  60  892   57]
 [ 336 2715 2467]]
Acc:  0.3729943738278808
Kappa:  0.18735099287061874
