## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.set_option('display.max_columns', None)

## Load training and test data

In [None]:
# Read the cleaned feature tables and the label table.
train = pd.read_csv('./Data/train_cleaned_v0.2.csv')
test = pd.read_csv('./Data/test_cleaned_v0.2.csv')
labels = pd.read_csv('./Data/training_set_labels.csv')

# Attach the labels by id, then split the label column off as the target.
train = pd.merge(train, labels, on="id")
target = train.pop("status_group")

# Flag the origin of every row so the two sets can be separated again
# after the shared preprocessing.
for frame, flag in ((train, 1), (test, 0)):
    frame['train'] = flag

# Stack both sets; each frame keeps its own index labels (no ignore_index),
# so labels may repeat across the train and test parts.
data = pd.concat([train, test])
data.head()

## Data preparation

In [None]:
engineered_features = [
    'amount_tsh',
    'longitude',
    'latitude',
    'population',
    'construction_year',
    'gps_height',
    'operation_years',
]

# Each engineered feature comes with alternative imputation columns
# (mean/median, normal distribution, random choice); keep one, drop the other two.
# To use another variant instead, copy it over the base column first, e.g.
#   data[feature] = data[feature + '_imp_random_choice']
#   data[feature] = data[feature + '_imp_normal']
unused_variants = ('imp_normal', 'imp_random_choice')
unused_columns = [
    '_'.join([feature, variant])
    for feature in engineered_features
    for variant in unused_variants
]
data.drop(unused_columns, axis=1, inplace=True)

# optional: drop additional columns
# data.drop(['region_code','lga','district_code','scheme_name'],axis=1,inplace=True)

In [None]:
# scale numeric features (optional, not necessary for tree-based methods)

#num_features=['latitude','longitude','operation_years', 'gps_height', 'population','amount_tsh','construction_year']
#scaler = MinMaxScaler()

#data[num_features] = scaler.fit_transform(data[num_features])

In [None]:
# Integer-encode the categorical features for evaluation.
# Each column's missing values are first turned into an explicit 'Missing'
# category so that pd.factorize gives them a regular code instead of -1.

# NOTE: 'payment_type' was listed twice in the original list; the repeat only
# re-factorized an already encoded column, so it is removed here.
cat_features = [
    'funder', 'installer', 'basin', 'region', 'public_meeting',
    'scheme_management', 'permit', 'extraction_type', 'management',
    'payment_type', 'water_quality', 'quantity', 'source',
    'waterpoint_type', 'ward', 'subvillage', 'lga', 'scheme_name',
]

for var in cat_features:
    # Assign the result back instead of calling replace(..., inplace=True) on
    # data[var]: the in-place form mutates a temporary Series, which warns on
    # recent pandas and has no effect on `data` under Copy-on-Write.
    filled = data[var].fillna('Missing')
    data[var] = pd.factorize(filled)[0]

In [None]:
# extract training/test sets

# Separate the combined frame back into its training and test parts using the
# 'train' flag, and remove the helper/identifier columns from the features.
# drop() is used without inplace=True so that no filtered slice of `data` is
# mutated (which would raise SettingWithCopyWarning).
train_df = data[data["train"] == 1].drop(["train", "id"], axis=1)

test_rows = data[data["train"] == 0]

# Keep the test ids (with their original index labels) for the submission file.
id_test = test_rows['id']
test_df = test_rows.drop(["train", "id"], axis=1)

## Train/Test split

In [None]:
# train/test split (not necessary if desired output is submission file)

#from sklearn.model_selection import train_test_split

#X = train_df
#Y = target
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7, stratify=Y)

## Create random grid

In [None]:
# Candidate values for each random-forest hyperparameter; the randomized
# search samples combinations from this grid.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features to consider at every split.
# 'auto' is no longer accepted by scikit-learn (removed in 1.3) and was only an
# alias of 'sqrt' for classifiers, so it is replaced by a distinct valid option.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
print(random_grid)

## Random grid to search for best hyperparameters

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base model whose hyperparameters are tuned (library defaults otherwise).
rf = RandomForestClassifier()

# Randomized search over `random_grid`:
#   - 100 sampled parameter combinations, each scored with 3-fold CV
#   - random_state fixes which combinations are sampled
#   - n_jobs=-1 uses all available cores; verbose=20 logs progress
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=20,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the prepared training features and labels.
rf_random.fit(train_df, target)

In [None]:
# Hyperparameter combination selected by the randomized search.
rf_random.best_params_

In [None]:
# Mean cross-validated score of the selected combination (the estimator's
# default scorer is used, since no `scoring` argument is passed to the search).
rf_random.best_score_

## Evaluate Random Search

In [None]:
# not necessary if desired output is submission file

#def evaluate(model, test_set, test_labels):
#    predictions = model.predict(test_set)
#    evaluation_df = pd.DataFrame(list())
#    evaluation_df['true_values'] = list(test_labels)
#    evaluation_df['predicted_values'] = list(predictions)
#    correct_predictions = len(evaluation_df[evaluation_df['true_values'] == evaluation_df['predicted_values']])
#    classification_rate = correct_predictions / len(predictions)
    
#    print('Model Performance')
#    print('Accuracy = {:0.4f}%.'.format(classification_rate))
    
#    return classification_rate

In [None]:
# The search is created with the default refit=True, so best_estimator_ has
# already been retrained on the complete (train_df, target) data with the best
# parameters. Fitting it again would only repeat that (potentially very
# expensive) training, so the estimator is used as-is.
best_random = rf_random.best_estimator_
best_random


# following not possible if desired output is submission file

#base_model = RandomForestClassifier(n_estimators = 1000, random_state = 42)
#base_model.fit(X_train, Y_train)
#base_accuracy = evaluate(base_model, X_test, Y_test)

#random_accuracy = evaluate(best_random, X_test, Y_test)

#print('Improvement of {:0.4f}%.'.format(random_accuracy - base_accuracy))

In [None]:
# Predict the label of every test row with the tuned model.
predicted_labels = best_random.predict(test_df)

# Pair ids and predictions positionally. Assigning the `id_test` Series to a
# freshly built frame would align on index labels instead and silently yield
# NaN ids whenever the test rows do not carry a 0..n-1 index.
predictions = pd.DataFrame({
    'id': id_test.values,
    'status_group': predicted_labels,
})
predictions = predictions[['id', 'status_group']]

In [None]:
# convert into submission format

# Bring the predictions into the row order/layout of the submission template.
formatsub = pd.read_csv('./Data/submission_format.csv')

# The inner join on id yields the template's label column (suffix _x) and the
# predicted one (suffix _y); the template's column is dropped and the remaining
# columns are renamed.
submission_format = (
    formatsub.merge(predictions, on='id', how='inner')
    .drop('status_group_x', axis=1)
)
submission_format.columns = ['id', 'status_group']

submission_format.to_csv('./Results/submission_format_rename.csv', index=False)
submission_format.head()

## Get Feature Importances (if applicable)

In [None]:
# Importance of each feature as reported by the fitted forest
importances = list(best_random.feature_importances_)

# Pair each training column with its importance, rounded to two decimals
feature_importances = [
    (column, round(score, 2))
    for column, score in zip(train_df.columns, importances)
]

# Order by (rounded) importance, largest first; ties keep column order
feature_importances.sort(key=lambda entry: entry[1], reverse=True)

# Report one line per feature
for column, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(column, score))