In [0]:
import pandas as pd
import numpy as np
import random

from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split

from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import lightgbm as lgb
import os
import io

In [6]:
from google.colab import files
uploaded = files.upload()

Saving train_FE_Final.csv to train_FE_Final.csv
Saving test_FE_Final.csv to test_FE_Final.csv


In [0]:
train = pd.read_csv(io.BytesIO(uploaded['train_FE_Final.csv']))
test = pd.read_csv(io.BytesIO(uploaded['test_FE_Final.csv']))

In [0]:
# Drop unwanted column
train = train.drop("Unnamed: 0", axis = 1)
test = test.drop("Unnamed: 0", axis = 1)

# We won't touch test until predicting for submission

# We don't want the ID to be used in the model so preserve
train_id = train['id']
test_id = test['id']

train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

# Split X and y
X = train.loc[:, train.columns != "satisfied"]
y = train.satisfied

In [9]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 3096)

# Instantiate model default params
rf = RandomForestClassifier(random_state = 3096)

# Train the model on training data
rf.fit(X_train, y_train);

# Use the forest's predict method on the test data
expected_y = y_test
predicted_y = rf.predict(X_test)

# Summarize model fit
print(); print(metrics.classification_report(expected_y, predicted_y))
print(); print(metrics.confusion_matrix(expected_y, predicted_y))
print(); print(metrics.roc_auc_score(expected_y, predicted_y))


              precision    recall  f1-score   support

           0       0.80      0.78      0.79      4291
           1       0.80      0.82      0.81      4733

    accuracy                           0.80      9024
   macro avg       0.80      0.80      0.80      9024
weighted avg       0.80      0.80      0.80      9024


[[3338  953]
 [ 842 3891]]

0.8000036978127708


In [10]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [13]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 30, cv = 3, verbose=10, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   59.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 28.2min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed: 36.1min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed: 44.7min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 56.6min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 62.4min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [0]:
import pickle
# save the model to disk
filename = 'rf_model.sav'
pickle.dump(rf_random, open(filename, 'wb'))

In [0]:
# Predict
preds = rf_random.predict(X_test)

In [23]:
# AUC calculation
best_random = rf_random.best_estimator_
random_accuracy = metrics.roc_auc_score(y_test, preds)
random_accuracy

0.8077622112388594

In [0]:
train_label = train['satisfied']
del train['satisfied']

# Two sets of predictions: one on the training set (which we can use as a feature), one on the test set 
train_new = train.copy()
test_new = test.copy()

train_preds = pd.DataFrame(rf_random.predict_proba(train_new))
test_preds = pd.DataFrame(rf_random.predict_proba(test_new))

train_new['Predicted_RF'] = train_preds[1]
test_new['Predicted_RF'] = test_preds[1]

train_new['id'] = train_id
test_new['id'] = test_id

In [0]:

train_new[['id', 'Predicted_RF']].to_csv('train_fe_data_Final_RF_no_rounding.csv', 
                                         index = False, float_format = "%.8f")
test_new[['id', 'Predicted_RF']].to_csv('test_fe_data_Final_RF_no_rounding.csv', 
                                        index = False, float_format = "%.8f")