In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
import os
import math
import re
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Root directory that the dataset paths below are resolved against.
BASE_DIR = '../'

# The train / test / validation CSVs sit side by side under
# <BASE_DIR>/data/bank_dataset/, so build the three paths in one pass.
train_path, test_path, val_path = [
    os.path.join(BASE_DIR, 'data', 'bank_dataset', csv_name)
    for csv_name in ('bank_train.csv', 'bank_test.csv', 'bank_val.csv')
]

In [5]:
# Part C Random Forest
def load_data_rf(numeric_cols, filename, values_dict=None):
    """Load a ';'-delimited CSV and one-hot encode its non-numeric columns.

    The file must contain a label column named ``y``; rows whose label is the
    string ``'yes'`` become 1, every other value (including missing) becomes 0.

    The function works in two modes, selected by ``values_dict``:

    * fit mode (``values_dict`` is ``None`` or empty): the categories of each
      non-numeric column are learned from this file, in order of first
      appearance, and returned so other splits can reuse them.
    * transform mode (``values_dict`` is non-empty): the supplied categories
      are used; a value that is not listed encodes as an all-zero row for
      that column.

    Parameters
    ----------
    numeric_cols : collection of str
        Column names to pass through unchanged.
    filename : str or path-like
        Path of the CSV file to read.
    values_dict : dict, optional
        Mapping of column name -> list of known category values. If an empty
        dict is passed explicitly it is populated in place and returned.

    Returns
    -------
    (X, Y, values_dict) in fit mode, (X, Y) in transform mode.
        ``X`` is a DataFrame in which every non-numeric column has been
        replaced by one int64 indicator column per category, named
        ``"<column>_<category>"``. ``Y`` is an int64 NumPy array of labels.

    Notes
    -----
    Missing values in a categorical column are not given a category of their
    own; they encode as all zeros, like any unseen value.
    """
    # A fresh dict per call: a shared mutable default would be filled in by
    # the first call and silently flip every later default call into
    # transform mode.
    fit_mode = not values_dict
    if values_dict is None:
        values_dict = {}

    raw = pd.read_csv(filename, delimiter=';')

    # Anything that is not exactly 'yes' (including NaN) is labelled 0.
    Y = (raw['y'] == 'yes').to_numpy().astype('int64')
    features = raw.drop(columns=['y'])

    encoded_parts = []
    for col in features.columns:
        if col in numeric_cols:
            encoded_parts.append(features[[col]])
            continue

        if fit_mode:
            # First-appearance order keeps the column layout deterministic
            # across runs (set() ordering of strings is not).
            values_dict[col] = features[col].dropna().unique().tolist()
        categories = values_dict[col]

        # One vectorised comparison per category instead of writing an array
        # into every cell; the result is plain numeric columns.
        indicators = {
            '{}_{}'.format(col, cat): (features[col] == cat).astype('int64')
            for cat in categories
        }
        encoded_parts.append(pd.DataFrame(indicators, index=features.index))

    # pd.concat rejects an empty list, which happens when 'y' is the only column.
    X = pd.concat(encoded_parts, axis=1) if encoded_parts else features

    if fit_mode:
        return X, Y, values_dict
    return X, Y

In [6]:
PART = 'c'

# Feature columns of the bank dataset, split by how they are encoded.
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]
numeric_cols = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
]

# The training split is loaded with an empty mapping so the category values
# are learned from it; the returned mapping is then reused for the other
# two splits so all three share one encoding.
print("Loading train data...")
Xtrain_rf, Ytrain_rf, values_dict_rf = load_data_rf(
    numeric_cols=numeric_cols,
    filename=train_path,
    values_dict={},
)

print("Loading test data...")
Xtest_rf, Ytest_rf = load_data_rf(
    numeric_cols=numeric_cols,
    filename=test_path,
    values_dict=values_dict_rf,
)

print("Loading val data...")
Xval_rf, Yval_rf = load_data_rf(
    numeric_cols=numeric_cols,
    filename=val_path,
    values_dict=values_dict_rf,
)

Loading train data...


KeyboardInterrupt: 

In [None]:
# Running record of the best configuration seen so far. The accuracy starts
# at -1 so that any real accuracy score replaces these placeholder values.
best_params = dict(
    num_estimators=14,
    bootstrap=True,
    num_features=52,
    accuracy=-1,
)

# Candidate values for the search.
n_estimators = list(range(50, 451, 100))      # 50, 150, 250, 350, 450
# NOTE(review): max_features and min_samples_split are not read by the
# cells visible here - confirm whether they are still needed.
max_features = [0.1, 0.3, 0.5, 0.7, 0.9]
min_samples_split = list(range(2, 11, 2))     # 2, 4, 6, 8, 10

bootstrap_list = [True, False]
# Integer feature counts from 1 up to (but not including) the column count.
max_features_list = np.arange(1, Xtrain_rf.shape[1])

In [None]:
# Exhaustive search over (n_estimators, bootstrap, max_features), keeping the
# combination with the highest accuracy on the validation split.
for n in n_estimators:
    for bs in bootstrap_list:
        for m in max_features_list:
            rf = RandomForestClassifier(
                n_estimators=n,
                criterion="entropy",
                bootstrap=bs,
                max_features=int(m),
                min_samples_split=10,
                # Fixed seed: without it each forest is built differently on
                # every run, so the selected "best" configuration would not
                # be reproducible.
                random_state=0,
                # Trees are independent; build them on all available cores.
                n_jobs=-1,
            )
            rf.fit(Xtrain_rf, Ytrain_rf.ravel())
            acc = accuracy_score(Yval_rf, rf.predict(Xval_rf))
            # Strict '>' keeps the first configuration reached on ties.
            if acc > best_params['accuracy']:
                print("Accuracy: ", acc)
                best_params['num_estimators'] = n
                best_params['bootstrap'] = bs
                best_params['num_features'] = int(m)
                best_params['accuracy'] = acc

print(best_params)