# 3950 Assignment 1: Part 2 #

In [1]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestClassifier

# Seaborn for fancy plots 
import matplotlib.pyplot as plt 
import seaborn as sns 
plt.rcParams ["figure.figsize"] = (8,8)

### Loading Data ###

In [2]:
name = "Songhyun Lee"
show_eda = False 

df = pd.read_csv ("training.csv")
df = df.drop (columns = {"id"})
df.sample(5)

Unnamed: 0,target,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,var_200
104,1,0.706,0.219,0.331,0.979,0.274,0.117,0.093,0.584,0.038,...,0.08,0.657,0.22,0.08,0.542,0.967,0.445,0.193,0.096,0.885
114,1,0.35,0.97,0.037,0.988,0.868,0.466,0.642,0.141,0.756,...,0.836,0.436,0.742,0.945,0.656,0.503,0.211,0.778,0.689,0.057
118,1,0.687,0.969,0.013,0.149,0.122,0.497,0.177,0.655,0.854,...,0.76,0.719,0.721,0.517,0.41,0.827,0.474,0.28,0.308,0.822
207,1,0.988,0.257,0.065,0.737,0.623,0.694,0.983,0.552,0.719,...,0.889,0.508,0.331,0.257,0.984,0.225,0.422,0.512,0.884,0.784
161,1,0.654,0.273,0.978,0.111,0.16,0.113,0.641,0.834,0.108,...,0.218,0.493,0.456,0.644,0.535,0.436,0.196,0.97,0.175,0.916


### Data Preparation ###

In [5]:
y = np.array(df["target"]).reshape(-1,1)
x = np.array(df.drop(columns = {"target"}))
x_train, x_test, y_train, y_test = train_test_split(x,y)

### Modelling ###

In [21]:
def sklearn_to_df (sklearn_dataset):
    df = pd.DataFrame(sklearn_dataset.data, columns = sklearn_dataset.feature_names)
    df['target'] = pd.Series(sklearn_dataset.target)
    return df 

from sklearn.model_selection import GridSearchCV

scaler = StandardScaler()
estimator = RandomForestClassifier(n_jobs=-1)
pipe = Pipeline(steps=[("scaler", scaler), ("forrest", estimator)])

rf_para = {'min_samples_split':[8,9,10,11,12],
            'max_depth':[6,7,8,9,10],
            'n_estimators':[100,120,140],
            'criterion':["gini","entropy"],
            'max_samples':[.5, .6, .7, .8]}

clf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_para, cv=10, n_jobs=4) 
clf.fit(x_train, y_train.ravel())
clf.best_estimator_

### Finishing ###

In [22]:
best = clf.best_estimator_
print(best.score(x_test, y_test))
print(best)

0.6190476190476191
RandomForestClassifier(criterion='entropy', max_depth=7, max_samples=0.5,
                       min_samples_split=8, n_estimators=140)


### Testing ###

In [23]:
test_df = pd.read_csv("testing.csv")
test_df = test_df.drop(columns={"id"})

#Create tests and score
test_y = np.array(test_df["target"]).reshape(-1,1)
test_X = np.array(test_df.drop(columns={"target"}))

preds = best.predict(test_X)

roc_score = roc_auc_score(test_y, preds)
acc_score = accuracy_score(test_y, preds)

print(roc_score)
print(acc_score)
print(name, np.mean([roc_score, acc_score]))

0.6147738393571112
0.614126582278481
Songhyun Lee 0.614450210817796


### What Accuracy Changes were Used ###

- Grid Search 
- Cross Validation 
- StandardScaler 
- The hyperparameter grid for the RandomForestClassifier ws adjusted to fine-tune the model's performance, resulting in improved accuracy. 