In [1]:
# required imports
import numpy as np
import pandas as pd
from imblearn.under_sampling import NearMiss, ClusterCentroids, RandomUnderSampler
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Load the already pre-processed training set from the working directory.
df = pd.read_csv(filepath_or_buffer='Train_updated.csv')

In [2]:
# Split the frame into the label column (Y) and the remaining feature columns (X).
Y = df['target']
X = df.drop(columns='target')

In [4]:
# Balance the classes by randomly undersampling; the fixed seed keeps the
# selected rows the same from run to run.
rus = RandomUnderSampler(random_state=0)
X_rus, Y_rus = rus.fit_resample(X=X, y=Y)

## RandomizedSearch on Random Forest ##

In [None]:
# --- Candidate values for each random-forest hyperparameter ---

# Tree counts: three evenly spaced values between 100 and 250.
n_estimators = np.linspace(start=100, stop=250, num=3).astype(int).tolist()

# Tree depth limits: five evenly spaced values between 5 and 15 (truncated to int).
max_depth = np.linspace(5, 15, num=5).astype(int).tolist()
print(max_depth)
# None (no depth limit) is added after printing, so it is not in the printed list.
max_depth = max_depth + [None]

# Minimum samples needed to split an internal node / to remain in a leaf.
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the search space.
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    bootstrap=bootstrap,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# Seeded forest + seeded search: 100 sampled candidates, 5-fold CV, all cores.
rfc = RandomForestClassifier(random_state=42)
clf = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)
clf.fit(X_rus, Y_rus)

[5, 7, 10, 12, 15]


## RandomizedSearch on Logistic Regression ##

In [None]:
from sklearn import linear_model
from scipy.stats import uniform

# The search below tries both 'l1' and 'l2' penalties. The default 'lbfgs'
# solver rejects 'l1', which would make every l1 candidate fail to fit.
# 'liblinear' supports both penalties, so every sampled candidate is valid.
logistic = linear_model.LogisticRegression(solver='liblinear')

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter distribution using uniform distribution
# (inverse regularization strength C drawn uniformly from [0, 4])
C = uniform(loc=0, scale=4)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Seeded randomized search: 100 sampled candidates, 5-fold CV, all cores.
clf = RandomizedSearchCV(logistic, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)
clf.fit(X_rus, Y_rus)

## RandomizedSearch on Decision Tree ##

In [None]:
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier

# Search space: fixed depth options, and integer distributions (1..8 inclusive,
# since randint's upper bound is exclusive) for max_features / min_samples_leaf.
# NOTE(review): max_features up to 8 assumes the data has at least 8 feature
# columns -- confirm against Train_updated.csv.
param_dist = {"max_depth": [None, 2, 5, 10, 12, 15],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier: tree
# Seeded so that the feature subsampling triggered by max_features is repeatable.
tree = DecisionTreeClassifier(random_state=42)

# Instantiate the RandomizedSearchCV object: tree_cv
# Seeded (like the other searches in this notebook) so the same candidates are
# drawn from the distributions on every run.
clf = RandomizedSearchCV(tree, param_dist, cv=5, random_state=42)

# Fit it to the data
clf.fit(X_rus,Y_rus)


In [None]:
# Load the already pre-processed test features from the working directory.
X_test = pd.read_csv(filepath_or_buffer='Test_updated.csv')

In [None]:
# Score the test set with the best model from the most recent search (features
# include qc_score) and save the second probability column to 'predictions.csv'.
test_pred = clf.best_estimator_.predict_proba(X_test)
second_class_proba = test_pred[:, 1]
pd.DataFrame({'target': second_class_proba}).to_csv('predictions.csv', index=False)

In [None]:
# Build copies of the train and test feature frames without the qc_score column,
# for evaluating a model that does not see qc_score.
X_nq = X.drop(columns='qc_score')
X_test_nq = X_test.drop(columns='qc_score')

In [None]:
# Predicting probabilities of each class and writing the results to 'predictions.csv' -- Excluding qc_score
# The estimator held by clf was fitted on features that include qc_score, so it
# cannot score X_test_nq directly. Refit an unfitted copy (same best
# hyperparameters) on the undersampled training data without qc_score; the same
# sampler seed as the earlier undersampling step is used.
# NOTE(review): the hyperparameters were tuned with qc_score present; rerun the
# search on the reduced data if a tuned no-qc_score model is needed.
X_rus_nq, Y_rus_nq = RandomUnderSampler(random_state = 0).fit_resample(X_nq, Y)
model_nq = clone(clf.best_estimator_).fit(X_rus_nq, Y_rus_nq)
test_nq_pred = model_nq.predict_proba(X_test_nq)
# Write the no-qc_score probabilities (previously test_pred was written again by
# mistake). This overwrites the file produced by the with-qc_score cell.
pd.DataFrame(test_nq_pred[:,1], columns=['target']).to_csv('predictions.csv', index=False)

## Classification Report ##

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import PrecisionRecallDisplay

# Evaluate the fitted model selected by the most recent search. The template
# estimator passed to RandomizedSearchCV is never fitted itself (the search fits
# clones), so the fitted model must be taken from clf.best_estimator_.
best_model = clf.best_estimator_

print("Training Set")
# classification_report needs hard class labels for the same rows as Y, so
# predict on the training features X (not the test-set probabilities).
# To evaluate without qc_score, use a model fitted without that column and
# predict on X_nq instead.
train_pred = best_model.predict(X)
print(classification_report(Y, train_pred, digits=5))

# Label the curve with the actual estimator class; named pr_display so the
# IPython display() helper is not shadowed.
pr_display = PrecisionRecallDisplay.from_estimator(best_model, X, Y, name=type(best_model).__name__)
_ = pr_display.ax_.set_title("2-class Precision-Recall curve")