### Decision Tree classifier with RFE feature selection (Random Forest as the RFE estimator)

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import calibration_curve, CalibrationDisplay
from sklearn.metrics import roc_curve, auc
import joblib
from plot_metrics import *

In [None]:
def DT_best_params(X, y, params):
    """Fit a decision tree with the given hyperparameters and evaluate it.

    The data is split 80/20 into training and validation parts with a fixed
    seed, a ``DecisionTreeClassifier(**params)`` is fitted on the training
    part, and the validation part is used for metrics and plots.

    Args:
        X: Feature table passed to ``train_test_split``.
        y: Target; must expose ``.values`` (e.g. a pandas DataFrame/Series),
            since the training target is flattened via ``.values.ravel()``.
        params: Keyword arguments forwarded to ``DecisionTreeClassifier``.

    Returns:
        Tuple ``(pred_probs, accuracy, f1_score, precision, recall, roc_auc,
        cf_matrix)`` where ``pred_probs`` is the full ``predict_proba``
        output on the validation set and the remaining items are the
        validation metrics.
    """
    # Hold out 20% of the rows for validation; the seed keeps the split stable.
    features_fit, features_holdout, target_fit, target_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit on a flat 1-D target array.
    tree = DecisionTreeClassifier(**params)
    tree.fit(features_fit, target_fit.values.ravel())

    # Hard labels and per-class probabilities for the validation rows.
    holdout_labels = tree.predict(features_holdout)
    holdout_probs = tree.predict_proba(features_holdout)

    # Validation metrics; ROC AUC is scored on the second probability column.
    acc = metrics.accuracy_score(target_holdout, holdout_labels)
    f1 = metrics.f1_score(target_holdout, holdout_labels)
    prec = metrics.precision_score(target_holdout, holdout_labels)
    rec = metrics.recall_score(target_holdout, holdout_labels)
    confusion = metrics.confusion_matrix(target_holdout, holdout_labels)
    auc_score = metrics.roc_auc_score(target_holdout, holdout_probs[:, 1])

    # Diagnostic plots. The helpers below are not defined in this file
    # (presumably supplied by the star import from plot_metrics).
    # ROC curve
    plot_ROC(target_holdout, holdout_probs)

    # Goal rate plot
    percentiles = calc_percentile(holdout_probs, target_holdout)
    plot_goal_rates(goal_rate(percentiles))

    # Cumulative goal rate plot
    plot_cumulative_goal_rates(percentiles)

    # Calibration curve
    plot_calibration_curve_prediction(target_holdout, holdout_probs)

    return holdout_probs, acc, f1, prec, rec, auc_score, confusion

In [None]:
# Load the training table; the first CSV column is used as the index.
data = pd.read_csv('../../../data/train.csv', index_col=0)

# Features are every column except the last one.
# NOTE(review): this presumes 'isGoal' is the final column of the CSV;
# if it is not, the target would remain inside X. Confirm against the data.
feature_columns = data.columns[:-1].tolist()
X = data[feature_columns]

# Target is kept as a one-column DataFrame (hence the double brackets).
y = data[['isGoal']]

# Display the feature table in the notebook.
X

### Feature Selection: Recursive feature elimination

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Column labels are kept both as a list (for the set difference below) and as
# an array (so the boolean RFE support mask can index it directly).
col_list = X.columns.to_list()
feature_names = np.array(col_list)
print("Feature Names:", feature_names)


X = X.copy()
tot_col = len(X.columns)

# RFE expects a flat 1-D target rather than a one-column DataFrame.
y_flatten = y.values.ravel()

# Seed the forest so that the selected feature set is reproducible between
# runs; 42 matches the seed already used for the train/validation split in
# DT_best_params. Without a seed the ranking (and every downstream metric)
# can change each time the cell is executed.
model = RandomForestClassifier(random_state=42)


# Recursively drop the least important features until 15 remain.
rfe = RFE(model, n_features_to_select=15)
fit = rfe.fit(X, y_flatten)

print("Num Features: ", fit.n_features_)

# support_ is a boolean mask over the input columns marking the kept features.
selected_features = feature_names[fit.support_]
print('Selected feature names: ', selected_features)

# Everything that was in the original column list but did not survive RFE.
features_dropped = set(col_list).difference(selected_features)
print('features_dropped: ', features_dropped)

In [None]:
# Restrict the feature table to the columns kept by RFE.
X_new = X.loc[:, selected_features]
n_rows_cols = X_new.shape
print('Shape of new X', n_rows_cols)

### Loading the best model from the grid search for the DT classifier

In [None]:
# Restore the persisted hyperparameter-search object and pull out the winning
# parameter set.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from
# a trusted source.
best_model_path = "../dt_random_best_model.pkl"
dt_grid_search_model = joblib.load(best_model_path)
dt_best_params = dt_grid_search_model.best_params_

# Display the chosen hyperparameters in the notebook.
dt_best_params

In [None]:
# Train and evaluate the decision tree on the RFE-selected features using the
# previously loaded best hyperparameters.
(pred_probs, accuracy, f1_score, precision, recall, roc_auc, cf_matrix) = DT_best_params(
    X_new, y, dt_best_params
)

# Report each scalar metric on its own line, then the confusion matrix.
for _label, _value in (
    ("accuracy", accuracy),
    ("f1_score", f1_score),
    ("precision", precision),
    ("recall", recall),
    ("roc_auc", roc_auc),
):
    print(f' {_label}: {_value}')
print(cf_matrix)

In [None]:
# Render the validation confusion matrix computed above as a plot.
disp = metrics.ConfusionMatrixDisplay(cf_matrix)
disp.plot()