In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import calibration_curve, CalibrationDisplay
from sklearn.metrics import roc_curve, auc



In [None]:
def XGB(X, y, test_size=0.2, random_state=42):
    """Fit a default XGBoost classifier on a hold-out split and score it.

    The rows are split into a training and a validation part, an
    ``xgb.XGBClassifier`` with default settings is fitted on the training
    part, and classification metrics are computed on the validation part.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature table, one row per sample.
    y : array-like, Series or single-column DataFrame
        Labels, one per row of ``X``. Flattened to 1-D before use.
    test_size : float or int, default 0.2
        Passed to ``train_test_split``; size of the validation part.
    random_state : int, default 42
        Passed to ``train_test_split``; seed of the shuffle.

    Returns
    -------
    tuple
        ``(pred_probs, accuracy, f1_score, precision, recall, roc_auc,
        cf_matrix)`` where ``pred_probs`` is the full ``predict_proba``
        output on the validation rows and the remaining entries are the
        corresponding ``sklearn.metrics`` results on the same rows.
    """
    # Flatten the labels once, *before* splitting, so that both halves are
    # plain 1-D arrays whatever container the caller used (DataFrame, Series,
    # ndarray, list). The split itself depends only on the number of rows and
    # the seed, so the partition is the same as when splitting a DataFrame.
    labels = np.asarray(y).ravel()

    # Train and validation split
    X_train, X_val, y_train, y_val = train_test_split(
        X, labels, test_size=test_size, random_state=random_state
    )

    # XGB Classifier
    clf = xgb.XGBClassifier()
    clf.fit(X_train, y_train)

    # Hard class predictions and probability estimates on the validation set
    y_pred = clf.predict(X_val)
    pred_probs = clf.predict_proba(X_val)

    # Model evaluation metrics based on the hard predictions
    accuracy = metrics.accuracy_score(y_val, y_pred)
    f1 = metrics.f1_score(y_val, y_pred)
    precision = metrics.precision_score(y_val, y_pred)
    recall = metrics.recall_score(y_val, y_pred)
    cf_matrix = metrics.confusion_matrix(y_val, y_pred)

    # ROC AUC is ranked on the second probability column (index 1), i.e. this
    # function assumes a two-class problem.
    probs_isgoal = pred_probs[:, 1]
    roc_auc = metrics.roc_auc_score(y_val, probs_isgoal)

    return pred_probs, accuracy, f1, precision, recall, roc_auc, cf_matrix

In [None]:
# Read in data and assign X and y
data = pd.read_csv('../../data/train.csv', index_col=0)

# The target is picked by name, so the features are built by dropping that
# same name instead of "everything but the last column". This keeps 'isGoal'
# out of X even if the column order of the CSV changes.
y = data[['isGoal']]
X = data.drop(columns=['isGoal'])

# Preview only the first rows of the feature table
X.head()

In [None]:
#Feature Selection: Low variance
from sklearn.feature_selection import VarianceThreshold

# Columns whose variance does not exceed the threshold are flagged for removal.
# NOTE(review): a 0/1 column with a fraction p of ones has variance p*(1-p),
# which is at most 0.25, so with this threshold binary columns cannot pass.
# If the intent is "drop columns that are >= 75% one value", the matching
# value would be 0.75 * 0.25 = 0.1875 - confirm the intended threshold.
var_thres = VarianceThreshold(threshold=0.25)

# Only the fitted support mask is used below, so fitting is enough; the
# transformed matrix was never kept.
var_thres.fit(X)

#True: High Variance False: Low Variance
new_cols = var_thres.get_support()
print(new_cols)

#Low variance columns (the complement of the support mask, in column order)
concol = X.columns[~new_cols].tolist()

for features in concol:
    print(features)

#Dropping Low Variance Columns
X = X.iloc[:, new_cols]
print('total columns after feature selection:', len(X.columns))
X.shape

In [None]:
# Train and score the classifier on the selected features; XGB returns a
# 7-tuple that is unpacked into notebook-level names.
(
    pred_probs,
    accuracy,
    f1_score,
    precision,
    recall,
    roc_auc,
    cf_matrix,
) = XGB(X, y)

# One metric per line, followed by the confusion matrix.
print(
    f' accuracy: {accuracy}',
    f' f1_score: {f1_score}',
    f' precision: {precision}',
    f' recall: {recall}',
    f' roc_auc: {roc_auc}',
    sep='\n',
)
print(cf_matrix)