In [4]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, auc
import warnings
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
X = pd.read_csv('train_x_imputed.csv')
test_df = pd.read_csv('test_x_imputed.csv')

In [6]:

def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
    return vif_data

vif_result = calculate_vif(X)

print("VIF Results:")
vif_result


VIF Results:


Unnamed: 0,Feature,VIF
0,C1,1.145346
1,C2,1.178723
2,C3,1.054502
3,C4,1.029372
4,C5,1.433526
5,C6,1.422959
6,C7,1.483278
7,C8,1.205354
8,N1,1.590867
9,N2,1.176109


In [9]:
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
    return vif_data

def drop_high_vif_features(X, threshold=10):
    while True:
        vif_result = calculate_vif(X)
        max_vif_index = vif_result['VIF'].idxmax()
        if vif_result.loc[max_vif_index, 'VIF'] > threshold:
            X = X.drop(vif_result.loc[max_vif_index, 'Feature'], axis=1)
        else:
            break
    return X

df_no_corr = drop_high_vif_features(X)
len(df_no_corr.columns)

28

In [10]:
class ModelEvaluator:
    def __init__(self, algorithm, data, train_cols, test_cols, test_df):
        self.algorithm = algorithm
        self.X_train = data
        self.train_cols = train_cols
        self.test_cols = test_cols
        self.x_test = test_df


    def evaluate(self):
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(self.X_train[self.train_cols], self.X_train[self.test_cols])

        model = XGBClassifier()
        param_grid = {'n_estimators': [50, 100, 150],
                        'max_depth': [3, 5, 7],
                        'learning_rate': [0.1, 0.01, 0.001]}


        # Parameter tuning using GridSearchCV
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='roc_auc')
        grid_search.fit(X_resampled, y_resampled)

        # Best parameters and best score
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        # Cross-validation
        cv_scores = cross_val_score(grid_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring='roc_auc')

        # Final evaluation on test set
        self.x_test['Class_1_Probability'] = grid_search.predict_proba(self.x_test[self.train_cols])[:, 1]

        return best_params, self.x_test



# Example usage
X = pd.read_csv('train_x_imputed.csv')
train_cols = list(X.columns)
# train_cols.remove('Unique_ID')
train_cols.remove('Dependent_Variable')
test_cols = ['Dependent_Variable']  # Specify your target variable
test_df = pd.read_csv('test_x_imputed.csv')

# Evaluating XGBoost
xgb_evaluator = ModelEvaluator('XGBoost', X, train_cols, test_cols, test_df)
xgb_best_params, testing_df = xgb_evaluator.evaluate()

print("\nXGBoost:")
print("Best Parameters:", xgb_best_params)




XGBoost:
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}


In [3]:
testing_df.head()

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N19,N20,N21,N22,N23,N24,N33,N34,N35,Class_1_Probability
0,Candidate_1602,1.0,0.0,0.0,23.0,0.0,1.0,0.0,1.0,18.0,...,10639.636706,21.797821,0.85539,0.909596,66.0,3333.333333,50.0,90.38,23.0,0.469903
1,Candidate_29650,1.0,0.0,2.0,4.0,2.0,1.0,2.0,1.0,16.75,...,12165.0,19.0,0.94,2.0,36.0,5779.833333,300.0,532.93,16.0,0.314305
2,Candidate_31061,1.0,2.0,3.0,38.0,1.0,0.0,4.0,1.0,29.99,...,504.0,34.0,0.7,2.0,48.0,3083.333333,80.0,169.78,22.0,0.353457
3,Candidate_5768,1.0,1.0,28.0,20.0,2.0,0.0,2.0,1.0,17.7,...,1428.0,9.0,0.77,0.0,36.0,5117.083333,150.0,270.02,13.0,0.181735
4,Candidate_27059,1.0,1.0,15.0,1.0,3.0,0.0,5.0,0.0,28.0,...,6324.0,25.0,0.92,2.0,57.714576,0.0,50.0,103.41,14.0,0.49141


In [9]:
grid_search

NameError: name 'grid_search' is not defined

In [11]:
testing_df.head()

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N19,N20,N21,N22,N23,N24,N33,N34,N35,Class_1_Probability
0,Candidate_1602,1.0,0.0,0.0,23.0,0.0,1.0,0.0,1.0,18.0,...,10639.636706,21.797821,0.85539,0.909596,66.0,3333.333333,50.0,90.38,23.0,0.469903
1,Candidate_29650,1.0,0.0,2.0,4.0,2.0,1.0,2.0,1.0,16.75,...,12165.0,19.0,0.94,2.0,36.0,5779.833333,300.0,532.93,16.0,0.314305
2,Candidate_31061,1.0,2.0,3.0,38.0,1.0,0.0,4.0,1.0,29.99,...,504.0,34.0,0.7,2.0,48.0,3083.333333,80.0,169.78,22.0,0.353457
3,Candidate_5768,1.0,1.0,28.0,20.0,2.0,0.0,2.0,1.0,17.7,...,1428.0,9.0,0.77,0.0,36.0,5117.083333,150.0,270.02,13.0,0.181735
4,Candidate_27059,1.0,1.0,15.0,1.0,3.0,0.0,5.0,0.0,28.0,...,6324.0,25.0,0.92,2.0,57.714576,0.0,50.0,103.41,14.0,0.49141


In [12]:
final_prediction_df = testing_df[['Unique_ID', 'Class_1_Probability']]
# final_prediction_df.to_csv('../../../Test/final_predictions.csv', index=False)