In [191]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

In [192]:
# Load data
train_data = pd.read_csv('C:\\Users\\DELL\\Downloads\\iml-fall-2024-challenge-1\\train_set.csv')
test_data = pd.read_csv('C:\\Users\\DELL\\Downloads\\iml-fall-2024-challenge-1\\test_set.csv')


In [193]:
# Separate features and target
X_train = train_data.drop(columns=['RecordId', 'Y'])
y_train = train_data['Y']
X_test = test_data.drop(columns=['RecordId'])

In [194]:
# Data Cleaning: Simple Imputation using mean strategy
imputer = SimpleImputer(strategy='mean')  # Using mean to fill missing values
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [195]:
# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [196]:
# Model: XGBoost with improved parameters
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_estimators=200,           # Number of boosting rounds
    learning_rate=0.1,          # Step size shrinkage
    max_depth=5,                # Maximum depth of the tree
    min_child_weight=1,         # Minimum sum of instance weight (hessian) needed in a child
    gamma=0,                    # Minimum loss reduction required to make a further partition
    subsample=0.8,              # Subsample ratio of the training instances
    colsample_bytree=0.8,       # Subsample ratio of columns when constructing each tree
    scale_pos_weight=1          # Control the balance of positive and negative weights
)

In [197]:
# Best model from grid search
best_xgb_model = xgb_model

In [198]:
# Cross-validation
cv_scores = cross_val_score(xgb_model, X_train_scaled, y_train, cv=5, scoring='roc_auc', n_jobs=-1)


In [199]:
# Train the model on the full training set
xgb_model.fit(X_train_scaled, y_train)

Parameters: { "use_label_encoder" } are not used.



In [200]:
# Prediction on test data
test_prob = xgb_model.predict_proba(X_test_scaled)[:, 1]
test_ids = test_data['RecordId']
submission = pd.DataFrame({'RecordId': test_ids, 'Y': test_prob})

In [201]:
# Output Cross-Validation Results
print("Cross-Validation AUC Scores:", cv_scores.mean())
print("Standard Deviation in CV Scores:", cv_scores.std())

Cross-Validation AUC Scores: 0.9593681718441968
Standard Deviation in CV Scores: 0.0077064413544351545


In [202]:
# Save submission file
submission.to_csv('submission.csv', index=False)
