In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

# Load the datasets
train_data = pd.read_csv('/content/drive/My Drive/Hackathon/Train_set.csv')
test_data = pd.read_csv('/content/drive/My Drive/Hackathon/Test_set.csv', names=[
    "ID", "loan_amnt", "loan_term", "interest_rate", "loan_grade", "loan_subgrade",
    "job_experience", "home_ownership", "annual_income", "income_verification_status",
    "loan_purpose", "state_code", "debt_to_income", "delinq_2yrs", "public_records",
    "revolving_balance", "total_acc", "interest_receive", "application_type",
    "last_week_pay", "total_current_balance", "total_revolving_limit"
], low_memory=False)

# Ensure numeric columns are converted properly
numeric_columns = ['loan_amnt', 'interest_rate', 'annual_income', 'debt_to_income',
                   'delinq_2yrs', 'public_records', 'revolving_balance', 'total_acc',
                   'interest_receive', 'last_week_pay', 'total_current_balance',
                   'total_revolving_limit']

# Convert numeric columns (coercing invalid values to NaN)
for col in numeric_columns:
    train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

# Separate features and target in train_data
X = train_data.drop(['default', 'ID'], axis=1)
y = train_data['default']

# Store the original ID column from the test data
test_ids = test_data['ID']

# Drop the ID column from test_data
test_data = test_data.drop('ID', axis=1)

# Identify categorical columns and numerical columns
categorical_columns = ['loan_term', 'loan_grade', 'loan_subgrade', 'job_experience',
                       'home_ownership', 'income_verification_status', 'loan_purpose',
                       'state_code', 'application_type']

# Label encode 'loan_grade' and 'loan_subgrade'
label_encoders = {}
for col in ['loan_grade', 'loan_subgrade']:
    le = LabelEncoder()
    all_values = pd.concat([X[col], test_data[col]]).unique()
    le.fit(all_values)

    X[col] = le.transform(X[col])
    test_data[col] = le.transform(test_data[col])
    label_encoders[col] = le

# OneHotEncode other categorical columns
X = pd.get_dummies(X, columns=[col for col in categorical_columns if col not in ['loan_grade', 'loan_subgrade']])
test_data = pd.get_dummies(test_data, columns=[col for col in categorical_columns if col not in ['loan_grade', 'loan_subgrade']])

# Align train and test datasets to ensure the same number of columns
X, test_data = X.align(test_data, join='left', axis=1, fill_value=0)

# Fix column names to remove special characters
X.columns = X.columns.str.replace(r'[\[\]<]', '', regex=True)
test_data.columns = test_data.columns.str.replace(r'[\[\]<]', '', regex=True)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numeric features
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])
test_data[numeric_columns] = scaler.transform(test_data[numeric_columns])

# Define XGBoost model and hyperparameter grid
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# GridSearchCV for XGBoost
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_xgb_model = grid_search.best_estimator_

# Predictions and evaluation
y_val_pred = best_xgb_model.predict(X_val)

print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print("Classification Report:")
print(classification_report(y_val, y_val_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

# Predict on test data
test_predictions = best_xgb_model.predict(test_data)

# Prepare the submission file with the correct ID column
submission = pd.DataFrame({
    'ID': test_ids,  # Use the original test IDs
    'default': test_predictions
})

# Save submission file
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")


Fitting 3 folds for each of 48 candidates, totalling 144 fits


Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 0.8738932116984169
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92     14083
           1       0.83      0.61      0.70      4552

    accuracy                           0.87     18635
   macro avg       0.86      0.78      0.81     18635
weighted avg       0.87      0.87      0.87     18635

Confusion Matrix:
[[13516   567]
 [ 1783  2769]]
Submission file created successfully.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

# Load the datasets
train_data = pd.read_csv('/content/drive/My Drive/Hackathon/Train_set.csv')
test_data = pd.read_csv('/content/drive/My Drive/Hackathon/Test_set.csv', names=[
    "ID", "loan_amnt", "loan_term", "interest_rate", "loan_grade", "loan_subgrade",
    "job_experience", "home_ownership", "annual_income", "income_verification_status",
    "loan_purpose", "state_code", "debt_to_income", "delinq_2yrs", "public_records",
    "revolving_balance", "total_acc", "interest_receive", "application_type",
    "last_week_pay", "total_current_balance", "total_revolving_limit"
], low_memory=False)

# Ensure numeric columns are converted properly
numeric_columns = ['loan_amnt', 'interest_rate', 'annual_income', 'debt_to_income',
                   'delinq_2yrs', 'public_records', 'revolving_balance', 'total_acc',
                   'interest_receive', 'last_week_pay', 'total_current_balance',
                   'total_revolving_limit']

# Convert numeric columns (coercing invalid values to NaN)
for col in numeric_columns:
    train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

# Feature Engineering: Adding a few interaction features
train_data['loan_income_ratio'] = train_data['loan_amnt'] / train_data['annual_income']
test_data['loan_income_ratio'] = test_data['loan_amnt'] / test_data['annual_income']

# Separate features and target in train_data
X = train_data.drop(['default', 'ID'], axis=1)
y = train_data['default']

# Store the original ID column from the test data
test_ids = test_data['ID']

# Drop the ID column from test_data
test_data = test_data.drop('ID', axis=1)

# Identify categorical columns
categorical_columns = ['loan_term', 'loan_grade', 'loan_subgrade', 'job_experience',
                       'home_ownership', 'income_verification_status', 'loan_purpose',
                       'state_code', 'application_type']

# Label encode 'loan_grade' and 'loan_subgrade'
label_encoders = {}
for col in ['loan_grade', 'loan_subgrade']:
    le = LabelEncoder()
    all_values = pd.concat([X[col], test_data[col]]).unique()
    le.fit(all_values)

    X[col] = le.transform(X[col])
    test_data[col] = le.transform(test_data[col])
    label_encoders[col] = le

# OneHotEncode other categorical columns
X = pd.get_dummies(X, columns=[col for col in categorical_columns if col not in ['loan_grade', 'loan_subgrade']])
test_data = pd.get_dummies(test_data, columns=[col for col in categorical_columns if col not in ['loan_grade', 'loan_subgrade']])

# Align train and test datasets to ensure the same number of columns
X, test_data = X.align(test_data, join='left', axis=1, fill_value=0)

# Fix column names to remove special characters
X.columns = X.columns.str.replace(r'[\[\]<]', '', regex=True)
test_data.columns = test_data.columns.str.replace(r'[\[\]<]', '', regex=True)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numeric features
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])
test_data[numeric_columns] = scaler.transform(test_data[numeric_columns])

# Define XGBoost model and extended hyperparameter grid
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.01, 0.1],  # L1 regularization
    'reg_lambda': [1, 1.5, 2],  # L2 regularization
    'scale_pos_weight': [1, 2, 3]  # Adjust for class imbalance
}

# GridSearchCV for XGBoost with 5-fold CV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model and train with early stopping
best_xgb_model = grid_search.best_estimator_
best_xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=True)

# Predictions and evaluation
y_val_pred = best_xgb_model.predict(X_val)

print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print("Classification Report:")
print(classification_report(y_val, y_val_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

# Predict on test data
test_predictions = best_xgb_model.predict(test_data)

# Prepare the submission file with the correct ID column
submission = pd.DataFrame({
    'ID': test_ids,  # Use the original test IDs
    'default': test_predictions
})

# Save submission file
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")


Fitting 5 folds for each of 78732 candidates, totalling 393660 fits
