In [4]:
# import pandas as pd
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import accuracy_score
# from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier
# from sklearn.ensemble import GradientBoostingClassifier

# # Load the dataset
# data = pd.read_csv("train.csv")
# test_df = pd.read_csv("test.csv")

# # Label encoding for categorical features
# categorical_features = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
# label_encoders = {}

# for col in categorical_features:
#     le = LabelEncoder()         #Need to change it to oneHotEncoding
#     data[col] = le.fit_transform(data[col])
#     test_df[col] = le.fit_transform(test_df[col])
#     label_encoders[col] = le  # Store the encoder for possible inverse transformation

# # Split data into features (X) and target (y)
# X = data.drop(columns=['LoanID', 'Default'])  # Drop ID column and target
# y = data['Default']
# undropped_X_test = test_df
# X_tests = test_df.drop(columns=['LoanID'], axis=1)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define a callback function to capture and log accuracies only
# def log_best_score(grid_search):
#     best_score = grid_search.best_score_
#     print(f"Best accuracy for model: {best_score:.4f}")


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Load the dataset
data = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Define the categorical features
categorical_features = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']

# Create a ColumnTransformer to apply OneHotEncoding to categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'  # Keep non-categorical columns as they are
)

# Fit and transform the training data, and transform the test data
X = preprocessor.fit_transform(data.drop(columns=['LoanID', 'Default']))  # Drop ID column and target
X_tests = preprocessor.transform(test_df.drop(columns=['LoanID']))  # Drop ID column for the test set
y = data['Default']  # Target variable
undropped_X_test = test_df  # Keep the test dataframe without modifications for later use

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a callback function to capture and log accuracies only
def log_best_score(grid_search):
    best_score = grid_search.best_score_
    print(f"Best accuracy for model: {best_score:.4f}")


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Define hyperparameter grids for each model
dt_params = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

xgb_params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'n_estimators': [50, 100, 200]
}

gb_params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7]
}

# Train and tune Decision Tree model
dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_params, scoring='accuracy', cv=5, n_jobs=-1, verbose=True)
dt_grid_search.fit(X_train, y_train)
log_best_score(dt_grid_search)

# Train and tune XGBoost model
xgb_grid_search = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), xgb_params, scoring='accuracy', cv=5, n_jobs=-1, verbose=True)
xgb_grid_search.fit(X_train, y_train)
log_best_score(xgb_grid_search)

# Train and tune Gradient Boosting model
gb_grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, scoring='accuracy', cv=5, n_jobs=-1, verbose=True)
gb_grid_search.fit(X_train, y_train)
log_best_score(gb_grid_search)

In [None]:
def generate_submission(predictions, df):
    submission = pd.DataFrame({'LoanID': df['LoanID'], 'Default': predictions})
    submission.to_csv("LendOrLose_submission.csv", index=False)

In [None]:
#X_test
undropped_X_test

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,CKV34LU7V7,55,112656,92393,581,113,2,23.54,36,0.15,3,2,2,1,1,3,0
1,62KTYNH93J,56,91569,131575,641,54,1,15.19,12,0.43,1,1,0,1,1,2,1
2,JGFUSOIUH7,26,78169,75417,569,105,3,18.02,12,0.29,2,1,1,1,1,2,1
3,4538THBHOX,26,63033,10804,326,118,1,14.71,24,0.41,1,1,2,0,0,1,1
4,DXLNA06JHR,24,29665,21182,662,102,3,15.02,60,0.69,3,3,2,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51065,DQRTA8KWGC,51,99473,170353,628,24,1,17.03,12,0.46,3,2,0,1,1,0,1
51066,W0FDMPACG3,29,42016,111314,371,51,4,7.10,36,0.50,3,2,1,0,0,4,0
51067,MA0F4U8ORY,67,88507,142666,731,51,1,22.89,48,0.79,0,1,0,0,0,2,0
51068,6QUH04P7EJ,42,116649,190938,488,6,1,10.83,60,0.32,0,0,1,0,1,4,1


In [None]:
xgb_tuned = xgb_grid_search.best_estimator_
xgb_prediction = xgb_tuned.predict(X=X_tests)
generate_submission(xgb_prediction, undropped_X_test)