In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset
data = pd.read_csv("C:/Users/agshi/Desktop/BOOTCAMP/Week 6/Python/loan_prediction.csv")

# Step 2: Preprocess the data
# Handle missing values, encode categorical variables, scale numerical features, etc.
label_encoders = {}
for col in data.columns:
    if data[col].dtype == 'object':
        label_encoders[col] = LabelEncoder()
        data[col] = label_encoders[col].fit_transform(data[col])

# Step 3: Split the data into features and target variable
X = data.drop(columns=['Loan_Status', 'Loan_ID'])  # Features
y = data['Loan_Status']  # Target variable



# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')), 
])

X_train_preprocessed = pipeline.fit_transform(X_train)
X_test_preprocessed = pipeline.transform(X_test)

# Step 5: Instantiate and train the models
# Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('bagging', BaggingClassifier()),
    ('gradient_boosting', GradientBoostingClassifier()),
    ('xgboost', XGBClassifier()),
    ('lightgbm', LGBMClassifier()),
    ('catboost', CatBoostClassifier(verbose=False))  
], voting='hard')

voting_clf.fit(X_train, y_train)

# Individual models
bagging_clf = BaggingClassifier()
bagging_clf.fit(X_train, y_train)

gradient_boosting_clf = GradientBoostingClassifier()
gradient_boosting_clf.fit(X_train, y_train)

xgboost_clf = XGBClassifier()
xgboost_clf.fit(X_train, y_train)

lightgbm_clf = LGBMClassifier()
lightgbm_clf.fit(X_train, y_train)

catboost_clf = CatBoostClassifier(verbose=False)
catboost_clf.fit(X_train, y_train)

# Step 6: Evaluate the models
# Voting Classifier
voting_pred = voting_clf.predict(X_test)
voting_accuracy = accuracy_score(y_test, voting_pred)
print("Voting Classifier Accuracy:", voting_accuracy)

# Individual models
bagging_pred = bagging_clf.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_pred)
print("Bagging Classifier Accuracy:", bagging_accuracy)

gradient_boosting_pred = gradient_boosting_clf.predict(X_test)
gradient_boosting_accuracy = accuracy_score(y_test, gradient_boosting_pred)
print("Gradient Boosting Classifier Accuracy:", gradient_boosting_accuracy)

xgboost_pred = xgboost_clf.predict(X_test)
xgboost_accuracy = accuracy_score(y_test, xgboost_pred)
print("XGBoost Classifier Accuracy:", xgboost_accuracy)

lightgbm_pred = lightgbm_clf.predict(X_test)
lightgbm_accuracy = accuracy_score(y_test, lightgbm_pred)
print("LightGBM Classifier Accuracy:", lightgbm_accuracy)

catboost_pred = catboost_clf.predict(X_test)
catboost_accuracy = accuracy_score(y_test, catboost_pred)
print("CatBoost Classifier Accuracy:", catboost_accuracy)


ValueError: Input X contains NaN.
DecisionTreeClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values