In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, confusion_matrix


In [19]:
df = pd.read_csv("train_u6lujuX_CVtuZ9i.csv")
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [20]:
target = "Loan_Status"

categorical_features = [
    "Gender", "Married", "Dependents",
    "Education", "Self_Employed", "Property_Area"
]

numerical_features = [
    "ApplicantIncome", "CoapplicantIncome",
    "LoanAmount", "Loan_Amount_Term", "Credit_History"
]


In [21]:
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numerical_features),
    ("cat", categorical_pipeline, categorical_features)
])


In [22]:
X = df[numerical_features + categorical_features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [23]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier()
}

base_results = {}
trained_models = {}


In [24]:
for name, model in models.items():
    pipe = Pipeline([
        ("preprocess", preprocessor),
        ("model", model)
    ])
    
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    base_results[name] = acc
    trained_models[name] = pipe
    
    print(f"{name} Accuracy: {acc:.4f}")


Logistic Regression Accuracy: 0.8618
Decision Tree Accuracy: 0.7561
Random Forest Accuracy: 0.8211
KNN Accuracy: 0.8699


In [25]:
best_model = max(base_results, key=base_results.get)
best_model, base_results[best_model]


('KNN', 0.8699186991869918)

In [26]:
# Generate meta-features
train_meta = np.column_stack([
    trained_models["Logistic Regression"].predict(X_train),
    trained_models["Decision Tree"].predict(X_train),
    trained_models["Random Forest"].predict(X_train)
])

test_meta = np.column_stack([
    trained_models["Logistic Regression"].predict(X_test),
    trained_models["Decision Tree"].predict(X_test),
    trained_models["Random Forest"].predict(X_test)
])

meta_model = LogisticRegression()
meta_model.fit(train_meta, y_train)

stack_pred = meta_model.predict(test_meta)


ValueError: could not convert string to float: 'Y'

In [10]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)

lr_pred = lr.predict(X_test_scaled)
lr_acc = accuracy_score(y_test, lr_pred)

lr_acc


0.8617886178861789

In [11]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

dt_pred = dt.predict(X_test)
dt_acc = accuracy_score(y_test, dt_pred)

dt_acc


0.7642276422764228

In [12]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

rf_acc


0.8292682926829268

In [13]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

knn_pred = knn.predict(X_test_scaled)
knn_acc = accuracy_score(y_test, knn_pred)

knn_acc


0.8455284552845529

In [14]:
train_meta = np.column_stack([
    lr.predict(X_train_scaled),
    dt.predict(X_train),
    rf.predict(X_train)
])

test_meta = np.column_stack([
    lr.predict(X_test_scaled),
    dt.predict(X_test),
    rf.predict(X_test)
])


In [15]:
meta_model = LogisticRegression()
meta_model.fit(train_meta, y_train)

stack_pred = meta_model.predict(test_meta)
stack_acc = accuracy_score(y_test, stack_pred)

stack_acc


0.8536585365853658

In [16]:
print("Random Forest Accuracy:", rf_acc)
print("Stacking Model Accuracy:", stack_acc)


Random Forest Accuracy: 0.8292682926829268
Stacking Model Accuracy: 0.8536585365853658


In [17]:
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))

print("\nStacking Model Confusion Matrix:")
print(confusion_matrix(y_test, stack_pred))


Random Forest Confusion Matrix:
[[24 14]
 [ 7 78]]

Stacking Model Confusion Matrix:
[[24 14]
 [ 4 81]]
