<a href="https://colab.research.google.com/github/somustafa/qss/blob/master/week6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
#Homework
# ==============================
# 1️⃣ Install Required Libraries
# ==============================
!pip install xgboost lightgbm catboost

# ==============================
# 2️⃣ Import Libraries
# ==============================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import VotingClassifier, BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from google.colab import files

# ==============================
# 3️⃣ Upload and Read Dataset
# ==============================
uploaded = files.upload()
df = pd.read_csv("loan_prediction.csv")

# ==============================
# 4️⃣ Data Cleaning & Preprocessing
# ==============================

# Drop rows where target Loan_Status is missing
df = df.dropna(subset=['Loan_Status'])

# Fill missing values for categorical features
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Married'].fillna(df['Married'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

# Fill missing values for numerical features
df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median(), inplace=True)

# Encode target variable
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

# Encode categorical features
cat_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents']
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# ==============================
# 5️⃣ Train-Test Split
# ==============================
X = df.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==============================
# 6️⃣ Define Models
# ==============================
model1 = GradientBoostingClassifier(random_state=42)
model2 = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model3 = LGBMClassifier(random_state=42)
model4 = CatBoostClassifier(verbose=0, random_state=42)

voting = VotingClassifier(
    estimators=[('gb', model1), ('xgb', model2), ('lgbm', model3)],
    voting='soft'
)

bagging = BaggingClassifier(random_state=42)

models = {
    "VotingClassifier": voting,
    "BaggingClassifier": bagging,
    "GradientBoosting": model1,
    "XGBoost": model2,
    "LightGBM": model3,
    "CatBoost": model4
}

# ==============================
# 7️⃣ Train and Evaluate Models
# ==============================
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append({"Model": name, "Accuracy": acc})
    print(f"=== {name} ===")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    print()

# ==============================
# 8️⃣ Show Comparison Table
# ==============================
results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
print("\nModel Performance Comparison:")
print(results_df)





[LightGBM] [Info] Number of positive: 342, number of negative: 149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 371
[LightGBM] [Info] Number of data points in the train set: 491, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.696538 -> initscore=0.830864
[LightGBM] [Info] Start training from score 0.830864
=== VotingClassifier ===
Accuracy: 0.7560975609756098
              precision    recall  f1-score   support

           0       0.76      0.44      0.56        43
           1       0.76      0.93      0.83        80

    accuracy                           0.76       123
   macro avg       0.76      0.68      0.70       123
weighted avg       0.76      0.76      0.74       123


=== BaggingClassifier ===
Accuracy: 0.7073170731707317
              

In [22]:
# 1. Lazımi kitabxanaları yükləyirik
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

# 2. Dataset-i yükləyirik
from google.colab import files

uploaded = files.upload()  # Burada faylı seçirsiniz
filename = list(uploaded.keys())[0]

# Faylı oxuyuruq və sütun adlarını təmizləyirik
df = pd.read_csv(filename, header=0)
df.columns = df.columns.str.strip()

# 3. X və Y bölünməsi (target sütun: "Bankrupt?")
X = df.drop(columns=["Bankrupt?"])
y = df["Bankrupt?"]

# 4. Train-Test bölməsi
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 5. Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 6. Modellerin siyahısı
models = {
    "Bagging": BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=100,
        random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42, eval_metric='logloss'
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42
    )
}

# 7. Nəticələri toplamaq üçün DataFrame
results = []

# 8. Modellerin təlimi və qiymətləndirilməsi
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_proba) if y_proba is not None else None
    })

# 9. Nəticələrin cədvəli
results_df = pd.DataFrame(results)
print(results_df)


Saving data.csv to data (5).csv
[LightGBM] [Info] Number of positive: 176, number of negative: 5279
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23713
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.032264 -> initscore=-3.401008
[LightGBM] [Info] Start training from score -3.401008
               Model  Accuracy  Precision    Recall  F1 Score   ROC-AUC
0            Bagging  0.968475   0.529412  0.204545  0.295082  0.903564
1  Gradient Boosting  0.966276   0.464286  0.295455  0.361111  0.939032
2            XGBoost  0.969941   0.560000  0.318182  0.405797  0.947469
3           LightGBM  0.972874   0.666667  0.318182  0.430769  0.952996
