In [None]:
# =======================
# 1. Imports
# =======================
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# =======================
# 2. Logging setup
# =======================
log_file = r"C:\Users\Rasulbek907\Desktop\Hotel Booking Cancellation Prediction\Log\data_loader.log"

# Append timestamped records to the log file.  basicConfig only configures
# the root logger when it has no handlers yet, so calling it again in the
# same interpreter session is harmless.
logging.basicConfig(
    filename=log_file,
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# Mirror every record to the console as well.  The handler is attached only
# once: re-running this section in the same session would otherwise stack a
# new console handler each time and print every message repeatedly.
# FileHandler subclasses StreamHandler, so it is excluded explicitly.
root_logger = logging.getLogger()
has_console_handler = any(
    isinstance(handler, logging.StreamHandler)
    and not isinstance(handler, logging.FileHandler)
    for handler in root_logger.handlers
)
if not has_console_handler:
    root_logger.addHandler(logging.StreamHandler())

# =======================
# 3. Load the dataset
# =======================
file_path = r"C:\Users\Rasulbek907\Desktop\Hotel Booking Cancellation Prediction\Data\Raw_Data\hotel_bookings_updated_2024.csv"
logging.info(f"Loading dataset from {file_path}")
try:
    df = pd.read_csv(file_path)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Record the failure (with traceback) in the log before aborting, so a
    # missing or malformed file leaves a trace in the log file.
    logging.exception(f"Failed to load dataset from {file_path}")
    raise
logging.info(f"Dataset loaded successfully with shape {df.shape}")

# =======================
# 4. Preprocessing classes
# =======================
class Cleaner:
    """Impute missing values with per-column statistics learned in ``fit``.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with its median.
    """

    def __init__(self):
        # Maps column name -> value used to replace NaN in that column.
        self.fill_values = {}

    def fit(self, X):
        """Learn one fill value per column of ``X`` and return ``self``."""
        # Start from a clean slate so a refit never keeps stale columns.
        self.fill_values = {}
        for col in X.columns:
            if X[col].dtype == 'object':
                modes = X[col].mode()
                # mode() is empty when the column holds only NaN; there is
                # nothing to impute with, so the column is left untouched.
                if not modes.empty:
                    self.fill_values[col] = modes.iloc[0]
            else:
                self.fill_values[col] = X[col].median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaN replaced by the learned values.

        Columns seen during ``fit`` but absent from ``X`` are skipped.
        """
        X = X.copy()
        for col, value in self.fill_values.items():
            if col in X.columns:
                X[col] = X[col].fillna(value)
        return X

class Encoder:
    """Encode object-dtype columns using levels learned in ``fit``.

    Columns with at most ``max_unique`` distinct non-null values are expanded
    into 0/1 indicator columns named ``"<column>_<value>"`` and the source
    column is dropped.  Every other object column is replaced by integer
    category codes.
    """

    def __init__(self, max_unique=5):
        self.max_unique = max_unique
        # Object-dtype columns found during fit.
        self.cat_cols = None
        # Column name -> list of levels that get an indicator column.
        self.dummy_cols = {}
        # Column name -> categories that define the integer codes.
        self.code_categories = {}

    def fit(self, X):
        """Learn the levels of every object column of ``X``; return ``self``."""
        self.cat_cols = X.select_dtypes(include='object').columns
        # Reset learned state so a refit never mixes in stale levels.
        self.dummy_cols = {}
        self.code_categories = {}
        for col in self.cat_cols:
            # NaN is not a level: an indicator built from it could never
            # match, because NaN compares unequal to everything.
            levels = X[col].dropna().unique().tolist()
            if len(levels) <= self.max_unique:
                self.dummy_cols[col] = levels
            else:
                # Freeze the categories now so the same value maps to the
                # same code in every frame passed to transform.
                self.code_categories[col] = X[col].astype('category').cat.categories
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``.

        Values that were not seen during ``fit`` (and NaN) get all-zero
        indicators, or the code ``-1`` in code-encoded columns.
        """
        X = X.copy()
        for col in self.cat_cols:
            if col in self.dummy_cols:
                for val in self.dummy_cols[col]:
                    X[f"{col}_{val}"] = (X[col] == val).astype(int)
                X = X.drop(columns=[col])
            else:
                X[col] = pd.Categorical(
                    X[col], categories=self.code_categories[col]
                ).codes
        return X

class Scaler:
    """Standardize the int64/float64 columns of a DataFrame.

    Wraps a ``StandardScaler`` that is fit on the numeric columns found
    during ``fit``; all other columns pass through unchanged.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        # Columns selected for scaling; set by fit.
        self.num_cols = None

    def fit(self, X):
        """Select the numeric columns of ``X``, fit the scaler, return ``self``."""
        self.num_cols = X.select_dtypes(include=['int64', 'float64']).columns
        # With no numeric column there is nothing to learn; skip the fit
        # rather than handing the scaler an empty selection.
        if len(self.num_cols) > 0:
            self.scaler.fit(X[self.num_cols])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose fitted numeric columns are scaled."""
        X = X.copy()
        if len(self.num_cols) > 0:
            X[self.num_cols] = self.scaler.transform(X[self.num_cols])
        return X

# =======================
# 5. Separate target and features
# =======================
# 'is_canceled' is the label; every remaining column is treated as a feature.
# NOTE(review): if the CSV contains columns that are only known after the
# booking outcome, they would leak the target into X - confirm against the
# dataset's column list.
X = df.drop(columns=['is_canceled'])
y = df['is_canceled']
logging.info(f"Target and features separated. Features shape: {X.shape}, Target shape: {y.shape}")

# =======================
# 6. Train-test split
# =======================
# Hold out 20% of the rows for evaluation.  Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
logging.info(f"Train-test split done. X_train: {X_train.shape}, X_test: {X_test.shape}")

# =======================
# 7. Preprocessing
# =======================
# Each stage is fit on the training output of the stage before it, so it
# learns from exactly the kind of data it will later transform: the encoder
# sees the imputed frame and the scaler sees the imputed, encoded frame.
# Only training data is used for fitting; the test set is transformed with
# the already-fitted stages.
cleaner = Cleaner().fit(X_train)
X_train_clean = cleaner.transform(X_train)

encoder = Encoder(max_unique=5).fit(X_train_clean)
X_train_encoded = encoder.transform(X_train_clean)

scaler = Scaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)

X_test_scaled = scaler.transform(encoder.transform(cleaner.transform(X_test)))
logging.info("Preprocessing completed: Cleaning, Encoding, Scaling")

# =======================
# 8. Models
# =======================
# Candidate classifiers, keyed by the display name used in logs and in the
# results table.  Every estimator with a random component gets a fixed seed
# so repeated runs produce the same metrics.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # n_jobs=-1 uses all cores; it does not change the fitted model.
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    # probability=True enables predict_proba (needed for ROC AUC below); the
    # calibration shuffles data internally, hence the explicit seed.
    # NOTE(review): scikit-learn documents SVC fit time as scaling at least
    # quadratically with the number of samples - confirm this model is
    # practical for the size of the training set.
    "SVM": SVC(kernel='rbf', probability=True, random_state=42)
}

# =======================
# 9. Fit the models and compute metrics
# =======================
def _positive_class_scores(model, features):
    """Return per-row scores for ROC AUC, or None if the model offers none.

    Uses the second column of ``predict_proba`` when available and falls
    back to ``decision_function`` otherwise.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(features)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(features)
    return None


results = []

for name, model in models.items():
    logging.info(f"Training {name}...")
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_prob = _positive_class_scores(model, X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

    metrics = {
        "Model": name,
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1-score": round(f1, 4),
        # "-" marks a model that exposes no score usable for ROC AUC.
        "ROC AUC": round(roc_auc, 4) if roc_auc is not None else "-"
    }
    results.append(metrics)
    logging.info(f"{name} metrics calculated.")
    # Report each model's numbers immediately, so they are in the log even
    # if a later, slower model never finishes.
    logging.info(f"{name} metrics: {metrics}")

# =======================
# 10. Present the results as a DataFrame
# =======================
# One row per model, best F1-score first, renumbered from 0.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="F1-score", ascending=False)
    .reset_index(drop=True)
)

logging.info("All models trained and metrics calculated.")
logging.info(f"\n{results_df}")

print(results_df)

Loading dataset from C:\Users\Rasulbek907\Desktop\Hotel Booking Cancellation Prediction\Data\Raw_Data\hotel_bookings_updated_2024.csv


Dataset loaded successfully with shape (119390, 33)
Target and features separated. Features shape: (119390, 32), Target shape: (119390,)
Train-test split done. X_train: (95512, 32), X_test: (23878, 32)
Preprocessing completed: Cleaning, Encoding, Scaling
Training Logistic Regression...
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Logistic Regression metrics calculated.
Training Decision Tree...
Decision Tree metrics calculated.
Training Random Forest...
Random Forest metrics calculated.
Training KNN...
KNN metrics calculated.
Training SVM...
