In [1]:
import pandas as pd
import logging
import os


# Destination file for this loader's log records. The parent directory is
# created on demand so a fresh checkout does not fail on the first run.
log_path = r"C:\Users\Rasulbek907\Desktop\Hotel Booking Cancellation Prediction\Log\data_loader.log"
log_dir = os.path.dirname(log_path)
os.makedirs(log_dir, exist_ok=True)

# Root-logger configuration: append timestamped records of level INFO and
# above to the log file. Note that basicConfig only takes effect when the
# root logger has no handlers yet, so re-running this cell in the same
# kernel leaves the first configuration in place.
log_settings = dict(
    filename=log_path,
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logging.basicConfig(**log_settings)

# Marker line that delimits the start of one loader run in the log.
logging.info("===== PREPROCESSED DATA LOADER BOSHLANDI =====")


# Directory that holds the already-preprocessed train/test splits.
BASE_PATH = r"C:\Users\Rasulbek907\Desktop\Hotel Booking Cancellation Prediction\Data\Preprosessed"

# Logical dataset name -> CSV file name inside BASE_PATH.
PATHS = dict(
    X_train="X_train_preprocessed.csv",
    X_test="X_test_preprocessed.csv",
    y_train="y_train.csv",
    y_test="y_test.csv",
)

# Read the four CSV files as DataFrames (the targets stay DataFrames here so
# their column names can be inspected afterwards). Any failure is logged and
# then re-raised so the notebook stops instead of continuing without data.
try:
    x_train_csv, x_test_csv, y_train_csv, y_test_csv = (
        os.path.join(BASE_PATH, PATHS[key])
        for key in ("X_train", "X_test", "y_train", "y_test")
    )

    X_train = pd.read_csv(x_train_csv)
    X_test = pd.read_csv(x_test_csv)
    y_train = pd.read_csv(y_train_csv)
    y_test = pd.read_csv(y_test_csv)

    logging.info("Preprocessed datasetlar muvaffaqiyatli yuklandi")
    # The trailing space in the "*_test " labels keeps the shape lines
    # aligned in the log file.
    for label, frame in (
        ("X_train", X_train),
        ("X_test ", X_test),
        ("y_train", y_train),
        ("y_test ", y_test),
    ):
        logging.info(f"{label} shape: {frame.shape}")

except Exception as err:
    logging.error(f"Datasetlarni yuklashda xatolik: {err}")
    raise


# Post-load sanity checks. Each failure is written to the log and then
# raised as ValueError so the notebook stops before any modelling happens.

# Every feature row needs exactly one target row, in both splits.
if len(X_train) != len(y_train):
    logging.error("X_train va y_train satr soni mos emas")
    raise ValueError("Train set mismatch")

if len(X_test) != len(y_test):
    logging.error("X_test va y_test satr soni mos emas")
    raise ValueError("Test set mismatch")

# Target leakage check: no target column name may also appear among the
# feature columns. This only compares column names; a feature that encodes
# the target under a different name is not detected here.
leaked_columns = set(y_train.columns) & set(X_train.columns)
if leaked_columns:
    logging.error("Target X_train ichiga kirib ketgan!")
    raise ValueError("Target leakage detected")

# The apostrophe in "o‘tdi" was stored as mojibake ("oâ€˜tdi"), which made the
# success line unreadable in the log; the intended character is restored.
logging.info("DLP tekshiruvlar muvaffaqiyatli o‘tdi")
logging.info("===== DATA LOADER YAKUNLANDI =====")

In [4]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# =========================
# DATA LOAD
# =========================
# All four splits live in the same directory, so the location is written once
# and each file path is derived from it.
preprocessed_dir = r"C:\Users\Rasulbek907\Desktop\Hotel Booking Cancellation Prediction\Data\Preprosessed"
csv_template = preprocessed_dir + "\\{}.csv"

# Feature matrices are kept as DataFrames so column dtypes can be inspected.
X_train = pd.read_csv(csv_template.format("X_train_preprocessed"))
X_test = pd.read_csv(csv_template.format("X_test_preprocessed"))

# Targets are flattened to 1-D arrays, the shape the estimators and metric
# functions below are called with.
y_train = pd.read_csv(csv_template.format("y_train")).values.ravel()
y_test = pd.read_csv(csv_template.format("y_test")).values.ravel()

# =========================
# FEATURE TYPES
# =========================
# Split the columns into two groups that together cover EVERY column.
# Listing only int64/float64/object would leave any other dtype (bool dummy
# columns read back from CSV, int32, float32, ...) in neither group, and a
# ColumnTransformer drops unlisted columns by default, so such features would
# silently disappear from the model input.
#   - numeric_features: all numeric dtypes plus bool
#   - categorical_features: everything else (object/string columns)
numeric_features = X_train.select_dtypes(include=["number", "bool"]).columns
categorical_features = X_train.select_dtypes(exclude=["number", "bool"]).columns

# =========================
# PREPROCESSING
# =========================
# Numeric columns: fill missing values with the column mean, then scale to
# unit variance. with_mean=False disables centering (original note: "for
# sparse" input).
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler(with_mean=False)),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own pipeline; the transformed blocks are
# concatenated into a single feature matrix.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)


# =========================
# MODELS (4 baseline classifiers)
# =========================
# Display name -> unfitted estimator. GaussianNB is imported at the top of the
# cell but is not part of this comparison.

# Fixed seed for the estimators that use randomness, so that repeated runs of
# the notebook report the same scores.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, random_state=RANDOM_STATE
    ),
    "KNN": KNeighborsClassifier(),
}


# =========================
# TRAIN & EVALUATE
# =========================
# One dict per model; turned into a DataFrame once the loop is done.
results = []

# Metrics computed from hard class predictions, in report column order.
label_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-score", f1_score),
)

for name, model in models.items():
    # Preprocessing and estimator are chained so the transformers are fitted
    # on the training split only.
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)

    # ROC-AUC needs a score for the positive class; it stays None for
    # pipelines that do not expose predict_proba.
    roc_auc = None
    if hasattr(pipe, "predict_proba"):
        y_prob = pipe.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_prob)

    results.append({
        "Model": name,
        **{title: metric(y_test, y_pred) for title, metric in label_metrics},
        "ROC-AUC": roc_auc,
    })

# =========================
# RESULTS
# =========================
# NOTE(review): the output recorded below shows perfect scores for Logistic
# Regression and Decision Tree. Scores this high usually mean some feature
# encodes the target -- verify the feature set before trusting these numbers.
results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  Precision    Recall  F1-score  ROC-AUC
0  Logistic Regression  1.000000   1.000000  1.000000  1.000000  1.00000
1        Decision Tree  1.000000   1.000000  1.000000  1.000000  1.00000
2        Random Forest  0.999916   1.000000  0.999777  0.999889  1.00000
3                  KNN  0.974035   0.984903  0.945380  0.964737  0.99348
