In [3]:
import pandas as pd
import sys, os, logging
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import joblib
import plotly.graph_objects as go
import optuna

# Source papkasini qo'shish
source_path = os.path.abspath("../Source")
if source_path not in sys.path:
    sys.path.append(source_path)
from preprocessing import Cleaner, Encoder, Scaler

# Logging sozlamalari
log_path = r"C:\Users\Rasulbek907\Desktop\Hotel Booking Cancellation Prediction\Log\data_loader.log"
logging.basicConfig(filename=log_path, filemode='a',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging.INFO)

# CSV faylni o'qish
csv_path = r"C:\Users\Rasulbek907\Desktop\Hotel Booking Cancellation Prediction\Data\Raw_Data\hotel_bookings_updated_2024.csv"
df = pd.read_csv(csv_path)
logging.info(f"Fayl o'qildi: {len(df)} satr, {len(df.columns)} ustun")

# Target va features
y = df['is_canceled']
X = df.drop(columns=['is_canceled'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing
cleaner = Cleaner(); cleaner.fit(X_train)
X_train_clean = cleaner.transform(X_train)
X_test_clean = cleaner.transform(X_test)

encoder = Encoder(max_unique=5); encoder.fit(X_train_clean)
X_train_enc = encoder.transform(X_train_clean)
X_test_enc = encoder.transform(X_test_clean)

scaler = Scaler(); scaler.fit(X_train_enc)
X_train_final = scaler.transform(X_train_enc)
X_test_final = scaler.transform(X_test_enc)

# SMOTE bilan balanslash
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_final, y_train)

# --- 1️⃣ Grid Search Hyperparameter ---
lr = LogisticRegression(max_iter=1000)
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier()

param_grid = {
    'Logistic Regression': {'C':[0.01,0.1,1,10], 'penalty':['l2']},
    'Decision Tree': {'max_depth':[3,5,10,None], 'min_samples_split':[2,5,10]},
    'Random Forest': {'n_estimators':[50,100,200], 'max_depth':[5,10,None]},
    'KNN': {'n_neighbors':[3,5,7,10]}
}

grid_models = {'Logistic Regression': lr, 'Decision Tree': dt, 'Random Forest': rf, 'KNN': knn}

grid_results = []

for name, model in grid_models.items():
    grid = GridSearchCV(model, param_grid[name], scoring='f1', cv=5, n_jobs=-1)
    grid.fit(X_train_bal, y_train_bal)
    y_pred = grid.predict(X_test_final)
    grid_results.append({
        "Model": name,
        "Best_Params": grid.best_params_,
        "Accuracy": round(accuracy_score(y_test, y_pred),4),
        "Precision": round(precision_score(y_test, y_pred),4),
        "Recall": round(recall_score(y_test, y_pred),4),
        "F1-Score": round(f1_score(y_test, y_pred),4)
    })

grid_df = pd.DataFrame(grid_results)
print("Grid Search Results:")
print(grid_df)

# --- 2️⃣ Random Search Hyperparameter ---
from scipy.stats import randint

param_dist = {
    'Logistic Regression': {'C':[0.01,0.1,1,10]},
    'Decision Tree': {'max_depth':[3,5,10,None], 'min_samples_split':[2,5,10]},
    'Random Forest': {'n_estimators':[50,100,200], 'max_depth':[5,10,None]},
    'KNN': {'n_neighbors':[3,5,7,10]}
}

random_results = []

for name, model in grid_models.items():
    rand = RandomizedSearchCV(model, param_distributions=param_dist[name], n_iter=5, scoring='f1', cv=5, n_jobs=-1, random_state=42)
    rand.fit(X_train_bal, y_train_bal)
    y_pred = rand.predict(X_test_final)
    random_results.append({
        "Model": name,
        "Best_Params": rand.best_params_,
        "Accuracy": round(accuracy_score(y_test, y_pred),4),
        "Precision": round(precision_score(y_test, y_pred),4),
        "Recall": round(recall_score(y_test, y_pred),4),
        "F1-Score": round(f1_score(y_test, y_pred),4)
    })

random_df = pd.DataFrame(random_results)
print("Random Search Results:")
print(random_df)

# --- 3️⃣ Optuna Bayesian Hyperparameter Optimization ---
optuna_results = []

def objective(trial, model_name):
    if model_name=='Logistic Regression':
        C = trial.suggest_float('C', 0.01, 10.0, log=True)
        model = LogisticRegression(C=C, max_iter=1000)
    elif model_name=='Decision Tree':
        max_depth = trial.suggest_int('max_depth', 2, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
    elif model_name=='Random Forest':
        n_estimators = trial.suggest_int('n_estimators',50,200)
        max_depth = trial.suggest_int('max_depth',5,20)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    elif model_name=='KNN':
        n_neighbors = trial.suggest_int('n_neighbors',3,15)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    else:
        return 0
    
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test_final)
    return f1_score(y_test, y_pred)

for name in grid_models.keys():
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, name), n_trials=10)
    best_params = study.best_params
    # Train best model
    if name=='Logistic Regression':
        best_model = LogisticRegression(C=best_params['C'], max_iter=1000)
    elif name=='Decision Tree':
        best_model = DecisionTreeClassifier(max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'], random_state=42)
    elif name=='Random Forest':
        best_model = RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], random_state=42)
    elif name=='KNN':
        best_model = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'])
    
    best_model.fit(X_train_bal, y_train_bal)
    y_pred = best_model.predict(X_test_final)
    optuna_results.append({
        "Model": name,
        "Best_Params": best_params,
        "Accuracy": round(accuracy_score(y_test, y_pred),4),
        "Precision": round(precision_score(y_test, y_pred),4),
        "Recall": round(recall_score(y_test, y_pred),4),
        "F1-Score": round(f1_score(y_test, y_pred),4)
    })

optuna_df = pd.DataFrame(optuna_results)
print("Optuna Results:")
print(optuna_df)

# --- F1 bo'yicha eng yaxshi modelni saqlash ---
all_results = pd.concat([grid_df, random_df, optuna_df])
best_idx = all_results['F1-Score'].idxmax()
best_model_info = all_results.loc[best_idx]
best_model_name = str(best_model_info['Model'])
# save path
save_dir = r"C:\Users\Rasulbek907\Desktop\Hotel Booking Cancellation Prediction\Models"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, f"{best_model_name.replace(' ','_')}_Tuning_model.pkl")

# joblib bilan saqlash
joblib.dump(best_model, save_path)
print(f"Eng yaxshi model: {best_model_name}, saqlandi: {save_path}")

# Plotly jadval
colors = []
for i, row in all_results.iterrows():
    row_colors = []
    for metric in ['Accuracy','Precision','Recall','F1-Score']:
        if row[metric]>=0.8:
            row_colors.append('lightgreen')
        elif row[metric]<0.6:
            row_colors.append('lightcoral')
        else:
            row_colors.append('white')
    colors.append(['white'] + row_colors)

fig = go.Figure(data=[go.Table(
    header=dict(values=list(all_results.columns), fill_color='paleturquoise', align='center'),
    cells=dict(values=[all_results[col] for col in all_results.columns], fill_color=colors, align='center'))
])
fig.show()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Grid Search Results:
                 Model                               Best_Params  Accuracy  \
0  Logistic Regression              {'C': 0.01, 'penalty': 'l2'}    1.0000   
1        Decision Tree  {'max_depth': 3, 'min_samples_split': 2}    1.0000   
2        Random Forest      {'max_depth': 5, 'n_estimators': 50}    1.0000   
3                  KNN                        {'n_neighbors': 3}    0.5824   

   Precision  Recall  F1-Score  
0     1.0000  1.0000    1.0000  
1     1.0000  1.0000    1.0000  
2     1.0000  1.0000    1.0000  
3     0.4203  0.3361    0.3735  


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2025-12-20 12:23:27,710] A new study created in memory with name: no-name-14a63157-075a-4402-b149-3b72de280182


Random Search Results:
                 Model                                  Best_Params  Accuracy  \
0  Logistic Regression                                  {'C': 0.01}    1.0000   
1        Decision Tree  {'min_samples_split': 5, 'max_depth': None}    1.0000   
2        Random Forest     {'n_estimators': 100, 'max_depth': None}    1.0000   
3                  KNN                           {'n_neighbors': 3}    0.5824   

   Precision  Recall  F1-Score  
0     1.0000  1.0000    1.0000  
1     1.0000  1.0000    1.0000  
2     1.0000  1.0000    1.0000  
3     0.4203  0.3361    0.3735  


[I 2025-12-20 12:23:34,178] Trial 0 finished with value: 1.0 and parameters: {'C': 1.3850598112819665}. Best is trial 0 with value: 1.0.
[I 2025-12-20 12:23:40,475] Trial 1 finished with value: 1.0 and parameters: {'C': 0.7198478516676091}. Best is trial 0 with value: 1.0.
[I 2025-12-20 12:23:45,839] Trial 2 finished with value: 1.0 and parameters: {'C': 0.926200834625734}. Best is trial 0 with value: 1.0.
[I 2025-12-20 12:24:13,564] Trial 3 finished with value: 1.0 and parameters: {'C': 0.0926445595422713}. Best is trial 0 with value: 1.0.
[I 2025-12-20 12:24:28,600] Trial 4 finished with value: 1.0 and parameters: {'C': 0.37571782878234855}. Best is trial 0 with value: 1.0.
[I 2025-12-20 12:25:01,722] Trial 5 finished with value: 1.0 and parameters: {'C': 0.02597890475663539}. Best is trial 0 with value: 1.0.
[I 2025-12-20 12:25:27,490] Trial 6 finished with value: 1.0 and parameters: {'C': 0.09786010867331406}. Best is trial 0 with value: 1.0.
[I 2025-12-20 12:25:43,326] Trial 7 fin

Optuna Results:
                 Model                                Best_Params  Accuracy  \
0  Logistic Regression                  {'C': 1.3850598112819665}    1.0000   
1        Decision Tree  {'max_depth': 18, 'min_samples_split': 6}    1.0000   
2        Random Forest     {'n_estimators': 158, 'max_depth': 10}    1.0000   
3                  KNN                        {'n_neighbors': 15}    0.6099   

   Precision  Recall  F1-Score  
0     1.0000  1.0000    1.0000  
1     1.0000  1.0000    1.0000  
2     1.0000  1.0000    1.0000  
3     0.4619  0.3223    0.3797  


OSError: [Errno 22] Invalid argument: 'C:\\Users\\Rasulbek907\\Desktop\\Hotel Booking Cancellation Prediction\\Models\\0____Logistic_Regression\n0____Logistic_Regression\n0____Logistic_Regression\nName:_Model,_dtype:_object_Tuning_model.pkl'