In [1]:
import pandas as pd

df = pd.read_csv("../data/final_preprocessed.csv")
df.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,row_id
0,Resort Hotel,0,122,2017,February,6,10,1,2,2,...,241.0,,0,Transient,42.81,0,1,Check-Out,2017-02-13,2
1,Resort Hotel,0,32,2017,February,6,8,1,4,2,...,240.0,,0,Transient-Party,68.0,0,2,Check-Out,2017-02-13,3
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,,,0,Transient,75.0,0,0,Check-Out,2015-07-02,5
3,Resort Hotel,0,22,2017,February,7,12,1,0,2,...,242.0,,0,Transient,73.06,0,1,Check-Out,2017-02-13,6
4,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02,8


In [2]:
df.shape


(85369, 33)

In [3]:
df["is_canceled"].value_counts(normalize=True)


is_canceled
0    0.721714
1    0.278286
Name: proportion, dtype: float64

In [4]:
df.isna().sum().sort_values(ascending=False).head(15)


company                        80498
agent                          11288
country                          436
hotel                              0
reserved_room_type                 0
assigned_room_type                 0
booking_changes                    0
deposit_type                       0
days_in_waiting_list               0
previous_cancellations             0
customer_type                      0
adr                                0
required_car_parking_spaces        0
total_of_special_requests          0
reservation_status                 0
dtype: int64

In [5]:
target = "is_canceled"
X = df.drop(columns=[target])
y = df[target]


In [6]:
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

cat_cols, num_cols[:10], len(cat_cols), len(num_cols)


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns.tolist()


(['hotel',
  'arrival_date_month',
  'meal',
  'country',
  'market_segment',
  'distribution_channel',
  'reserved_room_type',
  'assigned_room_type',
  'deposit_type',
  'customer_type',
  'reservation_status',
  'reservation_status_date'],
 ['lead_time',
  'arrival_date_year',
  'arrival_date_week_number',
  'arrival_date_day_of_month',
  'stays_in_weekend_nights',
  'stays_in_week_nights',
  'adults',
  'children',
  'babies',
  'is_repeated_guest'],
 12,
 20)

In [10]:
leak_cols = [
    "reservation_status",
    "reservation_status_date",
]

X = df.drop(columns=[target] + [c for c in leak_cols if c in df.columns])
y = df[target]


In [11]:
post_booking_cols = [
    "assigned_room_type",
    "booking_changes",
]
X = X.drop(columns=[c for c in post_booking_cols if c in X.columns])


In [12]:
cat_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer

cat_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False))  # 희소행렬 호환
])

preprocess = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_cols),
        ("num", num_pipe, num_cols),
    ]
)

model = LogisticRegression(
    max_iter=2000,
    solver="saga",          # 희소 + 대규모에 강함
    n_jobs=-1
)

pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe.fit(X_train, y_train)

pred = pipe.predict(X_test)
proba = pipe.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, proba))
print(classification_report(y_test, pred))




ROC-AUC: 0.8201679883124842
              precision    recall  f1-score   support

           0       0.79      0.95      0.86     12323
           1       0.72      0.34      0.46      4751

    accuracy                           0.78     17074
   macro avg       0.75      0.64      0.66     17074
weighted avg       0.77      0.78      0.75     17074





In [15]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

thresholds = [0.2, 0.3, 0.4, 0.5]
for t in thresholds:
    pred_t = (proba >= t).astype(int)
    p, r, f, _ = precision_recall_fscore_support(y_test, pred_t, average="binary")
    print(f"t={t:.1f}  precision={p:.3f}  recall={r:.3f}  f1={f:.3f}")


t=0.2  precision=0.423  recall=0.903  f1=0.576
t=0.3  precision=0.520  recall=0.735  f1=0.609
t=0.4  precision=0.627  recall=0.548  f1=0.585
t=0.5  precision=0.715  recall=0.341  f1=0.462


In [16]:
final_threshold = 0.3

df["risk_score"] = pipe.predict_proba(X)[:, 1]
df["risk_group"] = pd.cut(
    df["risk_score"],
    bins=[0, 0.2, 0.3, 1.0],
    labels=["Low", "Medium", "High"]
)

df[["risk_score", "risk_group"]].value_counts()


risk_score  risk_group
0.246364    Medium        1
0.075972    Low           1
0.078711    Low           1
0.240510    Medium        1
0.104238    Low           1
                         ..
0.010015    High          0
0.241638    Low           0
            High          0
0.413714    Medium        0
            Low           0
Name: count, Length: 256107, dtype: int64

In [17]:
# 전처리 객체 가져오기
preprocessor = pipe.named_steps["preprocess"]

# 범주형 피처 이름
cat_feature_names = (
    preprocessor.named_transformers_["cat"]
    .named_steps["onehot"]
    .get_feature_names_out(cat_cols)
)

# 수치형 피처 이름
num_feature_names = num_cols

# 전체 피처 이름
feature_names = list(cat_feature_names) + list(num_feature_names)

len(feature_names)


238

In [18]:
import pandas as pd
import numpy as np

coef = pipe.named_steps["model"].coef_[0]

coef_df = pd.DataFrame({
    "feature": feature_names,
    "coef": coef
})

coef_df["abs_coef"] = coef_df["coef"].abs()
coef_df = coef_df.sort_values("abs_coef", ascending=False)


In [19]:
coef_df[coef_df["coef"] > 0].head(10)


Unnamed: 0,feature,coef,abs_coef
152,country_PRT,0.732526,0.732526
197,market_segment_Online TA,0.609736,0.609736
219,lead_time,0.435464,0.435464
217,customer_type_Transient,0.362076,0.362076
229,previous_cancellations,0.308095,0.308095
234,adr,0.265681,0.265681
201,distribution_channel_TA/TO,0.215967,0.215967
213,deposit_type_Non Refund,0.172268,0.172268
17,meal_SC,0.140193,0.140193
0,hotel_City Hotel,0.137433,0.137433


In [20]:
coef_df[coef_df["coef"] < 0].tail(10)


Unnamed: 0,feature,coef,abs_coef
180,country_UGA,-7.1e-05,7.1e-05
160,country_SDN,-7e-05,7e-05
171,country_SYC,-6.8e-05,6.8e-05
33,country_BDI,-6.4e-05,6.4e-05
59,country_CYM,-6.3e-05,6.3e-05
151,country_PRI,-6.1e-05,6.1e-05
123,country_MDG,-5.5e-05,5.5e-05
134,country_MWI,-4.8e-05,4.8e-05
202,distribution_channel_Undefined,-2.8e-05,2.8e-05
130,country_MNE,-6e-06,6e-06


In [21]:
meaningful_features = coef_df[
    ~coef_df["feature"].str.startswith("country_")
]

# 위험 증가 요인 TOP 5
risk_up = meaningful_features[meaningful_features["coef"] > 0].head(5)

# 위험 감소 요인 TOP 5
risk_down = meaningful_features[meaningful_features["coef"] < 0].tail(5)

risk_up, risk_down


(                      feature      coef  abs_coef
 197  market_segment_Online TA  0.609736  0.609736
 219                 lead_time  0.435464  0.435464
 217   customer_type_Transient  0.362076  0.362076
 229    previous_cancellations  0.308095  0.308095
 234                       adr  0.265681  0.265681,
                             feature      coef  abs_coef
 204            reserved_room_type_B -0.000820  0.000820
 220               arrival_date_year -0.000734  0.000734
 209            reserved_room_type_G -0.000319  0.000319
 211            reserved_room_type_L -0.000134  0.000134
 202  distribution_channel_Undefined -0.000028  0.000028)