<a href="https://colab.research.google.com/github/staerkjoe/QuantRisk_Colab/blob/main/TicketCancellation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Kaggle handle (owner/dataset-slug) of the dataset used in this notebook.
dataset_handle = "pkdarabi/classification-of-travel-purpose"

# Fetch the latest version; `path` is the local directory that later cells
# list and read from.
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'classification-of-travel-purpose' dataset.
Path to dataset files: /kaggle/input/classification-of-travel-purpose


Link to dataset:
https://www.kaggle.com/datasets/pkdarabi/classification-of-travel-purpose

In [None]:
!pip install unidecode



In [None]:
import tensorflow as tf
import tf_keras as keras
import pandas as pd
# Note: from TensorFlow 2.17 onwards, tf.keras uses Keras 3 internally,
# which still seems to be buggy and inconsistent.
# We therefore use tf_keras, in accordance with the installed TensorFlow version.

import numpy as np
import os
import time
from unidecode import unidecode

In [None]:
# Show what the download directory contains so the CSV name can be picked up.
for entry in os.listdir(path):
    print(entry)

train_data.csv


In [None]:
# Load the CSV listed in the dataset directory into the working DataFrame.
csv_path = os.path.join(path, "train_data.csv")
df = pd.read_csv(csv_path)

In [None]:
# Remove columns that cannot or should not be used as predictors.
drop_cols = [
    # Unique identifiers
    'BillID',
    'TicketID',
    # Sparse and semi-unique
    'UserID',
    # Personal identifiers
    'BuyerMobile',
    'NationalCode',
    # Sparse and hashed
    'HashPassportNumber_p',
    'HashEmail',
    # Leaks the target label
    'CancelTime',
    # NOTE(review): no reason is recorded for this one - presumably redundant
    # with 'Vehicle'; confirm.
    'VehicleType',
]

# Raises KeyError if any listed column is absent (e.g. when the cell is re-run).
df = df.drop(drop_cols, axis='columns')

# Sanity check: show what is left.
print("Remaining columns:", list(df.columns))




Remaining columns: ['Created', 'DepartureTime', 'ReserveStatus', 'Male', 'Price', 'CouponDiscount', 'From', 'To', 'Domestic', 'VehicleClass', 'TripReason', 'Vehicle', 'Cancel']


In [None]:
# Rows without an origin or a destination cannot be transliterated; drop them.
df = df.dropna(subset=['From', 'To'])

# Every city name that occurs as an origin or as a destination.
all_cities = df['From'].tolist() + df['To'].tolist()

# De-duplicate while keeping first-seen order, so the mapping is built in the
# same order on every run (a set would give a run-dependent order).
unique_cities = list(dict.fromkeys(all_cities))

# Map each raw city name to its ASCII transliteration.
city_mapping = {city: unidecode(city) for city in unique_cities}

# To inspect the mapping, display a few entries, e.g.:
#     dict(list(city_mapping.items())[:10])

'\n# Ausgabe als Dictionary\nprint("city_mapping = {")\nfor k, v in city_mapping.items():\n    print(f"    \'{k}\': \'{v}\',")\nprint("}")\n'

In [None]:
def normalize_city_name(name):
    """Clean a raw city name before transliteration.

    Removes zero-width non-joiner (U+200C) and zero-width joiner (U+200D)
    characters, trims surrounding whitespace and drops the space in front of
    an opening parenthesis.

    Parameters
    ----------
    name : object
        Raw cell value; converted with ``str`` unless it is missing.

    Returns
    -------
    str or None
        The cleaned name, or ``None`` when ``name`` is NaN/None.
    """
    if pd.isna(name):
        return None
    # Both characters are invisible, so they are spelled as escapes to make
    # it obvious in the source which code points are removed.
    zero_width = str.maketrans('', '', '\u200c\u200d')
    # Strip only after the invisible characters are gone; otherwise
    # whitespace sitting next to one of them at either end would survive.
    cleaned = str(name).translate(zero_width).strip()
    return cleaned.replace(' (', '(')

def safe_unidecode(value):
    """Transliterate ``value`` to ASCII, mapping missing values to ``""``.

    Non-missing values are converted with ``str`` before being passed to
    ``unidecode``.
    """
    return "" if pd.isna(value) else unidecode(str(value))

# Clean and transliterate both city columns directly.
# `city_mapping` is keyed by the *raw* names, so looking a cleaned name up in
# it fails whenever cleaning changed the string; such rows would end up with an
# empty name and be filtered out below even though the city is known.
for city_col in ('From', 'To'):
    df[city_col] = df[city_col].apply(normalize_city_name).apply(safe_unidecode)

# Keep only rows where both ends of the trip resolved to a non-empty name
# (safe_unidecode returns "" for missing values).
has_both_cities = (df['From'] != "") & (df['To'] != "")
df = df[has_both_cities].copy()

In [None]:
# Summary statistics of the ticket price before the quantile filter and the
# currency conversion applied in the following cells.
df['Price'].describe()

Unnamed: 0,Price
count,91672.0
mean,3333467.0
std,7696764.0
min,-1514000.0
25%,1025000.0
50%,1680000.0
75%,3150000.0
max,383764600.0


In [None]:
# Keep only tickets priced between the first and third quartile (inclusive).
# NOTE(review): this keeps just the interquartile range, i.e. it discards
# roughly half of the rows rather than only extreme prices - confirm intended.
q25, q75 = df['Price'].quantile(0.25), df['Price'].quantile(0.75)

df = df[df['Price'].between(q25, q75)].copy()



In [None]:
# Column transformations
# NOTE: this cell is not idempotent - running it twice converts the prices twice.

# Parse both timestamp columns into datetimes.
for ts_col in ('Created', 'DepartureTime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Convert monetary amounts to EUR, rounded to cents.
exchange_rate = 0.00002027  # 1 IRR = 0.00002027 EUR
for money_col in ('Price', 'CouponDiscount'):
    df[money_col] = (df[money_col] * exchange_rate).round(2)


In [None]:
# Feature engineering
df = df.assign(
    # Time between booking and departure (a timedelta).
    LeadTime=lambda d: d['DepartureTime'] - d['Created'],
    # True when the departure weekday index is 5 or 6.
    # NOTE(review): prices are in IRR, so the data is presumably Iranian, where
    # the weekend may not fall on these days - confirm.
    IsWeekend=lambda d: d['DepartureTime'].dt.weekday >= 5,
    # Share of the price covered by the coupon; 0 where the price is not positive.
    DiscountRatio=lambda d: np.where(d['Price'] > 0, d['CouponDiscount'] / d['Price'], 0),
)

In [None]:
# Rename the column
df = df.rename(columns={'VehicleClass': 'VIPStatus'})

# Missing values become their own 'Unknown' category. The cast to str keeps the
# column uniformly typed: filling a boolean column with a string would leave
# mixed bool/str values, which OneHotEncoder refuses to encode.
df['VIPStatus'] = df['VIPStatus'].fillna('Unknown').astype(str)


In [None]:
# Preview the first rows of the working frame.
# NOTE(review): the saved output of this cell already contains FromFreq/ToFreq
# and lacks From/To, although those columns are only changed in the next code
# cell - the output was apparently produced out of order; re-run top to bottom.
df.head()

Unnamed: 0,Created,DepartureTime,ReserveStatus,Male,Price,CouponDiscount,Domestic,VIPStatus,TripReason,Vehicle,Cancel,LeadTime,IsWeekend,DiscountRatio,FromFreq,ToFreq
2,2022-09-20 17:25:27.250,2022-09-21 11:00:00,3,False,40.54,0.0,1,True,Work,Bus,0,0 days 17:34:32.750000,False,0.0,3394,1311
7,2022-07-28 13:59:44.843,2022-07-28 16:00:00,3,True,50.27,0.0,1,True,Work,Bus,0,0 days 02:00:15.157000,False,0.0,14641,2671
8,2022-10-23 09:57:18.867,2022-10-24 02:00:00,3,True,24.73,0.0,1,True,Work,Bus,0,0 days 16:02:41.133000,False,0.0,14641,3607
9,2022-03-24 22:16:40.203,2022-03-28 13:00:00,3,True,26.35,0.0,1,True,Int,Bus,0,3 days 14:43:19.797000,False,0.0,14641,1327
14,2022-09-13 20:13:39.853,2022-09-30 20:55:00,2,True,23.48,0.0,1,Unknown,Int,Train,0,17 days 00:41:20.147000,False,0.0,1759,15161


In [None]:
# Frequency-encode the cities: there are many distinct cities, so other
# encodings would not make sense. The price is a loss of interpretability.
# NOTE(review): the counts are taken over the whole frame, before the
# train/test split performed later - confirm this is acceptable.
for city_col in ('From', 'To'):
    occurrences = df[city_col].value_counts()
    df[f'{city_col}Freq'] = df[city_col].map(occurrences)

# The raw city columns are no longer needed.
df = df.drop(columns=['From', 'To'])

In [None]:
# Distribution check of the engineered DiscountRatio feature.
# The saved output shows a maximum above 1, i.e. at least one discount that
# exceeds the ticket price.
df['DiscountRatio'].describe()

Unnamed: 0,DiscountRatio
count,45901.0
mean,0.001288
std,0.015095
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.113043


In [None]:
# Category sizes of 'Vehicle'; the saved output shows that the plane
# categories are very rare compared with Bus and Train.
df['Vehicle'].value_counts()

Unnamed: 0_level_0,count
Vehicle,Unnamed: 1_level_1
Bus,26712
Train,19001
Plane,184
InternationalPlane,4


In [162]:
# Snapshot of the cleaned frame, independent of later changes to `df`.
cleaned_data = df.copy(deep=True)

In [None]:
# Class balance of the target; the saved output shows far fewer cancelled (1)
# than non-cancelled (0) tickets.
cleaned_data['Cancel'].value_counts()

Unnamed: 0_level_0,count
Cancel,Unnamed: 1_level_1
0,38291
1,7610


In [163]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


# Columns that are one-hot encoded.
categorical_cols = ['TripReason', 'VIPStatus','Vehicle']

# One-hot encode the categorical columns (categories unseen during fit are
# ignored); every other column is passed through unchanged.
encoder = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough',
)

# Scaler used as the 'scaling' step of the pipeline.
scaler = StandardScaler()


In [164]:
from sklearn.model_selection import train_test_split

# Features: everything except the target and the raw timestamps. Datetime
# columns cannot be cast to float, so they would break the scaler and the
# classifier; their information is carried by LeadTime and IsWeekend.
X = df.drop(columns=['Cancel', 'Created', 'DepartureTime'])
# LeadTime is a timedelta for the same reason; express it as a number of hours.
X['LeadTime'] = X['LeadTime'].dt.total_seconds() / 3600
y = df['Cancel']               # Target

# Stratified 80/20 split so both parts keep the class ratio of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from imblearn.over_sampling import SMOTE

# Oversampler used as the 'smote' step of the pipeline; seeded for reproducibility.
smote = SMOTE(random_state=42)

In [None]:
# Logistic-regression classifier used as the final pipeline step.
# NOTE(review): class_weight='balanced' is combined with a SMOTE step in the
# pipeline, so class imbalance is compensated twice - confirm intended.
model = LogisticRegression(
    solver='liblinear',       # the only solver explored in the grid search
    penalty='l2',             # starting value; the grid search also tries 'l1'
    class_weight='balanced',  # reweights classes to counter the imbalance
    max_iter=1000,            # increase if convergence warnings appear
    random_state=42,          # ensures reproducibility
)

In [None]:
# SMOTE is a sampler (it has no `transform`), so it cannot be an intermediate
# step of sklearn.pipeline.Pipeline. imbalanced-learn's Pipeline accepts
# samplers and applies them while fitting only, so validation and test data
# are never resampled.
from imblearn.pipeline import Pipeline as ImbPipeline

pipeline = ImbPipeline([
    ('preprocessing', encoder),  # one-hot encoding, other columns passed through
    ('scaling', scaler),
    ('smote', smote),            # oversampling, after scaling
    ('model', model)
])

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; the 'model__' prefix addresses the 'model' step of the
# pipeline.
param_grid = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__penalty': ['l1','l2'],
    'model__solver': ['liblinear'],
}

# Exhaustive search over the grid.
# NOTE(review): accuracy is used although the target is imbalanced - confirm
# that this is the metric to optimise.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',  # or use another metric
    cv=5,                # 5-fold cross-validation
    n_jobs=-1,           # use all available cores
)

In [None]:
# Run the grid search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validation score.
for label, value in (
    ("Best parameters:", grid_search.best_params_),
    ("Best cross-validation score:", grid_search.best_score_),
):
    print(label, value)