In [15]:
import pandas as pd
import numpy as np

# -----------------------------------------
# 1. Compute null stats & preprocessing decisions
# -----------------------------------------
def compute_preprocessing_rules(train, row_drop_threshold=2, col_drop_threshold=40):
    """Derive the missing-value handling rules from the training data.

    Rows whose target (``spend_category``) is missing are ignored when the
    per-column percentage of missing values is measured. The input frame is
    not modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw training data; must contain a ``spend_category`` column.
    row_drop_threshold : float, default 2
        Columns whose missing percentage is strictly below this value are
        returned in ``rows_to_drop``. Columns with no missing values at all
        also satisfy this condition and are therefore included.
    col_drop_threshold : float, default 40
        Columns whose missing percentage is strictly above this value are
        returned in ``columns_to_drop``.

    Returns
    -------
    rows_to_drop : list of str
        Column names for which rows with a missing value should be dropped.
    columns_to_drop : list of str
        Column names to remove entirely. Always contains ``"arrival_weather"``
        exactly once (it is excluded by manual choice).
    cat_impute_rules : dict
        Manually chosen fill value per categorical column.
    """
    # dropna returns a new frame, so the caller's data is left untouched.
    labelled = train.dropna(subset=["spend_category"])

    # Percentage of missing values per column (NaN for an empty frame, which
    # makes both comparisons below False and yields empty rule lists).
    null_pct = labelled.isna().mean() * 100

    # Rule 1: columns sparse enough in nulls that the affected rows are dropped.
    rows_to_drop = null_pct[null_pct < row_drop_threshold].index.tolist()

    # Rule 2: columns too incomplete to keep.
    columns_to_drop = null_pct[null_pct > col_drop_threshold].index.tolist()
    # arrival_weather is dropped by manual decision; avoid listing it twice
    # when it already exceeded the threshold.
    if "arrival_weather" not in columns_to_drop:
        columns_to_drop.append("arrival_weather")

    # Rule 3: categorical imputations chosen manually.
    cat_impute_rules = {
        "travel_companions": "Alone",
        "days_booked_before_trip": "61-90"
    }

    return rows_to_drop, columns_to_drop, cat_impute_rules


In [16]:
def infer_trip_days(row):
    """Return the trip-length bucket for one row, inferring it when missing.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``total_trip_days``, ``mainland_stay_nights`` and
        ``island_stay_nights``.

    Returns
    -------
    The existing ``total_trip_days`` value when it is present. Otherwise the
    bucket label derived from the sum of the two stay-night fields:
    ``"1-6"`` (sum <= 6), ``"7-14"`` (<= 14), ``"15-30"`` (<= 30) or ``"30+"``.
    If the sum itself is missing, the original missing value is returned
    unchanged because nothing can be inferred.
    """
    recorded = row["total_trip_days"]
    if pd.notna(recorded):
        return recorded

    total = row["mainland_stay_nights"] + row["island_stay_nights"]

    # A NaN total compares False against every bound and would otherwise fall
    # through to "30+"; keep the value missing instead of inventing a label.
    if pd.isna(total):
        return recorded

    if total <= 6:
        return "1-6"
    elif total <= 14:
        return "7-14"
    elif total <= 30:
        return "15-30"
    else:
        return "30+"


In [17]:
def preprocess_train(train, rows_to_drop, columns_to_drop, cat_impute_rules):
    """Clean the training frame according to the precomputed rules.

    Steps, in order: discard rows without a ``spend_category`` target, remove
    exact duplicate rows, discard rows with a missing value in any column
    listed in ``rows_to_drop``, remove the columns in ``columns_to_drop``
    (names that are absent are ignored), fill the columns named in
    ``cat_impute_rules`` with their chosen value, and finally fill
    ``total_trip_days`` via ``infer_trip_days``.

    The input frame is not modified; a new frame is returned.
    """
    cleaned = train.dropna(subset=["spend_category"]).drop_duplicates()

    # A row survives only if it is complete in every low-missingness column.
    is_complete = cleaned[list(rows_to_drop)].notna().all(axis=1)
    cleaned = cleaned.loc[is_complete].drop(columns=columns_to_drop, errors="ignore")

    # Manually chosen fill values, applied only to columns that still exist.
    for name, default in cat_impute_rules.items():
        if name not in cleaned.columns:
            continue
        cleaned[name] = cleaned[name].fillna(default)

    # Row-wise fill of the trip-length bucket.
    cleaned["total_trip_days"] = cleaned.apply(infer_trip_days, axis=1)

    return cleaned


In [18]:
def preprocess_test(test, train, rows_to_drop, columns_to_drop, cat_impute_rules):
    """Clean the test frame without dropping any rows.

    Parameters
    ----------
    test : pandas.DataFrame
        Raw test data. Not modified; a new frame is returned.
    train : pandas.DataFrame
        Training frame from which fill statistics are read.
    rows_to_drop : list of str
        Columns for which training rows with missing values were dropped.
        Test rows are kept, so missing values in these columns are filled
        from ``train``: the median for numeric columns, the most frequent
        value otherwise.
    columns_to_drop : list of str
        Columns to remove (names that are absent are ignored).
    cat_impute_rules : dict
        Manually chosen fill value per categorical column.

    Returns
    -------
    pandas.DataFrame
        The cleaned test frame with ``total_trip_days`` filled via
        ``infer_trip_days``.
    """
    # drop() returns a new frame, so the caller's data is left untouched.
    test = test.drop(columns=columns_to_drop, errors="ignore")

    # Same manually chosen imputations as for the training data.
    for col, fill_value in cat_impute_rules.items():
        if col in test.columns:
            test[col] = test[col].fillna(fill_value)

    for col in rows_to_drop:
        # Skip columns missing on either side (for example the target, which
        # the test data may not contain) and columns with nothing to fill.
        if col not in test.columns or col not in train.columns:
            continue
        if not test[col].isna().any():
            continue

        reference = train[col].dropna()
        if reference.empty:
            # No observed training value to derive a fill statistic from.
            continue

        # Decide by "is numeric" rather than "is object": string and
        # categorical columns are not always stored with object dtype, and a
        # median is not meaningful for them (nor for booleans).
        use_median = (
            pd.api.types.is_numeric_dtype(reference)
            and not pd.api.types.is_bool_dtype(reference)
        )
        fill = reference.median() if use_median else reference.mode().iloc[0]
        test[col] = test[col].fillna(fill)

    # Row-wise fill of the trip-length bucket.
    test["total_trip_days"] = test.apply(infer_trip_days, axis=1)

    return test


In [19]:
# Load the raw training and test sets from CSV files in the working directory
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Derive the null-handling rules from the training data only
rows_to_drop, columns_to_drop, cat_impute_rules = compute_preprocessing_rules(train)

# Apply preprocessing. The cleaned training frame (not the raw one) is handed
# to preprocess_test, which reads its fill values (mode / median) from it.
train_clean = preprocess_train(train, rows_to_drop, columns_to_drop, cat_impute_rules)
test_clean = preprocess_test(test, train_clean, rows_to_drop, columns_to_drop, cat_impute_rules)

# Sanity check: remaining missing values per column in each cleaned frame
print(train_clean.isna().sum())
print(test_clean.isna().sum())


trip_id                        0
country                        0
age_group                      0
travel_companions              0
num_females                    0
num_males                      0
main_activity                  0
visit_purpose                  0
is_first_visit                 0
mainland_stay_nights           0
island_stay_nights             0
tour_type                      0
intl_transport_included        0
info_source                    0
accomodation_included          0
food_included                  0
domestic_transport_included    0
sightseeing_included           0
guide_included                 0
insurance_included             0
days_booked_before_trip        0
total_trip_days                0
spend_category                 0
dtype: int64
trip_id                        0
country                        0
age_group                      0
travel_companions              0
num_females                    0
num_males                      0
main_activity                 

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# Features: everything except the target and the row identifier.
X = train_clean.drop(columns=["spend_category", "trip_id"])
y = train_clean["spend_category"]

# Hold out 20% for validation, stratified on the target, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
# "number" matches every numeric dtype, not only int64/float64. A numeric
# column left out of both lists would be silently discarded by the
# ColumnTransformer below, whose default is to drop unlisted columns.
num_cols = X.select_dtypes(include="number").columns.tolist()

# One-hot encode categoricals (categories unseen during fit are encoded as
# all zeros) and standardise numeric features.
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ("num", StandardScaler(), num_cols)
])

svm_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("svm", SVC(kernel="rbf", C=3, gamma="scale", class_weight="balanced"))
])

# Fit on the training split only and report accuracy on the held-out split.
svm_pipeline.fit(X_train, y_train)
pred = svm_pipeline.predict(X_val)
print("Val accuracy:", accuracy_score(y_val, pred))


Val accuracy: 0.7292481529769665


In [21]:
# %%
# ---------------------------------------------------------------
# SVM submission from the pipeline fitted in the training cell
# ---------------------------------------------------------------

# Predict a spend category for every cleaned test row.
svm_test_pred = svm_pipeline.predict(test_clean)

# Pair each test trip_id with its prediction, then write the file without
# the index column.
sub_svm = test[["trip_id"]].assign(spend_category=svm_test_pred)
sub_svm.to_csv("svm_submission.csv", index=False)

sub_svm.head()

Unnamed: 0,trip_id,spend_category
0,tour_id8gzpck76,2.0
1,tour_idow1zxkou,0.0
2,tour_idue7esfqz,0.0
3,tour_idnj3mjzpb,0.0
4,tour_ida3us5yk2,0.0


In [23]:
# ===============================================================
# Train final model on full training data (after selecting best model)
# ===============================================================
from sklearn.base import clone

# clone() builds an unfitted copy with the same hyperparameters. A plain
# assignment would only alias svm_pipeline, so fitting the final model would
# overwrite the pipeline that was evaluated on the validation split.
final_svm_pipeline = clone(svm_pipeline)  # after hyperparameter tuning

# fit on the FULL cleaned training data
final_svm_pipeline.fit(X, y)

# ===============================================================
# Predict on test data
# ===============================================================

svm_test_pred = final_svm_pipeline.predict(test_clean)

sub_svm = pd.DataFrame({
    "trip_id": test["trip_id"],
    "spend_category": svm_test_pred
})

# Writes to the same path as the earlier submission cell, replacing that file.
sub_svm.to_csv("svm_submission.csv", index=False)
