In [307]:
# ===============================================================
# Imports
# ===============================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC

from sklearn.base import TransformerMixin, BaseEstimator

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


In [308]:
# ===============================================================
# Load data
# ===============================================================

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train = train.dropna(subset=["spend_category"])#some columns had null!

# y.isna().sum()
# y.shape


In [309]:
train["spend_category"].value_counts()

spend_category
0.0    6245
1.0    4911
2.0    1464
Name: count, dtype: int64

In [310]:
train.drop_duplicates(inplace=True)
train

Unnamed: 0,trip_id,country,age_group,travel_companions,num_females,num_males,main_activity,visit_purpose,is_first_visit,mainland_stay_nights,...,food_included,domestic_transport_included,sightseeing_included,guide_included,insurance_included,days_booked_before_trip,arrival_weather,total_trip_days,has_special_requirements,spend_category
0,tour_idftaa27vp,FRANCE,45-64,With Spouse and Children,1.0,2.0,Beach Tourism,Leisure and Holidays,Yes,0,...,No,No,No,No,No,,"cloudy,",30+,,1.0
1,tour_iduck75m57,KENYA,45-64,Alone,1.0,0.0,Conference Tourism,Meetings and Conference,Yes,6,...,No,No,No,No,No,15-30,"sunny,",30+,,2.0
2,tour_id8y3w40h8,SOUTH AFRICA,25-44,With Other Friends/Relatives,2.0,0.0,Cultural Tourism,Meetings and Conference,No,4,...,No,No,No,No,No,90+,"sunny,",30+,none,2.0
3,tour_idkoh8mkgr,ITALY,25-44,With Spouse,1.0,1.0,Widlife Tourism,Leisure and Holidays,Yes,0,...,Yes,Yes,Yes,Yes,No,8-14,,,none,0.0
4,tour_idkmsfa00a,ITALY,25-44,With Spouse,1.0,1.0,Beach Tourism,Leisure and Holidays,Yes,0,...,Yes,No,No,No,No,90+,"sunny,",7-14,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12649,tour_idlln8s2r3,NETHERLANDS,25-44,With Other Friends/Relatives,0.0,2.0,Wildlife Tourism,Business,No,7,...,Yes,Yes,Yes,No,No,,Stormy,30+,,0.0
12650,tour_id3ta5n81h,KENYA,25-44,Alone,1.0,0.0,Hunting Tourism,Meetings and Conference,No,10,...,No,No,No,No,No,61-90,humid,7-14,,1.0
12651,tour_id88nvd60z,UNITED STATES OF AMERICA,45-64,Alone,0.0,1.0,Widlife Tourism,Scientific and Academic,No,9,...,No,No,No,No,No,90+,"sunny,",1-6,,1.0
12652,tour_iddw9van5i,ITALY,25-44,With Spouse,1.0,1.0,Widlife Tourism,Leisure and Holidays,Yes,7,...,Yes,Yes,Yes,Yes,Yes,61-90,"sunny,",7-14,,0.0


In [311]:
train.isna().sum()

trip_id                           0
country                         227
age_group                         8
travel_companions               733
num_females                       2
num_males                         4
main_activity                   128
visit_purpose                     0
is_first_visit                   98
mainland_stay_nights              0
island_stay_nights                0
tour_type                         0
intl_transport_included         146
info_source                       0
accomodation_included           141
food_included                   171
domestic_transport_included       0
sightseeing_included              0
guide_included                    0
insurance_included              235
days_booked_before_trip        1547
arrival_weather                3395
total_trip_days                 402
has_special_requirements       8086
spend_category                    0
dtype: int64

In [312]:
train.isna().sum()
null_value_percentages=(train.isna().sum()/train.shape[0])*100
null_value_percentages

trip_id                         0.000000
country                         1.798732
age_group                       0.063391
travel_companions               5.808241
num_females                     0.015848
num_males                       0.031696
main_activity                   1.014263
visit_purpose                   0.000000
is_first_visit                  0.776545
mainland_stay_nights            0.000000
island_stay_nights              0.000000
tour_type                       0.000000
intl_transport_included         1.156894
info_source                     0.000000
accomodation_included           1.117274
food_included                   1.354992
domestic_transport_included     0.000000
sightseeing_included            0.000000
guide_included                  0.000000
insurance_included              1.862124
days_booked_before_trip        12.258320
arrival_weather                26.901743
total_trip_days                 3.185420
has_special_requirements       64.072900
spend_category  

In [313]:
# i see that the null values are
rows_to_drop=null_value_percentages[null_value_percentages<2]
rows_to_drop

trip_id                        0.000000
country                        1.798732
age_group                      0.063391
num_females                    0.015848
num_males                      0.031696
main_activity                  1.014263
visit_purpose                  0.000000
is_first_visit                 0.776545
mainland_stay_nights           0.000000
island_stay_nights             0.000000
tour_type                      0.000000
intl_transport_included        1.156894
info_source                    0.000000
accomodation_included          1.117274
food_included                  1.354992
domestic_transport_included    0.000000
sightseeing_included           0.000000
guide_included                 0.000000
insurance_included             1.862124
spend_category                 0.000000
dtype: float64

In [314]:
rows_to_drop=rows_to_drop.keys()
rows_to_drop

Index(['trip_id', 'country', 'age_group', 'num_females', 'num_males',
       'main_activity', 'visit_purpose', 'is_first_visit',
       'mainland_stay_nights', 'island_stay_nights', 'tour_type',
       'intl_transport_included', 'info_source', 'accomodation_included',
       'food_included', 'domestic_transport_included', 'sightseeing_included',
       'guide_included', 'insurance_included', 'spend_category'],
      dtype='object')

In [315]:
for row in rows_to_drop:
    print("row:",row,"\tmissing values:\t",train[row].isna().sum())

row: trip_id 	missing values:	 0
row: country 	missing values:	 227
row: age_group 	missing values:	 8
row: num_females 	missing values:	 2
row: num_males 	missing values:	 4
row: main_activity 	missing values:	 128
row: visit_purpose 	missing values:	 0
row: is_first_visit 	missing values:	 98
row: mainland_stay_nights 	missing values:	 0
row: island_stay_nights 	missing values:	 0
row: tour_type 	missing values:	 0
row: intl_transport_included 	missing values:	 146
row: info_source 	missing values:	 0
row: accomodation_included 	missing values:	 141
row: food_included 	missing values:	 171
row: domestic_transport_included 	missing values:	 0
row: sightseeing_included 	missing values:	 0
row: guide_included 	missing values:	 0
row: insurance_included 	missing values:	 235
row: spend_category 	missing values:	 0


In [316]:
train.shape

(12620, 25)

In [317]:
for row in rows_to_drop:
    if(null_value_percentages[row]<2):
        train.drop(labels=train.index[train[row].isna()],inplace=True)

In [318]:
train

Unnamed: 0,trip_id,country,age_group,travel_companions,num_females,num_males,main_activity,visit_purpose,is_first_visit,mainland_stay_nights,...,food_included,domestic_transport_included,sightseeing_included,guide_included,insurance_included,days_booked_before_trip,arrival_weather,total_trip_days,has_special_requirements,spend_category
0,tour_idftaa27vp,FRANCE,45-64,With Spouse and Children,1.0,2.0,Beach Tourism,Leisure and Holidays,Yes,0,...,No,No,No,No,No,,"cloudy,",30+,,1.0
1,tour_iduck75m57,KENYA,45-64,Alone,1.0,0.0,Conference Tourism,Meetings and Conference,Yes,6,...,No,No,No,No,No,15-30,"sunny,",30+,,2.0
2,tour_id8y3w40h8,SOUTH AFRICA,25-44,With Other Friends/Relatives,2.0,0.0,Cultural Tourism,Meetings and Conference,No,4,...,No,No,No,No,No,90+,"sunny,",30+,none,2.0
3,tour_idkoh8mkgr,ITALY,25-44,With Spouse,1.0,1.0,Widlife Tourism,Leisure and Holidays,Yes,0,...,Yes,Yes,Yes,Yes,No,8-14,,,none,0.0
4,tour_idkmsfa00a,ITALY,25-44,With Spouse,1.0,1.0,Beach Tourism,Leisure and Holidays,Yes,0,...,Yes,No,No,No,No,90+,"sunny,",7-14,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12649,tour_idlln8s2r3,NETHERLANDS,25-44,With Other Friends/Relatives,0.0,2.0,Wildlife Tourism,Business,No,7,...,Yes,Yes,Yes,No,No,,Stormy,30+,,0.0
12650,tour_id3ta5n81h,KENYA,25-44,Alone,1.0,0.0,Hunting Tourism,Meetings and Conference,No,10,...,No,No,No,No,No,61-90,humid,7-14,,1.0
12651,tour_id88nvd60z,UNITED STATES OF AMERICA,45-64,Alone,0.0,1.0,Widlife Tourism,Scientific and Academic,No,9,...,No,No,No,No,No,90+,"sunny,",1-6,,1.0
12652,tour_iddw9van5i,ITALY,25-44,With Spouse,1.0,1.0,Widlife Tourism,Leisure and Holidays,Yes,7,...,Yes,Yes,Yes,Yes,Yes,61-90,"sunny,",7-14,,0.0


In [319]:
columns_to_drop=null_value_percentages[null_value_percentages>40]
columns_to_drop

has_special_requirements    64.0729
dtype: float64

In [320]:
columns_to_drop=columns_to_drop.keys()
columns_to_drop
train.drop(columns=columns_to_drop,inplace=True)
train.shape

(11505, 24)

#now only need to deal with 4 columns with null values

In [321]:
train.isna().sum()

trip_id                           0
country                           0
age_group                         0
travel_companions               667
num_females                       0
num_males                         0
main_activity                     0
visit_purpose                     0
is_first_visit                    0
mainland_stay_nights              0
island_stay_nights                0
tour_type                         0
intl_transport_included           0
info_source                       0
accomodation_included             0
food_included                     0
domestic_transport_included       0
sightseeing_included              0
guide_included                    0
insurance_included                0
days_booked_before_trip        1408
arrival_weather                3094
total_trip_days                 371
spend_category                    0
dtype: int64

In [322]:
train.drop_duplicates(inplace=True)
train.shape

(11505, 24)

In [323]:
null_value_percentages=(train.isna().sum()/train.shape[0])*100
null_value_percentages=null_value_percentages[null_value_percentages>0]
columns_to_impute=null_value_percentages.keys()
columns_to_impute

Index(['travel_companions', 'days_booked_before_trip', 'arrival_weather',
       'total_trip_days'],
      dtype='object')

In [324]:
for column in columns_to_impute:
	if(train[column].dtype== object):
		print("Column: ",column,"\tCounts:\n", train[column].value_counts(dropna=False))
		print("\n\n")

Column:  travel_companions 	Counts:
 travel_companions
Alone                           4651
With Spouse                     3055
With Other Friends/Relatives    2029
With Spouse and Children         773
NaN                              667
With Children                    330
Name: count, dtype: int64



Column:  days_booked_before_trip 	Counts:
 days_booked_before_trip
61-90     3862
90+       3333
31-60     1499
NaN       1408
15-30      741
8-14       381
1-7        281
Name: count, dtype: int64



Column:  arrival_weather 	Counts:
 arrival_weather
NaN        3094
sunny,     2915
Rainy      1681
cloudy,     957
other       872
Windy,      772
Stormy      713
humid       501
Name: count, dtype: int64



Column:  total_trip_days 	Counts:
 total_trip_days
7-14     3859
30+      3749
1-6      2369
15-30    1157
NaN       371
Name: count, dtype: int64





In [325]:
# i will replace the missing values here with the max occuring class
train["travel_companions"].replace(to_replace=np.nan,value="Alone",inplace=True)
train["days_booked_before_trip"].replace(to_replace=np.nan,value="61-90",inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["travel_companions"].replace(to_replace=np.nan,value="Alone",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["days_booked_before_trip"].replace(to_replace=np.nan,value="61-90",inplace=True)


In [326]:
train.isna().sum()

trip_id                           0
country                           0
age_group                         0
travel_companions                 0
num_females                       0
num_males                         0
main_activity                     0
visit_purpose                     0
is_first_visit                    0
mainland_stay_nights              0
island_stay_nights                0
tour_type                         0
intl_transport_included           0
info_source                       0
accomodation_included             0
food_included                     0
domestic_transport_included       0
sightseeing_included              0
guide_included                    0
insurance_included                0
days_booked_before_trip           0
arrival_weather                3094
total_trip_days                 371
spend_category                    0
dtype: int64

In [327]:
#arrival weather clearly doesn't matter i think
train
train.drop(columns=["arrival_weather"],inplace=True)

In [328]:
train["total_trip_days"].value_counts(dropna=False)

total_trip_days
7-14     3859
30+      3749
1-6      2369
15-30    1157
NaN       371
Name: count, dtype: int64

In [329]:
# train.

In [330]:
train["island_stay_nights"].value_counts()


island_stay_nights
0      7107
7       978
4       658
3       583
5       531
6       390
2       256
8       184
10      155
14      127
9       117
1        99
11       49
12       49
13       39
15       36
20       26
17       17
21       16
16       15
19        9
18        6
60        5
22        5
30        5
28        5
35        5
23        4
25        4
33        2
27        2
40        2
41        2
43        1
48        1
240       1
42        1
75        1
80        1
39        1
55        1
51        1
44        1
31        1
120       1
90        1
61        1
64        1
34        1
32        1
Name: count, dtype: int64

In [331]:
train["mainland_stay_nights"].value_counts()

mainland_stay_nights
0      1496
7      1017
2       807
5       803
4       801
       ... 
130       1
248       1
85        1
182       1
279       1
Name: count, Length: 99, dtype: int64

In [332]:
def infer_trip_days(row):
    if pd.notna(row["total_trip_days"]):
        return row["total_trip_days"]
    
    total = row["mainland_stay_nights"] + row["island_stay_nights"]
    
    if total <= 6:
        return "1-6"
    elif total <= 14:
        return "7-14"
    elif total <= 30:
        return "15-30"
    else:
        return "30+"

train["total_trip_days"] = train.apply(infer_trip_days, axis=1)
test["total_trip_days"] = test.apply(infer_trip_days, axis=1)


In [333]:
train.isna().sum()

trip_id                        0
country                        0
age_group                      0
travel_companions              0
num_females                    0
num_males                      0
main_activity                  0
visit_purpose                  0
is_first_visit                 0
mainland_stay_nights           0
island_stay_nights             0
tour_type                      0
intl_transport_included        0
info_source                    0
accomodation_included          0
food_included                  0
domestic_transport_included    0
sightseeing_included           0
guide_included                 0
insurance_included             0
days_booked_before_trip        0
total_trip_days                0
spend_category                 0
dtype: int64

In [334]:
test.isna().sum()

trip_id                           0
country                         126
age_group                         0
travel_companions               338
num_females                       0
num_males                         2
main_activity                    65
visit_purpose                     0
is_first_visit                   39
mainland_stay_nights              0
island_stay_nights                0
tour_type                         0
intl_transport_included          57
info_source                       0
accomodation_included            62
food_included                    78
domestic_transport_included       0
sightseeing_included              0
guide_included                    0
insurance_included              107
days_booked_before_trip         667
arrival_weather                1593
total_trip_days                   0
has_special_requirements       3726
dtype: int64

In [335]:
train[["mainland_stay_nights","island_stay_nights","total_trip_days"]]

Unnamed: 0,mainland_stay_nights,island_stay_nights,total_trip_days
0,0,7,30+
1,6,0,30+
2,4,2,30+
3,0,7,7-14
4,0,7,7-14
...,...,...,...
12649,7,0,30+
12650,10,0,7-14
12651,9,0,1-6
12652,7,7,7-14


In [336]:
# 1. Drop the same high-null columns
test = test.drop(columns=columns_to_drop)

# 2. Impute the two columns where you manually chose a value
test["travel_companions"].fillna("Alone", inplace=True)
test["days_booked_before_trip"].fillna("61-90", inplace=True)

# 3. Handle the columns where training rows were dropped (<2% null)
for col in rows_to_drop:  # these had <2% null in train
    if train[col].dtype == "object":
        # Fill with training mode
        test[col].fillna(train[col].mode()[0], inplace=True)
    else:
        # Fill with training median
        test[col].fillna(train[col].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["travel_companions"].fillna("Alone", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["days_booked_before_trip"].fillna("61-90", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

KeyError: 'spend_category'

preprocess!!

In [None]:

X = train.drop(columns=["spend_category","trip_id"])
y = train["spend_category"]

X.head()

In [None]:
# ===============================================================
# Identify feature types
# ===============================================================

categorical_cols = [col for col in X.columns if X[col].dtype == "object"]

num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()

print("Categorical:", categorical_cols)
print("Numerical:", num_cols)


In [None]:
# ===============================================================
# Preprocessing pipeline
# ===============================================================

# preprocess = ColumnTransformer(
#     transformers=[
#         ("cat", MultiColumnLabelEncoder(), categorical_cols),
#         ("num", StandardScaler(), num_cols)
#     ],
#     remainder="passthrough"
# )
from sklearn.preprocessing import OneHotEncoder

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", StandardScaler(), num_cols)
    ]
)
preprocess


In [None]:
# ===============================================================
# Split train/val
# ===============================================================

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

X_train.shape, X_val.shape


In [None]:
# ===============================================================
# SVM Pipeline
# ===============================================================

svm_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("svm", SVC(kernel="rbf", C=3, gamma="scale"))
])


In [None]:
# ===============================================================
# Train SVM
# ===============================================================

# svm_pipeline.fit(X_train, y_train)
svm_pipeline.fit(X, y) # using entire dataset this time

svm_pred_val = svm_pipeline.predict(X_val)
svm_acc = accuracy_score(y_val, svm_pred_val)

print("SVM Validation Accuracy:", svm_acc)


In [None]:
# X_train

In [None]:
from sklearn.model_selection import GridSearchCV

# parameters to try
param_grid_svm = {
    "svm__kernel": ["rbf", "linear"],
    "svm__C": [0.1, 1, 3, 10],
    "svm__gamma": ["scale", 0.01, 0.1, 1],
    "svm__class_weight": [None, "balanced"]
}

# GridSearchCV
grid_svm = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid_svm,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Fit
grid_svm.fit(X_train, y_train)

# Best parameters
print("Best SVM params:", grid_svm.best_params_)
print("Best SVM CV score:", grid_svm.best_score_)

# Validation accuracy
svm_val_pred = grid_svm.predict(X_val)
from sklearn.metrics import accuracy_score
print("Validation accuracy:", accuracy_score(y_val, svm_val_pred))


In [None]:
train.isna().sum()

In [None]:
train.columns
test[train.drop(columns=["spend_category"]).columns].isna().sum() # 

In [None]:
# ===============================================================
# SVM Submission
# ===============================================================

svm_test_pred = svm_pipeline.predict(test)

sub_svm = pd.DataFrame({
    "trip_id": test["trip_id"],
    "spend_category": svm_test_pred
})

sub_svm.to_csv("svm_submission.csv", index=False)

sub_svm.head()


In [None]:
# ===============================================================
# Build Keras model for pipeline
# ===============================================================

def create_model():
    model = Sequential([
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(64, activation='relu'),
        BatchNormalization(),

        Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model


In [None]:
# ===============================================================
# Neural Network Pipeline
# ===============================================================

# nn_pipeline = Pipeline(steps=[
#     ("preprocess", preprocess),
#     ("nn", KerasClassifier(
#         build_fn=create_model,
#         epochs=25,
#         batch_size=256,
#         verbose=1
#     ))
# ])
from sklearn.neural_network import MLPClassifier

# Neural network pipeline
nn_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),  # your preprocessing pipeline
    ("nn", MLPClassifier(
        hidden_layer_sizes=(256, 128, 64),
        activation="relu",
        solver="adam",
        max_iter=100,
        random_state=42
    ))
])



In [None]:
# ===============================================================
# Train NN
# ===============================================================

nn_pipeline.fit(X_train, y_train)

nn_pred_val = nn_pipeline.predict(X_val)
nn_acc = accuracy_score(y_val, nn_pred_val)

print("Neural Network Validation Accuracy:", nn_acc)


In [None]:
# ===============================================================
# NN Submission
# ===============================================================

nn_test_pred = nn_pipeline.predict(test)

sub_nn = pd.DataFrame({
    "trip_id": test["trip_id"],
    "spend_category": nn_test_pred
})

sub_nn.to_csv("nn_submission.csv", index=False)

sub_nn.head()
