# Advanced Tourism Spending Classification
## Models: Deep Neural Network (PyTorch/CUDA) & SVM
**Goal:** Predict `spend_category` based on trip details.
**Hardware:** GPU Acceleration enabled.

In [24]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [25]:
# --- LOAD DATA ---
# Replace 'your_dataset.csv' with your actual file path
try:
    df = pd.read_csv('travel/train.csv') 
except FileNotFoundError:
    print("Error: CSV file not found. Please ensure the data is loaded into the variable 'df'.")

# Initial Cleanup: Drop ID and rows where Target is missing
df = df.drop(columns=['trip_id'])
df = df.dropna(subset=['spend_category'])

# Separate Features and Target
X = df.drop('spend_category', axis=1)
y = df['spend_category']

In [33]:
df.head()

Unnamed: 0,country,age_group,travel_companions,num_females,num_males,main_activity,visit_purpose,is_first_visit,mainland_stay_nights,island_stay_nights,...,food_included,domestic_transport_included,sightseeing_included,guide_included,insurance_included,days_booked_before_trip,arrival_weather,total_trip_days,has_special_requirements,spend_category
0,FRANCE,45-64,With Spouse and Children,1.0,2.0,Beach Tourism,Leisure and Holidays,Yes,0,7,...,No,No,No,No,No,,"cloudy,",30+,,1.0
1,KENYA,45-64,Alone,1.0,0.0,Conference Tourism,Meetings and Conference,Yes,6,0,...,No,No,No,No,No,15-30,"sunny,",30+,,2.0
2,SOUTH AFRICA,25-44,With Other Friends/Relatives,2.0,0.0,Cultural Tourism,Meetings and Conference,No,4,2,...,No,No,No,No,No,90+,"sunny,",30+,none,2.0
3,ITALY,25-44,With Spouse,1.0,1.0,Widlife Tourism,Leisure and Holidays,Yes,0,7,...,Yes,Yes,Yes,Yes,No,8-14,,,none,0.0
4,ITALY,25-44,With Spouse,1.0,1.0,Beach Tourism,Leisure and Holidays,Yes,0,7,...,Yes,No,No,No,No,90+,"sunny,",7-14,,0.0


In [26]:
# --- HEAVY PREPROCESSING ---

# 1. Identify Column Types
numeric_features = ['num_females', 'num_males', 'mainland_stay_nights', 'island_stay_nights']
categorical_features = [
    'country', 'age_group', 'travel_companions', 'main_activity', 
    'visit_purpose', 'is_first_visit', 'tour_type', 'intl_transport_included', 
    'info_source', 'accomodation_included', 'food_included', 
    'domestic_transport_included', 'sightseeing_included', 'guide_included', 
    'insurance_included', 'has_special_requirements'
]

# 2. Define Transformers
# Numeric: Impute missing with median -> Scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical: Impute missing with 'Unknown' -> OneHotEncode
# We use 'Unknown' because 'has_special_requirements' has massive missingness likely meaning 'None'
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# 3. Combine into Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 4. Apply Transformations
X_processed = preprocessor.fit_transform(X)

# 5. Encode Target (Ensure categories are 0, 1, 2... for PyTorch)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 6. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.2, random_state=42)

print(f"Processed Feature Shape: {X_train.shape}")
print(f"Target Classes: {label_encoder.classes_}")

Processed Feature Shape: (10096, 195)
Target Classes: [0. 1. 2.]


In [27]:
# --- PYTORCH SETUP (GPU) ---

# Convert to Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

# Create DataLoaders for batching
BATCH_SIZE = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [28]:
# --- MODEL 1: DEEP NEURAL NETWORK ---

class HeavyNet(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(HeavyNet, self).__init__()
        self.layer1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.layer2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.layer3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.layer4 = nn.Linear(128, 64)
        self.output = nn.Linear(64, output_dim)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)
        
    def forward(self, x):
        x = self.relu(self.bn1(self.layer1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.layer2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.layer3(x)))
        x = self.dropout(x)
        x = self.relu(self.layer4(x))
        x = self.output(x)
        return x

input_dim = X_train.shape[1]
output_dim = len(np.unique(y_encoded))

model = HeavyNet(input_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

print(model)

HeavyNet(
  (layer1): Linear(in_features=195, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer4): Linear(in_features=128, out_features=64, bias=True)
  (output): Linear(in_features=64, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
)


In [29]:
# --- TRAINING LOOP (PYTORCH) ---
EPOCHS = 100

loss_history = []

model.train()
for epoch in range(EPOCHS):
    running_loss = 0.0
    correct = 0
    total = 0
    
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    epoch_acc = 100 * correct / total
    loss_history.append(running_loss/len(train_loader))
    if (epoch+1) % 5 == 0:
        print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {running_loss/len(train_loader):.4f}, Acc: {epoch_acc:.2f}%")

Epoch [5/100], Loss: 0.5648, Acc: 76.27%
Epoch [10/100], Loss: 0.5155, Acc: 77.99%
Epoch [15/100], Loss: 0.4760, Acc: 80.21%
Epoch [20/100], Loss: 0.4336, Acc: 81.18%
Epoch [25/100], Loss: 0.4042, Acc: 82.44%
Epoch [30/100], Loss: 0.3861, Acc: 83.38%
Epoch [35/100], Loss: 0.3579, Acc: 84.80%
Epoch [40/100], Loss: 0.3381, Acc: 85.54%
Epoch [45/100], Loss: 0.3146, Acc: 86.16%
Epoch [50/100], Loss: 0.3044, Acc: 86.69%
Epoch [55/100], Loss: 0.2886, Acc: 87.29%
Epoch [60/100], Loss: 0.2742, Acc: 88.58%
Epoch [65/100], Loss: 0.2550, Acc: 89.07%
Epoch [70/100], Loss: 0.2616, Acc: 88.91%
Epoch [75/100], Loss: 0.2450, Acc: 89.78%
Epoch [80/100], Loss: 0.2425, Acc: 89.88%
Epoch [85/100], Loss: 0.2364, Acc: 89.96%
Epoch [90/100], Loss: 0.2166, Acc: 90.89%
Epoch [95/100], Loss: 0.2244, Acc: 90.58%
Epoch [100/100], Loss: 0.2159, Acc: 91.18%


In [30]:
# --- EVALUATION (PYTORCH) ---
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("--- Neural Network Results ---")
print(classification_report(all_labels, all_preds, target_names=[str(c) for c in label_encoder.classes_]))

--- Neural Network Results ---
              precision    recall  f1-score   support

         0.0       0.78      0.84      0.81      1174
         1.0       0.69      0.64      0.67      1076
         2.0       0.47      0.43      0.45       274

    accuracy                           0.71      2524
   macro avg       0.65      0.64      0.64      2524
weighted avg       0.71      0.71      0.71      2524



In [31]:
# --- MODEL 2: SVM (Support Vector Machine) ---
# Note: Sklearn SVM does not run on GPU natively, but we use the preprocessed data.
# We use RBF kernel which handles non-linearity well.

print("Training SVM... (This might take a moment)")
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train, y_train)

svm_preds = svm_model.predict(X_test)

print("--- SVM Results ---")
print(classification_report(y_test, svm_preds, target_names=[str(c) for c in label_encoder.classes_]))

Training SVM... (This might take a moment)
--- SVM Results ---
              precision    recall  f1-score   support

         0.0       0.79      0.86      0.82      1174
         1.0       0.71      0.74      0.72      1076
         2.0       0.78      0.31      0.45       274

    accuracy                           0.75      2524
   macro avg       0.76      0.64      0.66      2524
weighted avg       0.75      0.75      0.74      2524



In [32]:
# 1. Load Test Data
# Ensure this file exists and has the same columns as training data (minus the target)
test_df = pd.read_csv('travel/test.csv') 

# 2. Isolate IDs for the final file
submission_ids = test_df['trip_id']

# 3. Preprocess Test Data
# MUST use .transform() to use the exact same scaling/encoding as the training set
X_test_raw = test_df.drop('trip_id', axis=1)
X_test_processed = preprocessor.transform(X_test_raw)

# --- NEURAL NETWORK PREDICTION ---
model.eval()
X_tensor_sub = torch.tensor(X_test_processed, dtype=torch.float32).to(device)

with torch.no_grad():
    outputs = model(X_tensor_sub)
    _, predicted_indices = torch.max(outputs, 1)
    # Move to CPU and convert to numpy
    nn_preds_indices = predicted_indices.cpu().numpy()

# Convert numeric predictions back to original labels (e.g., 'High', 'Low', etc.)
nn_final_preds = label_encoder.inverse_transform(nn_preds_indices)

# --- SVM PREDICTION ---
svm_preds_indices = svm_model.predict(X_test_processed)
svm_final_preds = label_encoder.inverse_transform(svm_preds_indices)

# --- SAVE OUTPUTS ---
# Save Neural Network Submission
pd.DataFrame({
    'trip_id': submission_ids,
    'spend_category': nn_final_preds
}).to_csv('travel_sub/submission_nn.csv', index=False)

# Save SVM Submission
pd.DataFrame({
    'trip_id': submission_ids,
    'spend_category': svm_final_preds
}).to_csv('travel_sub/submission_svm.csv', index=False)

print("Success: 'submission_nn.csv' and 'submission_svm.csv' generated.")

Success: 'submission_nn.csv' and 'submission_svm.csv' generated.
