In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report, confusion_matrix
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load dataset
# NOTE(review): absolute, machine-specific path; the notebook only runs where this workbook exists.
dataset_path = 'C:/Users/Saima Modak/Capstone Projects/Tourism Analysis/Datasets/Final Dataset.xlsx'
data = pd.read_excel(dataset_path)

In [3]:
# Advanced feature engineering for visit mode prediction
def _add_mode_rate_features(df_enhanced, df, key, prefix):
    """Add, in place, one `{prefix}_{mode}` column per visit mode: that mode's share of rows for each `key` value."""
    mode_share = pd.crosstab(df[key], df['VisitMode'], normalize='index')
    for mode in mode_share.columns:
        # Mapping with the crosstab column looks each key value up by index;
        # key values absent from the table map to NaN.
        df_enhanced[f'{prefix}_{mode}'] = df_enhanced[key].map(mode_share[mode])


def create_advanced_features(df):
    """Create sophisticated features that better predict visit mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``UserId``, ``VisitYear``, ``VisitMonth``,
        ``VisitSeason``, ``VisitMode``, ``Continent``, ``AttractionType`` and
        ``Rating``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus:

        * ``VisitSeason_preferred`` - the user's most frequent season
        * ``user_pct_<mode>`` - share of the user's visits made in each mode
        * ``month_mode_prob_<mode>`` / ``continent_mode_prob_<mode>`` - share of
          each mode among all rows with the same month / continent
        * ``sin_month`` / ``cos_month`` - cyclical encoding of ``VisitMonth``
        * ``user_attraction_compatibility`` - the user's mean rating for the
          row's attraction type
        * ``prev_visit_mode`` - the user's mode on the preceding row (NaN for
          the user's first row)
        * ``user_travel_diversity`` / ``user_attraction_diversity`` - number of
          distinct modes / attraction types per user

        Rows are returned sorted by ``UserId``, ``VisitYear``, ``VisitMonth``;
        the index is not reset after that sort. Missing values in numeric
        columns are replaced by the column mean.

    Notes
    -----
    ``user_pct_*``, ``month_mode_prob_*``, ``continent_mode_prob_*`` and
    ``user_travel_diversity`` are computed from ``VisitMode`` over every row of
    ``df``, including the row they are attached to. When ``VisitMode`` is the
    prediction target, build these from training rows only (or exclude the
    current row); otherwise held-out scores are inflated by target leakage.
    """
    # Work on a copy so the caller's frame is left untouched
    df_enhanced = df.copy()

    # 1. USER BEHAVIOR PATTERNS
    # User's preferred travel season. A stable sort keeps the groupby's
    # (UserId, VisitSeason) ordering among equal counts, so ties are resolved
    # the same way on every run instead of depending on the sort algorithm.
    season_counts = df.groupby(['UserId', 'VisitSeason']).size().reset_index(name='count')
    user_dominant_season = (
        season_counts
        .sort_values('count', ascending=False, kind='mergesort')
        .groupby('UserId')
        .first()
    )
    df_enhanced = df_enhanced.merge(
        user_dominant_season[['VisitSeason']],
        left_on='UserId',
        right_index=True,
        how='left',
        suffixes=('', '_preferred'),
    )

    # User's visit mode percentages. Columns are prefixed BEFORE the merge so
    # they cannot collide with an existing column that happens to share a
    # visit-mode name, and a single merge replaces one merge per mode.
    mode_counts = df.groupby(['UserId', 'VisitMode']).size().unstack(fill_value=0)
    user_visit_mode_pct = mode_counts.div(mode_counts.sum(axis=1), axis=0).add_prefix('user_pct_')
    df_enhanced = df_enhanced.merge(
        user_visit_mode_pct, left_on='UserId', right_index=True, how='left'
    )

    # 2. TEMPORAL PATTERNS
    # Visit mode by month patterns
    _add_mode_rate_features(df_enhanced, df, 'VisitMonth', 'month_mode_prob')

    # Cyclical encoding for months
    month_angle = 2 * np.pi * df_enhanced['VisitMonth'] / 12
    df_enhanced['sin_month'] = np.sin(month_angle)
    df_enhanced['cos_month'] = np.cos(month_angle)

    # 3. GEOGRAPHIC PATTERNS
    # Visit mode by continent patterns
    _add_mode_rate_features(df_enhanced, df, 'Continent', 'continent_mode_prob')

    # 4. INTERACTION FEATURES
    # User-Attraction compatibility scores: mean rating per (user, attraction type)
    user_attraction_compatibility = (
        df.groupby(['UserId', 'AttractionType'])['Rating']
        .mean()
        .reset_index()
        .rename(columns={'Rating': 'user_attraction_compatibility'})
    )
    df_enhanced = df_enhanced.merge(
        user_attraction_compatibility,
        on=['UserId', 'AttractionType'],
        how='left',
    )

    # 5. BEHAVIORAL SEQUENCES
    # Previous visit mode: order each user's rows chronologically, then shift by one
    df_enhanced = df_enhanced.sort_values(['UserId', 'VisitYear', 'VisitMonth'])
    df_enhanced['prev_visit_mode'] = df_enhanced.groupby('UserId')['VisitMode'].shift(1)

    # 6. ADVANCED AGGREGATIONS
    # User travel diversity score
    per_user = df_enhanced.groupby('UserId')
    df_enhanced['user_travel_diversity'] = per_user['VisitMode'].transform('nunique')
    df_enhanced['user_attraction_diversity'] = per_user['AttractionType'].transform('nunique')

    # Fill missing values: every numeric column (original and engineered) gets its own mean
    numeric_columns = df_enhanced.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        df_enhanced[col] = df_enhanced[col].fillna(df_enhanced[col].mean())

    return df_enhanced



In [4]:
# Apply enhanced feature engineering
# `data` itself is left untouched: create_advanced_features works on a copy
# and returns a new frame.
df_enhanced = create_advanced_features(data)

In [5]:
# Inspect the column names available after feature engineering
df_enhanced.columns

Index(['TransactionId', 'UserId', 'VisitYear', 'VisitMonth', 'AttractionId',
       'Rating', 'RegionId', 'UserCountryId', 'User_City_Id',
       'AttractionTypeId', 'Attraction', 'AttractionAddress', 'VisitModeId',
       'VisitMode', 'AttractionType', 'Continent', 'Region', 'ContinentID',
       'Country', 'Country_RegionId', 'CityId', 'CityName',
       'AttractionCountryId', 'Missing_City_ID', 'VisitSeason', 'VisitQuarter',
       'user_previous_visits', 'user_avg_rating_before', 'user_rating_trend',
       'attraction_previous_visits', 'attraction_avg_rating_before',
       'attraction_previous_visitors', 'city_popularity', 'user_continent',
       'user_attraction_type', 'attraction_type_season',
       'VisitSeason_preferred', 'user_pct_Business', 'user_pct_Couples',
       'user_pct_Family', 'user_pct_Friends', 'user_pct_Solo',
       'month_mode_prob_Business', 'month_mode_prob_Couples',
       'month_mode_prob_Family', 'month_mode_prob_Friends',
       'month_mode_prob_Solo',

In [6]:
# Calculate month-mode probabilities
# Each row (month) is normalised to sum to 1 across visit modes.
monthly_mode_dist = pd.crosstab(
    index=df_enhanced['VisitMonth'],
    columns=df_enhanced['VisitMode'],
    normalize='index',
)

# Convert to nested dict: {mode: {month: prob, ...}, ...}
month_mode_probs = {
    mode: probs.to_dict()
    for mode, probs in monthly_mode_dist.items()
}

In [None]:
# Calculate continent-mode probabilities
# Each row (continent) is normalised to sum to 1 across visit modes.
continent_mode_dist = pd.crosstab(
    index=df_enhanced['Continent'],
    columns=df_enhanced['VisitMode'],
    normalize='index',
)

# Convert to nested dict: {mode: {continent: prob, ...}, ...}
continent_mode_probs = {
    mode: probs.to_dict()
    for mode, probs in continent_mode_dist.items()
}

In [8]:
# Define features
# List order is significant: it becomes the column order of the model input.
numerical_features = [
    # calendar fields
    'VisitMonth',
    'VisitQuarter',
    'VisitYear',
    # visit-mode share by continent
    'continent_mode_prob_Business',
    'continent_mode_prob_Couples',
    # visit-mode share per user
    'user_pct_Couples',
    'user_pct_Family',
    'user_pct_Friends',
    'user_pct_Business',
    # user / attraction / city aggregates
    'user_travel_diversity',
    'attraction_avg_rating_before',
    'user_previous_visits',
    'city_popularity',
    'user_avg_rating_before',
    'user_attraction_compatibility',
    # cyclical month encoding
    'sin_month',
    'cos_month',
    # visit-mode share by month
    'month_mode_prob_Business',
    'month_mode_prob_Family',
    'month_mode_prob_Friends',
    'month_mode_prob_Couples',
]
categorical_features = [
    'VisitSeason',
    'Continent',
    'Region',
    'Country',
    'CityName',
    'AttractionType',
    'prev_visit_mode',
]

# Define target variable
target = 'VisitMode'

In [9]:
# Preprocess categorical features: handle NaN and group rare categories
for col in categorical_features:
    # Missing values become their own 'Unknown' level
    filled = df_enhanced[col].fillna('Unknown')
    # Levels observed fewer than 5 times are collapsed into 'Other'
    level_counts = filled.value_counts()
    rare_levels = level_counts.index[(level_counts < 5).to_numpy()]
    df_enhanced[col] = filled.mask(filled.isin(rare_levels), 'Other')

print("Unique prev_visit_mode values after cleaning:", df_enhanced['prev_visit_mode'].unique())

Unique prev_visit_mode values after cleaning: ['Unknown' 'Friends' 'Couples' 'Family' 'Solo' 'Business']


In [10]:
# Split the data into training and test sets (80% train, 20% test, stratified)
feature_columns = numerical_features + categorical_features
X = df_enhanced[feature_columns]
y = df_enhanced[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,       # keep the class proportions of y in both partitions
    random_state=42,  # fixed seed so the split is reproducible
)

In [11]:
# Initialize preprocessors
scaler = StandardScaler()
encoders = {}

# The frames returned by train_test_split are derived from X, so assigning
# columns into them can raise pandas' SettingWithCopyWarning (hidden here by the
# notebook-wide warnings filter). Take explicit copies so the writes below
# unambiguously target independent frames.
X_train = X_train.copy()
X_test = X_test.copy()

# Encode categorical features: one LabelEncoder per column, fitted on the
# training split only and kept in `encoders` for reuse.
for col in categorical_features:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # NOTE: LabelEncoder.transform raises ValueError if the test split contains
    # a category that never appears in the training split.
    X_test[col] = le.transform(X_test[col])
    encoders[col] = le

# Scale numerical features (statistics are learned from the training split only)
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Encode target variable
target_encoder = LabelEncoder()
y_train_encoded = target_encoder.fit_transform(y_train)
y_test_encoded = target_encoder.transform(y_test)

# `encoded` holds the same values as y_test_encoded; the name is kept in case
# other cells refer to it, but the transform is no longer recomputed.
encoded = y_test_encoded.copy()

In [12]:
# Define all features for the model
# Numerical columns first, then categorical ones, in their declared order.
all_features = [*numerical_features, *categorical_features]

In [13]:
# Train the model
rf_params = {
    'n_estimators': 500,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'class_weight': 'balanced',  # reweight classes inversely to their frequency
    'random_state': 42,          # fixed seed for reproducible forests
    'n_jobs': -1,                # use all available cores
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train[all_features], y_train_encoded)

# Predict from the model
y_pred_encoded = model.predict(X_test[all_features])


In [14]:
# Evaluating the model
# Precision, recall and F1 use support-weighted averaging across classes.
y_true, y_hat = y_test_encoded, y_pred_encoded
accuracy = accuracy_score(y_true, y_hat)
precision, recall, f1 = (
    scorer(y_true, y_hat, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("\nRandom Forest Model Performance")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Random Forest Model Performance
Accuracy: 0.9255
Precision: 0.9266
Recall: 0.9255
F1-Score: 0.9258


In [15]:
# Per-class precision / recall / F1, labelled with the original class names
print("\nClassification Report:")
report_text = classification_report(
    y_test_encoded,
    y_pred_encoded,
    target_names=target_encoder.classes_,
)
print(report_text)


Classification Report:
              precision    recall  f1-score   support

    Business       0.70      0.92      0.80       125
     Couples       0.96      0.93      0.94      4324
      Family       0.93      0.94      0.93      3043
     Friends       0.90      0.91      0.91      2189
        Solo       0.88      0.88      0.88       905

    accuracy                           0.93     10586
   macro avg       0.87      0.92      0.89     10586
weighted avg       0.93      0.93      0.93     10586



In [16]:
# Rows are true classes, columns are predicted classes (encoded label order)
print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test_encoded, y_pred_encoded)
print(conf_matrix)


Confusion Matrix:
[[ 115    0    5    3    2]
 [  11 4027  132  114   40]
 [  17   78 2868   55   25]
 [  17   67   68 1993   44]
 [   4   42   20   45  794]]


In [17]:
# Saving model
# Bundle the fitted estimator with the fitted preprocessors, the feature order
# and the month/continent probability lookup tables.
visit_mode_model = dict(
    model=model,
    scaler=scaler,
    encoders=encoders,
    features=all_features,
    target_encoder=target_encoder,
    month_mode_probs=month_mode_probs,
    continent_mode_probs=continent_mode_probs,
)

# NOTE(review): absolute, machine-specific path; the target folder must already exist.
file_path = "C:/Users/Saima Modak/Capstone Projects/Tourism Analysis/Models/Visit Mode Predictor.pkl"

# Save to the specified location
with open(file_path, mode='wb') as model_file:
    pickle.dump(visit_mode_model, model_file)

print(f"Model saved to: {file_path}")

Model saved to: C:/Users/Saima Modak/Capstone Projects/Tourism Analysis/Models/Visit Mode Predictor.pkl
