In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

warnings.filterwarnings('ignore')

In [None]:
# Loading dataset
# The workbook location defaults to the original author's local path. Set the
# TOURISM_DATASET_PATH environment variable to run the notebook on another
# machine without editing this cell.
DEFAULT_DATASET_PATH = 'C:/Users/Saima Modak/Capstone Projects/Tourism Analysis/Datasets/Final Dataset.xlsx'
dataset_path = os.environ.get('TOURISM_DATASET_PATH', DEFAULT_DATASET_PATH)

data = pd.read_excel(dataset_path)

In [3]:
# Preview the first five rows to sanity-check the loaded columns
data.head(n=5)

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,AttractionId,Rating,RegionId,UserCountryId,User_City_Id,AttractionTypeId,...,user_previous_visits,user_avg_rating_before,user_rating_trend,attraction_previous_visits,attraction_avg_rating_before,attraction_previous_visitors,city_popularity,user_continent,user_attraction_type,attraction_type_season
0,147257,59,2016,7,369,3,20,150,692,13,...,0,3.0,,0,3.0,3,50,59_Europe,59_13,13_Summer
1,146773,90,2016,10,369,5,12,85,2558,13,...,2,5.0,0.0,1,3.0,3,3,90_Asia,90_13,13_Fall
2,144692,98,2017,10,369,3,14,106,3551,13,...,0,3.0,,2,4.0,3,2765,98_Asia,98_13,13_Fall
3,146026,103,2017,4,369,5,4,22,106,13,...,2,4.5,0.0,3,3.666667,3,114,103_Africa,103_13,13_Spring
4,147748,149,2016,3,369,3,15,109,4616,13,...,0,3.0,,4,4.0,3,74,149_Australia & Oceania,149_13,13_Spring


In [4]:
# Model inputs and prediction target.
# The order of `features` is the column order of X, so keep it stable.
# NOTE(review): the importance ranking printed later in this notebook attributes
# roughly 88% of total importance to user_rating_trend and user_avg_rating_before;
# confirm upstream that these are computed only from visits made before the
# rated one (otherwise the target leaks into the inputs).
target = 'Rating'

features = [
    # Visit timing and context
    'VisitMonth',
    'VisitQuarter',
    'VisitMode',
    'AttractionType',
    # User origin
    'Continent',
    'Region',
    'Country',
    'VisitSeason',
    # User history
    'user_previous_visits',
    'user_avg_rating_before',
    'user_rating_trend',
    # Attraction history
    'attraction_previous_visits',
    'attraction_avg_rating_before',
    'attraction_previous_visitors',
    # Location
    'city_popularity',
]

In [None]:
# Keep only rows where every model input and the target are present.
# In the preview above, user_rating_trend is NaN on the rows with
# user_previous_visits == 0, so such rows are removed by this step.
required_columns = [*features, target]
df_clean = data.dropna(subset=required_columns)

In [6]:
# Hold out 20% of the cleaned rows for testing; the fixed seed keeps the split reproducible
X, y = df_clean[features], df_clean[target]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(
    f"Training set shape: {X_train.shape}",
    f"Test set shape: {X_test.shape}",
    sep="\n",
)

Training set shape: (15520, 15)
Test set shape: (3880, 15)


In [None]:
# Encoding categorical variables
categorical_cols = [
    'VisitMode',
    'AttractionType',
    'Continent',
    'Region',
    'Country',
    'VisitSeason',
]

# Work on copies so the raw train/test splits stay untouched
X_train_encoded, X_test_encoded = X_train.copy(), X_test.copy()

# One fitted LabelEncoder per categorical column, keyed by column name
encoders = {}
for col in categorical_cols:
    # Fit on the categories found in both splits so that transforming the
    # test split never encounters a label the encoder has not seen.
    le = LabelEncoder().fit(pd.concat([X_train[col], X_test[col]]).unique())
    encoders[col] = le

    # Replace the raw categories with their integer codes in both splits
    X_train_encoded[col] = le.transform(X_train[col])
    X_test_encoded[col] = le.transform(X_test[col])

In [None]:
# Scaling numerical features
# The order of this list is the column order the scaler is fitted with, and the
# list is saved alongside the scaler at the end of the notebook, so keep it stable.
numerical_features = [
    'user_rating_trend',
    'user_avg_rating_before',
    'attraction_avg_rating_before',
    'attraction_previous_visits',
    'user_previous_visits',
    'city_popularity',
    'VisitMonth',
    'attraction_previous_visitors',
    'VisitQuarter'
]

scaler = StandardScaler()

# Fit on training data only, then transform both train and test.
# The unscaled values are read from X_train / X_test rather than from the
# *_encoded frames that are being overwritten. The encoding step only touches
# the categorical columns, so the numbers are the same on a fresh run, but
# re-running this cell no longer standardises already-standardised data.
X_train_encoded[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test_encoded[numerical_features] = scaler.transform(X_test[numerical_features])

In [9]:
# Feature importance analysis
# Fit an exploratory forest on the encoded/scaled training split and rank every
# input column by its impurity-based importance.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_encoded, y_train)

# Take the names from the frame the forest was fitted on so each importance is
# always paired with the column it was computed for.
feature_importance = pd.DataFrame({
    'feature': X_train_encoded.columns,
    'importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# The header previously said "Top 10" while 15 rows were printed; report the
# real number of rows shown instead.
print(f"\nFeature Importance (all {len(feature_importance)} features):")
print(feature_importance.to_string())


Feature Importance (Top 10):
                         feature  importance
10             user_rating_trend    0.588687
9         user_avg_rating_before    0.289025
12  attraction_avg_rating_before    0.024649
11    attraction_previous_visits    0.020147
8           user_previous_visits    0.016432
14               city_popularity    0.015386
0                     VisitMonth    0.007688
6                        Country    0.007524
2                      VisitMode    0.005528
3                 AttractionType    0.005386
5                         Region    0.004983
13  attraction_previous_visitors    0.004855
7                    VisitSeason    0.003882
4                      Continent    0.003748
1                   VisitQuarter    0.002080


In [10]:
# Columns fed to the final model, listed in the order of the importance ranking
# printed above. All 15 candidate features are retained; nothing is dropped.
# This order becomes the column order of the final training matrix.
important_features = [
    'user_rating_trend', 'user_avg_rating_before',
    'attraction_avg_rating_before', 'attraction_previous_visits',
    'user_previous_visits', 'city_popularity',
    'VisitMonth', 'Country', 'VisitMode', 'AttractionType', 'Region',
    'attraction_previous_visitors',
    'VisitSeason', 'Continent', 'VisitQuarter',
]

In [11]:
# Select (and reorder) the model columns from both encoded/scaled splits
X_train_important, X_test_important = (
    split.loc[:, important_features]
    for split in (X_train_encoded, X_test_encoded)
)

In [None]:
# Final RandomForest configuration (the estimator is only constructed here;
# fitting happens in a later cell). The hyperparameters are hard-coded; no
# tuning run is shown in this notebook.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)

In [13]:
# Fit the final forest on the selected training columns
model.fit(X=X_train_important, y=y_train)

In [None]:
# Predict ratings for the held-out test split
y_pred = model.predict(X=X_test_important)

In [15]:
# Evaluation
# Score the held-out predictions with absolute-error, squared-error and R² metrics.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of MSE, i.e. back in the units of the target
mae = mean_absolute_error(y_test, y_pred)

print(
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

Mean Absolute Error (MAE): 0.1589
Mean Squared Error (MSE): 0.1197
Root Mean Squared Error (RMSE): 0.3460
R² Score: 0.8772


In [None]:
# Saving rating prediction model
# Bundle the fitted estimator with the preprocessing objects and column lists
# used in this notebook so they can be restored together.
rating_model_data = {
    'model': model,                             # final RandomForestRegressor fitted above
    'scaler': scaler,                           # StandardScaler fitted on `numerical_features`
    'encoders': encoders,                       # {column name: fitted LabelEncoder}
    'target_encoder': None,                     # the target is used as-is, so there is nothing to store
    'features': important_features,             # column order the model was trained with
    'numerical_features': numerical_features    # column order the scaler was fitted with
}

# Output location: defaults to the original author's local path. Set the
# TOURISM_RATING_MODEL_PATH environment variable to save somewhere else
# without editing this cell.
file_path = os.environ.get(
    "TOURISM_RATING_MODEL_PATH",
    "C:/Users/Saima Modak/Capstone Projects/Tourism Analysis/Models/Rating Predictor.pkl",
)

# Create the target folder if it does not exist yet; otherwise open() below
# raises FileNotFoundError. A bare file name has no folder part to create.
model_dir = os.path.dirname(file_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Save to the specified location.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted source.
with open(file_path, 'wb') as f:
    pickle.dump(rating_model_data, f)

print(f"Rating prediction model saved to: {file_path}")

Rating prediction model saved to: C:/Users/Saima Modak/Capstone Projects/Tourism Analysis/Models/Rating Predictor.pkl
