In [None]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lime.lime_tabular import LimeTabularExplainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
# Read the three raw tables (hotels, reviews, users) from the local archive folder.
hotels, reviews, users = map(
    pd.read_csv,
    (
        "C:/ACL/archive/hotels.csv",
        "C:/ACL/archive/reviews.csv",
        "C:/ACL/archive/users.csv",
    ),
)

In [None]:
# Data-quality overview: fully duplicated rows first, then per-column missing counts.
tables = {"Hotels": hotels, "Reviews": reviews, "Users": users}

for label, table in tables.items():
    print(f"{label} duplicates: {table.duplicated().sum()}")

for label, table in tables.items():
    print("--------------------------------")
    print(f"{label} nuls: {table.isnull().sum()}")

In [None]:
# Print the schema summary of each raw table, each followed by a separator line.
for table in (reviews, hotels, users):
    table.info()
    print("--------------------------------")

This code merges the reviews, hotels, and users datasets into one DataFrame (df) by performing left joins on the hotel_id and user_id columns to combine review, hotel, and user information.

In [None]:
# Attach hotel attributes, then user attributes, to every review.
# Left joins keep each review row even when no hotel/user record matches.
review_hotel_df = pd.merge(reviews, hotels, how='left', on='hotel_id')
df = pd.merge(review_hotel_df, users, how='left', on='user_id')

After merging the hotels and users tables, the duplicate "country" columns were renamed to improve clarity: country_x (from hotels) became hotel_country, and country_y (from users) became user_country.

In [None]:
# Replace the merge-generated suffixes with names that say whose country it is.
country_column_names = {"country_x": "hotel_country", "country_y": "user_country"}
df = df.rename(columns=country_column_names)

The code assigns each user's country to a geographic region using a dictionary that maps regions to their member countries. It adds a 'country_group' column, labeling countries that match no region as 'Other'.

In [None]:
# Region name -> list of member countries.
groups={'North_America':['United States','Canada'],
        'Western_Europe':['Germany','France','United Kingdom','Netherlands','Spain','Italy'],
        'Eastern_Europe':['Russia'],
        'East_Asia':['China','Japan','South Korea'],
        'Southeast_Asia':['Thailand','Singapore'],
        'Middle_East':['United Arab Emirates','Turkey'],
        'Africa':['Egypt','Nigeria','South Africa'],
        'Oceania':['Australia','New Zealand'],
        'South_America':['Brazil','Argentina'],
        'South_Asia':['India'],
        'North_America_Mexico':['Mexico']}

# Invert the mapping once (country -> region) so each row is a single lookup
# instead of a scan through every region list. setdefault keeps the first
# region that lists a country, matching the original first-match behaviour.
country_to_group = {}
for group_name, member_countries in groups.items():
    for country in member_countries:
        country_to_group.setdefault(country, group_name)

# Countries absent from the lookup (including missing values) map to NaN and
# are then labelled "Other".
df["country_group"] = df["user_country"].map(country_to_group).fillna("Other")

df[["hotel_country","user_country","country_group"]]

In [None]:
# Mean overall score for every (traveller type, city) pair, ordered so the
# highest-scoring city comes first within each traveller type.
city_scores = (
    df.groupby(['traveller_type', 'city'], as_index=False)['score_overall']
    .mean()
    .sort_values(['traveller_type', 'score_overall'], ascending=[True, False])
)

# With that ordering, the first row seen for each traveller type is its best city.
best_cities = city_scores.drop_duplicates(subset='traveller_type', keep='first')

# display(city_scores)


In [None]:

# One horizontal-bar panel per traveller type, with that type's best city in green.
traveller_types = city_scores['traveller_type'].unique()

# Size the grid from the data instead of assuming exactly four traveller types:
# two panels per row, as many rows as needed (four types -> the original 2x2, 16x10).
n_cols = 2
n_rows = max(1, -(-len(traveller_types) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, traveller in enumerate(traveller_types):
    ax = axes[idx]

    # Rows for this traveller type, and its single best (city, score) record
    data = city_scores[city_scores['traveller_type'] == traveller]
    best_row = best_cities[best_cities['traveller_type'] == traveller].iloc[0]
    best_city = best_row['city']
    best_score = best_row['score_overall']

    # Highlight the best city in green, every other city in blue
    colors = ['#2ECC71' if city == best_city else '#3498DB' for city in data['city']]

    ax.barh(data['city'], data['score_overall'], color=colors, edgecolor='black', linewidth=1)
    ax.set_xlabel('Average Overall Score', fontsize=11, fontweight='bold')
    ax.set_ylabel('City', fontsize=11, fontweight='bold')
    ax.set_title(f'{traveller} Travellers', fontsize=13, fontweight='bold')
    ax.set_xlim(0, 10)
    ax.grid(axis='x', alpha=0.3, linestyle='--')

    # Numeric label just past the end of each bar (bar i sits at y-position i)
    for i, score in enumerate(data['score_overall']):
        ax.text(score + 0.1, i, f'{score:.2f}', va='center', fontsize=9, fontweight='bold')

    # Call-out box in the panel's top-right corner naming the best city
    ax.text(0.98, 0.98, f'🏆 Best: {best_city}\nScore: {best_score:.2f}',
            transform=ax.transAxes, fontsize=10, ha='right', va='top',
            bbox=dict(boxstyle='round', facecolor='#2ECC71', alpha=0.3, edgecolor='black'))

# Hide any panel left unused when the number of traveller types does not fill the grid
for unused_ax in axes[len(traveller_types):]:
    unused_ax.set_visible(False)

plt.suptitle('Best City for Each Traveller Type Based on Overall Score',
             fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(14, 8))

# Flatten the per-traveller city scores into parallel lists, one entry per bar.
traveller_list, city_list, score_list, color_list = [], [], [], []

for traveller in traveller_types:
    data = city_scores[city_scores['traveller_type'] == traveller]
    best_city = best_cities[best_cities['traveller_type'] == traveller]['city'].values[0]

    for city, score in zip(data['city'], data['score_overall']):
        traveller_list.append(traveller)
        city_list.append(city)
        score_list.append(score)
        # Green marks the best city of this traveller type, grey everything else
        color_list.append('#2ECC71' if city == best_city else '#95A5A6')

# Two-line tick labels: traveller type above, city below
x_labels = [f"{t}\n{c}" for t, c in zip(traveller_list, city_list)]
x_pos = range(len(x_labels))

plt.bar(x_pos, score_list, color=color_list, edgecolor='black', linewidth=1)
plt.title(
    'Overall Scores by Traveller Type and City\n(Green = Best City for Each Traveller Type)',
    fontsize=14, fontweight='bold', pad=20,
)
plt.xlabel('Traveller Type - City', fontsize=12, fontweight='bold')
plt.ylabel('Average Overall Score', fontsize=12, fontweight='bold')
plt.xticks(x_pos, x_labels, rotation=45, ha='right', fontsize=8)
plt.ylim(0, 10)
plt.grid(axis='y', alpha=0.3, linestyle='--')
plt.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(12, 7))

# One bar per traveller type, each in its own colour
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']
bar_style = dict(color=colors, edgecolor='black', linewidth=1.5, alpha=0.85, width=0.6)
bars = plt.bar(best_cities['traveller_type'], best_cities['score_overall'], **bar_style)

# Axis labels, title, and scale
plt.xlabel('Traveller Type', fontsize=13, fontweight='bold')
plt.ylabel('Average Overall Score', fontsize=13, fontweight='bold')
plt.title('Best City Recommendation for Each Traveller Type',
          fontsize=15, fontweight='bold', pad=20)
plt.ylim(0, 10)
plt.grid(axis='y', alpha=0.3, linestyle='--')

# City name above each bar, numeric score just inside its top edge
for i, (city, score) in enumerate(zip(best_cities['city'], best_cities['score_overall'])):
    plt.text(i, score + 0.15, f'🏆 {city}',
             ha='center', va='bottom', fontsize=11, fontweight='bold')
    plt.text(i, score - 0.4, f'{score:.2f}',
             ha='center', va='top', fontsize=10, fontweight='bold', color='white')

plt.xticks(rotation=0, fontsize=11)
plt.tight_layout()
plt.show()

# Plain-text version of the same recommendations
print("\n" + "="*60)
print("📊 BEST CITY RECOMMENDATIONS")
print("="*60)
for traveller, city, score in zip(best_cities['traveller_type'],
                                  best_cities['city'],
                                  best_cities['score_overall']):
    print(f"{traveller:15} → {city:20} (Score: {score:.2f})")

In [None]:
# One-hot encode the nominal columns in a single pass; the first level of each
# column is dropped and acts as the baseline category.
df = pd.get_dummies(df, columns=['traveller_type', 'city', 'user_gender'], drop_first=True)

# Age brackets are ordered, so encode them as increasing integers rather than dummies.
age_order = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55+': 5}
df['age'] = df['age_group'].map(age_order)
df = df.drop(columns=['age_group'])

In [None]:

# Columns left out of the modelling table: identifiers, dates, free text, hotel
# name/location fields, and the raw countries. `user_country` is the column the
# `country_group` target was derived from.
columns_to_drop = [
    'review_id',
    'user_id',
    'hotel_id',
    'review_date',
    'join_date',
    'user_country',
    'review_text',
    'hotel_name',
    'hotel_country',
    'lat',
    'lon'
]
final_df=df.drop(columns=columns_to_drop)

final_df.head()


In [None]:
# Missing values per column in the modelling table.
final_df.isna().sum()

In [None]:
# NOTE(review): no cell visible in this notebook creates columns named
# "age_encoded" or "ci", so this rename looks like a leftover no-op (rename
# ignores labels that are not present) — confirm before removing it.
final_df = final_df.rename(columns={"age_encoded":"age","ci":"user_country"})

In [None]:
# Persist the cleaned modelling table; the row index is not written.
final_df.to_csv(path_or_buf='final_dataset.csv', index=False)

`final_df` is the cleaned dataset used for modelling.

In [None]:
# Model inputs, grouped by kind; the target is the reviewer's region.
feature_columns = [
    # review scores
    'score_overall', 'score_cleanliness', 'score_comfort', 'score_facilities',
    'score_location', 'score_staff', 'score_value_for_money',
    # hotel attributes
    'star_rating', 'cleanliness_base', 'comfort_base', 'facilities_base',
    'location_base', 'staff_base', 'value_for_money_base',
    # traveller-type dummies
    'traveller_type_Couple', 'traveller_type_Family', 'traveller_type_Solo',
    # city dummies
    'city_Bangkok', 'city_Barcelona', 'city_Berlin', 'city_Buenos Aires',
    'city_Cairo', 'city_Cape Town', 'city_Dubai', 'city_Istanbul', 'city_Lagos',
    'city_London', 'city_Mexico City', 'city_Moscow', 'city_Mumbai',
    'city_New York', 'city_Paris', 'city_Rio de Janeiro', 'city_Rome',
    'city_Seoul', 'city_Shanghai', 'city_Singapore', 'city_Sydney',
    'city_Tokyo', 'city_Toronto', 'city_Wellington',
    # user attributes
    'user_gender_Male', 'user_gender_Other', 'age',
]
X = final_df[feature_columns]
y = final_df['country_group']

In [None]:
# Class balance of the target: number of reviews per country group.
y.value_counts().plot.bar(title='Country Group Distribution')
plt.show()

In [None]:
# Hold out 20% for testing. Stratify on the target so every country group keeps
# its proportion in both splits, consistent with the stratified split used later
# for the neural network.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Logistic-regression classifier with the solver iteration cap set to 1000.
log_model = LogisticRegression(max_iter=1000)

In [None]:
# Train on the training split (features are passed unscaled here).
log_model.fit(X=X_train, y=y_train)

In [None]:
y_pred = log_model.predict(X_test)

# Precision / recall / F1 are averaged across classes, weighted by class support.
weighted_metrics = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

print("=== Logistic Regression Evaluation ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
for label, metric in weighted_metrics:
    print(label, metric(y_test, y_pred, average='weighted'))
print("\nDetailed Report:\n", classification_report(y_test, y_pred))

In [None]:
# Print the column dtypes and non-null counts of the cleaned modelling table.
final_df.info()

In [None]:
# Global explanation of the logistic-regression model with SHAP.
explainer = shap.LinearExplainer(log_model, X_train)

shap_values = explainer.shap_values(X_test)

# A multi-class model yields one attribution matrix per class. Depending on the
# installed SHAP release this arrives either as a list of (n_samples, n_features)
# arrays or as one (n_samples, n_features, n_classes) array. Reduce both layouts
# to a single class so summary_plot always receives a 2-D matrix.
class_idx = 1  # the class the list-based selection has always used
if isinstance(shap_values, list):
    shap_values = shap_values[class_idx] if len(shap_values) > 1 else shap_values[0]
else:
    shap_values = np.asarray(shap_values)
    if shap_values.ndim == 3:
        n_outputs = shap_values.shape[-1]
        shap_values = shap_values[:, :, class_idx if n_outputs > 1 else 0]

# Force a plain float matrix for plotting
shap_values = np.array(shap_values, dtype=np.float64)

# Bar chart of mean |SHAP| per feature, then the per-sample summary plot
shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=X_test.shape[1])
shap.summary_plot(shap_values, X_test, show=True)

In [None]:
# shap.initjs()

# # Use only a sample of X_train to make SHAP faster
# X_train_sample = X_train.sample(300, random_state=42)

# # Initialize SHAP LinearExplainer for Logistic Regression
# explainer = shap.LinearExplainer(log_model, X_train_sample)

# # Compute SHAP values
# shap_values = explainer.shap_values(X_test)

# # Handle cases where shap_values is a list (multi-class models)
# if isinstance(shap_values, list):
#     shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]

# # Convert to numpy array safely
# shap_values = np.array(shap_values, dtype=np.float64)

# # Pick one test example (e.g., index 5)
# sample_idx = 5

# # Select that row
# X_sample = X_test.iloc[[sample_idx]]

# # Compute SHAP values for this instance
# shap_values_sample = explainer.shap_values(X_sample)

# # Handle multi-class
# if isinstance(shap_values_sample, list):
#     shap_values_sample = shap_values_sample[1] if len(shap_values_sample) > 1 else shap_values_sample[0]

# # Local force plot for one prediction
# shap.force_plot(
#     explainer.expected_value, 
#     shap_values_sample, 
#     X_sample
# )

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Ensure consistent random state
np.random.seed(42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# LIME operates on plain numeric arrays and indexes rows by position. Cast the
# training matrix to float once so dummy columns (which may be bool) and numeric
# columns share one dtype instead of producing an object array.
X_train_values = X_train.to_numpy(dtype=float)

# ---- Create LIME Explainer ----
explainer = LimeTabularExplainer(
    training_data=X_train_values,
    feature_names=list(X_train.columns),
    class_names=log_model.classes_.astype(str),
    mode='classification',
    discretize_continuous=True,
    random_state=42,  # make the perturbation sampling repeatable
)

# ---- Pick one instance ----
sample_idx = 39
sample = X_test.iloc[sample_idx]
# Positional float vector for LIME; a label-indexed Series is not safe to index by position
sample_values = sample.to_numpy(dtype=float)

# Model prediction for this instance, as a label and as its index in classes_
pred_label = log_model.predict(sample_values.reshape(1, -1))[0]      # e.g. "Western_Europe"
pred_class_idx = np.where(log_model.classes_ == pred_label)[0][0]    # convert to numeric index
pred_class_name = log_model.classes_[pred_class_idx]

print(f"Predicted class: {pred_class_name}")

# ---- Explain ONLY the predicted class ----
exp = explainer.explain_instance(
    data_row=sample_values,
    predict_fn=log_model.predict_proba,
    num_features=10,
    labels=[pred_class_idx]   # LIME addresses classes by numeric index
)

# ---- Visualize ----
exp.show_in_notebook(show_table=True, labels=[pred_class_idx])

# ---- Text summary ----
print(f"\nLIME Explanation for predicted class '{pred_class_name}':")
for feature, weight in exp.as_list(label=pred_class_idx):
    print(f"{feature}: {weight:.4f}")

In [None]:
# Map the string country groups to integer class ids
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Expand the ids to one-hot rows for the softmax output layer
y_categorical = to_categorical(y_encoded)

In [None]:
# Sanity-check the label arrays before splitting.
for label, array in (("y_encoded shape:", y_encoded), ("y_categorical shape:", y_categorical)):
    print(label, array.shape)

In [None]:
# 80/20 split for the neural network, stratified on the integer class ids so each
# country group keeps its proportion in both splits.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, **split_options)

In [None]:
# Refit the scaler on the new training split, then transform both splits with it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Seed TensorFlow so weight initialisation (and other TF randomness such as
# dropout) is repeatable across Restart & Run All on the same setup.
tf.random.set_seed(42)

n_features = X_train_scaled.shape[1]
n_classes = y_train.shape[1]  # number of country groups (one-hot width)

# Feed-forward classifier: two ReLU hidden layers with dropout after the first,
# and a softmax output with one unit per country group. The input width is
# declared with an explicit Input object rather than `input_dim` on the first
# Dense layer, which newer Keras releases warn about.
model = Sequential([
    tf.keras.Input(shape=(n_features,)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

In [None]:
#test model(no change)
# num_features = X_train_scaled.shape[1]
# num_classes = y_train.shape[1]  

# model = Sequential([
#     Dense(128, activation='relu', input_shape=(num_features,)),
#     Dropout(0.3),
#     Dense(64, activation='relu'),
#     Dropout(0.3),
#     Dense(num_classes, activation='softmax')
# ])


In [None]:
# Confirm the network's input and target shapes line up before training.
for label, array in (("X_train_scaled shape:", X_train_scaled), ("y_train shape:", y_train)):
    print(label, array.shape)

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 50 epochs, holding back 20% of the training split for validation.
history = model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    verbose=1,
)

# Collapse softmax probabilities and one-hot targets back to integer class ids.
y_pred_prob = model.predict(X_test_scaled)
y_pred = y_pred_prob.argmax(axis=1)
y_true = y_test.argmax(axis=1)

weighted_metrics = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

print("=== Neural Network Evaluation ===")
print("Accuracy:", accuracy_score(y_true, y_pred))
for label, metric in weighted_metrics:
    print(label, metric(y_true, y_pred, average='weighted'))
print("\nDetailed Report:\n", classification_report(y_true, y_pred, target_names=le.classes_))


In [None]:
# Training vs validation accuracy per epoch, to spot over- or under-fitting.
plt.figure(figsize=(10,5))
curves = (('accuracy', 'Training Accuracy'), ('val_accuracy', 'Validation Accuracy'))
for history_key, curve_label in curves:
    plt.plot(history.history[history_key], label=curve_label, linewidth=2)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training vs Validation Accuracy')
plt.grid(alpha=0.3)
plt.legend()
plt.show()

In [None]:
# Keep only the first 300 scaled test rows for the SHAP analysis below.
shap_sample_size = 300
X_sample = X_test_scaled[:shap_sample_size]

In [None]:
def predict_fn(data):
    """Return the network's per-class probabilities for ``data``, without progress output."""
    probabilities = model.predict(data, verbose=0)
    return probabilities


# KernelExplainer needs a background set; draw 100 scaled training rows with a fixed seed.
background = shap.sample(X_train_scaled, nsamples=100, random_state=42)
explainer = shap.KernelExplainer(model=predict_fn, data=background)

In [None]:
# Compute SHAP values for a smaller batch of *test* rows. These must be the same
# rows that are later passed to summary_plot as feature values (X_sample[:100]);
# explaining the background rows instead would pair each attribution with the
# feature values of an unrelated sample.
shap_values = explainer.shap_values(X_sample[:100])

In [None]:
# Global feature importance as a bar chart. show=False keeps the figure open so
# a title can be added before it is rendered.
bar_plot_options = dict(feature_names=X.columns, plot_type="bar", show=False)
shap.summary_plot(shap_values, X_sample[:100], **bar_plot_options)

plt.title("Global Feature Importance (SHAP Summary)", fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Default SHAP summary plot for the same 100 explained rows.
shap.summary_plot(shap_values, features=X_sample[:100], feature_names=X.columns)