In [14]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lime import lime_tabular
from lime.lime_tabular import LimeTabularExplainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [15]:
hotels=pd.read_csv("C:/Users/abdel/ACL/archive/hotels.csv")
reviews=pd.read_csv("C:/Users/abdel/ACL/archive/reviews.csv")
users=pd.read_csv("C:/Users/abdel/ACL/archive/users.csv")
hotels = hotels.rename(columns={'country': 'hotel_country'})
users = users.rename(columns={'country': 'user_country'})

In [16]:
review_hotel_df=reviews.merge(hotels,on='hotel_id',how='left')
df=review_hotel_df.merge(users,on='user_id',how='left')

In [17]:
print(f"Total rows in df (should be ~50,000): {len(df)}")
print("\nNull values *after* merge:")
print(df[['user_gender', 'age_group', 'traveller_type']].isnull().sum())

Total rows in df (should be ~50,000): 50000

Null values *after* merge:
user_gender       0
age_group         0
traveller_type    0
dtype: int64


In [18]:
groups={'North_America':['United States','Canada'],
        'Western_Europe':['Germany','France','United Kingdom','Netherlands','Spain','Italy'],
        'Eastern_Europe':['Russia'],
        'East_Asia':['China','Japan','South Korea'],
        'Southeast_Asia':['Thailand','Singapore'],
        'Middle_East':['United Arab Emirates','Turkey'],
        'Africa':['Egypt','Nigeria','South Africa'],
        'Oceania':['Australia','New Zealand'],
        'South_America':['Brazil','Argentina'],
        'South_Asia':['India'],
        'North_America_Mexico':['Mexico']}

df["country_group"]=df["hotel_country"].apply(lambda x: next((key for key, value in groups.items() if x in value), "Other"))

df[["hotel_country","user_country","country_group"]]

Unnamed: 0,hotel_country,user_country,country_group
0,United States,New Zealand,North_America
1,Japan,United Kingdom,East_Asia
2,Egypt,Spain,Africa
3,Spain,United Kingdom,Western_Europe
4,Russia,Australia,Eastern_Europe
...,...,...,...
49995,Nigeria,Mexico,Africa
49996,Brazil,New Zealand,South_America
49997,Singapore,Russia,Southeast_Asia
49998,Turkey,United Kingdom,Middle_East


In [19]:
df = pd.get_dummies(df, columns=['traveller_type'], drop_first=True)
df = pd.get_dummies(df, columns=['user_country'], drop_first=True)
df = pd.get_dummies(df, columns=['user_gender'], drop_first=True)

age_order = {
    '18-24': 1,
    '25-34': 2,
    '35-44': 3,
    '45-54': 4,
    '55+': 5
}

df['age'] = df['age_group'].map(age_order)
df.drop(columns=['age_group'], inplace=True)

In [20]:

df['diff_cleanliness'] = df['score_cleanliness'] - df['cleanliness_base']
df['diff_comfort'] = df['score_comfort'] - df['comfort_base']
df['diff_facilities'] = df['score_facilities'] - df['facilities_base']
df['diff_overall_vs_star'] = df['score_overall'] - df['star_rating']
df['diff_location'] = df['score_location'] - df['location_base']
df['diff_staff'] = df['score_staff'] - df['staff_base']
df['diff_value_for_money'] = df['score_value_for_money'] - df['value_for_money_base']

print("Successfully created all 'diff' features. ✅")
print("\nHead of the new 'diff' features:")
print(df[[ 'diff_cleanliness', 'diff_comfort', 'diff_facilities','diff_overall_vs_star', 'diff_location', 'diff_staff', 'diff_value_for_money']].head())

Successfully created all 'diff' features. ✅

Head of the new 'diff' features:
   diff_cleanliness  diff_comfort  diff_facilities  diff_overall_vs_star  \
0              -0.5          -0.1             -0.4                   3.7   
1               0.4           0.1             -0.3                   4.1   
2               0.9           0.1             -0.3                   3.8   
3              -0.2          -0.3             -0.4                   3.9   
4              -0.2           0.2              0.3                   4.1   

   diff_location  diff_staff  diff_value_for_money  
0           -0.5         0.2                   0.7  
1            0.1        -0.1                   0.4  
2           -0.4        -0.6                   0.2  
3           -0.1         0.1                  -0.5  
4           -0.7         0.3                   0.3  


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 63 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   review_id                          50000 non-null  int64  
 1   user_id                            50000 non-null  int64  
 2   hotel_id                           50000 non-null  int64  
 3   review_date                        50000 non-null  object 
 4   score_overall                      50000 non-null  float64
 5   score_cleanliness                  50000 non-null  float64
 6   score_comfort                      50000 non-null  float64
 7   score_facilities                   50000 non-null  float64
 8   score_location                     50000 non-null  float64
 9   score_staff                        50000 non-null  float64
 10  score_value_for_money              50000 non-null  float64
 11  review_text                        50000 non-null  obj

In [22]:
X = df[[
    'diff_overall_vs_star',
    'user_country_Australia',
    'user_country_Brazil',
    'user_country_Canada',
    'user_country_China',
    'user_country_Egypt',
    'user_country_France',
    'user_country_Germany',
    'user_country_India',
    'user_country_Italy',
    'user_country_Japan',
    'user_country_Mexico',
    'user_country_Netherlands',
    'user_country_New Zealand',
    'user_country_Nigeria',
    'user_country_Russia',
    'user_country_Singapore',
    'user_country_South Africa',
    'user_country_South Korea',
    'user_country_Spain',
    'user_country_Thailand',
    'user_country_Turkey',
    'user_country_United Arab Emirates',
    'user_country_United Kingdom',
    'user_country_United States',
    'diff_cleanliness',
    'diff_comfort',
    'diff_facilities',
    'diff_location',
    'diff_staff',
    'diff_value_for_money',
    'traveller_type_Couple',
    'traveller_type_Family',
    'traveller_type_Solo',
    'user_gender_Male',
    'user_gender_Other',
    'age'
]]
y = df['country_group']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
log_model = LogisticRegression(
    max_iter=100,
)

In [25]:
log_model.fit(X_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [26]:
y_pred = log_model.predict(X_test)

print("=== Logistic Regression Evaluation ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("\nDetailed Report:\n", classification_report(y_test, y_pred))

=== Logistic Regression Evaluation ===
Accuracy: 0.3201
Precision: 0.24718676472060278
Recall: 0.3201
F1 Score: 0.23324576465275773

Detailed Report:
                       precision    recall  f1-score   support

              Africa       0.33      0.63      0.43      1247
           East_Asia       0.30      0.28      0.29      1195
      Eastern_Europe       0.00      0.00      0.00       358
         Middle_East       0.15      0.01      0.02       816
       North_America       0.15      0.02      0.03       776
North_America_Mexico       0.06      0.03      0.04       400
             Oceania       0.19      0.00      0.01       805
       South_America       0.39      0.14      0.21       813
          South_Asia       0.12      0.01      0.01       389
      Southeast_Asia       0.12      0.06      0.08       818
      Western_Europe       0.35      0.79      0.48      2383

            accuracy                           0.32     10000
           macro avg       0.20      0.18

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [27]:
# Encode string labels into integers
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encode for NN output layer
y_categorical = to_categorical(y_encoded)

In [28]:
print("y_encoded shape:", y_encoded.shape)
print("y_categorical shape:", y_categorical.shape)

y_encoded shape: (50000,)
y_categorical shape: (50000, 11)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

In [30]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
model = Sequential([
    Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')  
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [32]:
print("X_train_scaled shape:", X_train_scaled.shape)
print("y_train shape:", y_train.shape)
print("X_train_scaled shape:", X_train_scaled.shape)
print("y_train shape:", y_train.shape)

X_train_scaled shape: (40000, 37)
y_train shape: (40000, 11)
X_train_scaled shape: (40000, 37)
y_train shape: (40000, 11)


In [33]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_scaled, y_train,
                    epochs=50, batch_size=32,
                    validation_split=0.2, verbose=1)

y_pred_prob = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

print("=== Neural Network Evaluation ===")
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, average='weighted'))
print("Recall:", recall_score(y_true, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_true, y_pred, average='weighted'))
print("\nDetailed Report:\n", classification_report(y_true, y_pred, target_names=le.classes_))

Epoch 1/50
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.2851 - loss: 2.0566 - val_accuracy: 0.3256 - val_loss: 1.8138
Epoch 2/50
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.3514 - loss: 1.7738 - val_accuracy: 0.3746 - val_loss: 1.6715
Epoch 3/50
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3713 - loss: 1.6672 - val_accuracy: 0.3846 - val_loss: 1.5921
Epoch 4/50
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3833 - loss: 1.5984 - val_accuracy: 0.3886 - val_loss: 1.5251
Epoch 5/50
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3906 - loss: 1.5499 - val_accuracy: 0.4024 - val_loss: 1.4774
Epoch 6/50
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.3972 - loss: 1.5101 - val_accuracy: 0.4056 - val_loss: 1.4544
Epoch 7/50
[1m1