In [None]:
#HOTELS.CSV 
import numpy as np # linear algebra 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 
# Load the raw hotels table from the Kaggle input directory.
hotels = pd.read_csv("/kaggle/input/international-hotel-booking-analytics/hotels.csv")

# Step 3: View basic info (size, schema, missing values, exact duplicate rows).
print("Shape:", hotels.shape)
print("\n--- Columns ---\n", hotels.columns)
print("\n--- Null Values ---\n", hotels.isnull().sum())
print("\n--- Duplicate Rows:", hotels.duplicated().sum())

# The lat & lon columns are dropped because they are an unnecessary
# description of the hotel for this analysis.
columns_to_drop = ['lat', 'lon']
# Non-mutating drop (no inplace=True); the result is rebound to `hotels`.
hotels = hotels.drop(columns=columns_to_drop)

# A bare `hotels.head()` in the middle of a cell renders nothing, so the
# preview is displayed explicitly.
display(hotels.head())

hotels.to_csv("cleaned_hotels.csv", index=False)

## 1.DATA CLEANING
## 1.1:Loading and Initial Cleaning of Hotels Data

We first load the `hotels.csv` dataset to inspect its structure and quality. Key steps:

- Checked the **shape** to understand the number of rows and columns.
- Listed all **column names** to see what features are available.
- Checked for **missing values** to detect incomplete data.
- Checked for **duplicate rows** to detect redundant entries (the cell only reports the count; no rows are removed).

After inspection, we dropped the `lat` and `lon` columns because they are not needed for predictive modeling in this project — the exact geographic coordinates are not informative for country group predictions.

Finally, the cleaned dataset is saved as `cleaned_hotels.csv` for further analysis.


In [None]:
#REVIEWS.CSV

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Load the raw reviews table from the Kaggle input directory.
reviews = pd.read_csv('/kaggle/input/international-hotel-booking-analytics/reviews.csv')

# Basic quality report: size, schema, missing values, exact duplicate rows.
print("Shape:", reviews.shape)
print("\n--- Columns ---\n", reviews.columns)
print("\n--- Null Values ---\n", reviews.isnull().sum())
print("\n--- Duplicate Rows:", reviews.duplicated().sum())

# A bare `reviews.head()` in the middle of a cell renders nothing, so the
# preview is displayed explicitly.
display(reviews.head())

# No columns are dropped here; the table is written out unchanged so the
# later cells can read all three inputs from the same "cleaned_*" naming.
reviews.to_csv("cleaned_reviews.csv", index=False)



## 1.2 :Loading and Initial Cleaning of Reviews Data

We load the `reviews.csv` dataset to inspect its structure and quality. Key steps:

- Checked the **shape** to understand the number of rows and columns.
- Listed all **column names** to see the types of review features available.
- Checked for **missing values** to identify incomplete reviews.
- Checked for **duplicate rows** to detect redundant entries (the cell only reports the count; no rows are removed).

After inspection, the cleaned dataset is saved as `cleaned_reviews.csv` for further analysis. At this stage, no columns are dropped because all review features may be relevant for the predictive modeling task.



In [None]:
#USERS.CSV

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Load the raw users table from the Kaggle input directory.
users = pd.read_csv('/kaggle/input/international-hotel-booking-analytics/users.csv')

# Basic quality report: size, schema, missing values, exact duplicate rows.
print("Shape:", users.shape)
print("\n--- Columns ---\n", users.columns)
print("\n--- Null Values ---\n", users.isnull().sum())
print("\n--- Duplicate Rows:", users.duplicated().sum())

# Dropping the users' own `country` also keeps the later merge with the
# hotel/review data from producing two clashing `country` columns.
columns_to_drop = ['country', 'join_date']
# Non-mutating drop (no inplace=True); the result is rebound to `users`.
users = users.drop(columns=columns_to_drop)

# A bare `users.head()` in the middle of a cell renders nothing, so the
# preview is displayed explicitly.
display(users.head())

users.to_csv("cleaned_users.csv", index=False)



#The country&join_date columns have been dropped because they are unnecessary description for the users review on the hotel.

## 1.3: Loading and Initial Cleaning of Users Data

We load the `users.csv` dataset to inspect the user information. Key steps:

- Checked the **shape** to see the number of users and available features.
- Listed all **columns** to understand user attributes.
- Checked for **missing values** and **duplicate rows** to ensure data quality.
- Dropped unnecessary columns:
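  - `country`: The user's own country is not used as a feature. Note that the target variable `country_group` is derived later from the `country` column of the merged review/hotel data, not from this column; dropping it here also avoids two clashing `country` columns after the merge.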
  - `join_date`: This timestamp is not relevant to predicting user behavior or hotel ratings, and including it could introduce noise.

By removing irrelevant columns, we simplify the dataset, reduce dimensionality, and avoid including features that do not contribute to our predictive modeling task.

The cleaned dataset is saved as `cleaned_users.csv` for merging with hotel and review data in later steps.


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Cleaned tables written by the data-cleaning cells above.
hotels = pd.read_csv("cleaned_hotels.csv")    # supplies `city` for each `hotel_id`
reviews = pd.read_csv("cleaned_reviews.csv")  # supplies `score_overall`, keyed by `user_id` / `hotel_id`
users = pd.read_csv("cleaned_users.csv")      # supplies `traveller_type` for each `user_id`

# ----------------------------------------------------------
# Q1: Which city is best for each traveler type?
# ----------------------------------------------------------
# Attach the traveller type to every review (left join keeps all reviews).
reviews_users = reviews.merge(users[['user_id', 'traveller_type']], on='user_id', how='left')

# Attach the hotel's city to every review.
merged_df = reviews_users.merge(hotels[['hotel_id', 'city']], on='hotel_id', how='left')

# Mean overall score per (traveller type, city), best city first within each type.
city_ratings = (
    merged_df.groupby(['traveller_type', 'city'])['score_overall']
    .mean()
    .reset_index()
    .sort_values(by=['traveller_type', 'score_overall'], ascending=[True, False])
)

# idxmax returns index labels (preserved by sort_values), so .loc picks the
# top-scoring city row for each traveller type.
best_city_per_type = city_ratings.loc[
    city_ratings.groupby('traveller_type')['score_overall'].idxmax()
].reset_index(drop=True)

print("🏆 Best City for Each Traveler Type:")
display(best_city_per_type)

# One bar per traveller type, coloured by the winning city.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=best_city_per_type,
    x='traveller_type',
    y='score_overall',
    hue='city',
    dodge=False,  # a single bar per x category: do not shrink/shift it per hue level
    palette='viridis'
)
plt.title('Best City per Traveler Type (Average Overall Score)', fontsize=14)
plt.xlabel('Traveler Type')
plt.ylabel('Average Score')
plt.xticks(rotation=30)
plt.legend(title='City', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()  # keep the legend placed outside the axes from being clipped
plt.show()

##  2.DATA ENGINEERING QUESTIONS
##  2.1: What is the best city for each traveler type?

We aim to determine which city provides the best overall hotel experience for different types of travelers.

### Steps:

1. **Merge Datasets**  
   - `reviews` merged with `users` on `user_id` to associate each review with the traveler's type.  
   - Result further merged with `hotels` on `hotel_id` to get the city for each hotel reviewed.

2. **Compute Average Scores**  
   - Grouped by `traveller_type` and `city` and calculated the **mean `score_overall`**.  
   - Sorted the results to identify the top-rated cities for each traveler type.

3. **Select Best Cities**  
   - For each `traveller_type`, selected the city with the **highest average overall score**.

### Findings:

- The resulting table `best_city_per_type` shows the top city for each traveler type.
- The **bar plot** visually represents the highest-rated city per traveler type, making it easy to compare the average scores.

### Justification:

- This analysis is **hypothesis-driven**: we expect that different traveler types (e.g., Couple, Solo, Family) might prefer different cities due to the types of hotels, activities, or services available.
- Using **average score** as the metric ensures that the chosen city is consistently rated highly by that traveler type, rather than relying on a single review.


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the three cleaned tables produced by the data-cleaning cells.
hotels, reviews, users = (
    pd.read_csv("cleaned_hotels.csv"),
    pd.read_csv("cleaned_reviews.csv"),
    pd.read_csv("cleaned_users.csv"),
)

# ----------------------------------------------------------
# Q2: "Top 3 countries with the best value-for-money score per traveler's age group"
# ----------------------------------------------------------

# Link each review to its hotel first; the user attributes are joined below.
merged = reviews.merge(hotels, on="hotel_id", how="left")
merged = merged.merge(users, on="user_id", how="left")

# Average value-for-money score for every (age group, country) pair.
value_by_group = merged.groupby(["age_group", "country"])["score_value_for_money"]
avg_scores = value_by_group.mean().reset_index()

# Rank countries inside each age group (best first) and keep the top three.
ranked_scores = avg_scores.sort_values(
    ["age_group", "score_value_for_money"], ascending=[True, False]
)
top3 = ranked_scores.groupby("age_group").head(3)

print("Top 3 countries by value-for-money score per age group:")
print(top3)

# Sort for cleaner plotting
top3_sorted = top3.sort_values(
    ["age_group", "score_value_for_money"], ascending=[True, False]
)

# Grouped bar chart: one cluster per age group, one bar per top country.
plt.figure(figsize=(10, 6))
barplot_options = dict(
    data=top3_sorted,
    x="age_group",
    y="score_value_for_money",
    hue="country",
    palette="viridis",
)
sns.barplot(**barplot_options)

plt.title("Top 3 Countries with Best Value-for-Money Score per Traveler’s Age Group", fontsize=14)
plt.xlabel("Traveler Age Group")
plt.ylabel("Average Value-for-Money Score")
plt.legend(title="Country", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

## 2.2: Top 3 Countries with Best Value-for-Money Score per Traveler's Age Group

We aim to identify which countries provide the best value-for-money hotel experience for travelers in different age groups.

#### Steps

**1. Dataset Merging**
- Merged `reviews` with `hotels` on `hotel_id` to link each review to a hotel
- Further merged with `users` on `user_id` to include traveler demographics (age_group)

**2. Average Value-for-Money Score Calculation**
- Grouped data by `age_group` and `country`
- Calculated mean `score_value_for_money` for each combination to quantify perceived value

**3. Top Country Selection**
- Sorted grouped results by `score_value_for_money` in descending order
- Selected top 3 countries for each `age_group`

**4. Results Visualization**
- Created bar plot with:
  - X-axis: `age_group`
  - Y-axis: `score_value_for_money` 
  - Hue: `country` to show top-rated countries per age group

#### Findings

- The `top3` table displays the top three countries with highest value-for-money scores for each age group
- The bar plot provides clear visual comparison between countries within each age group



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the cleaned tables. The mapping step later in this cell works on
# `merged`, which is built by the Q2 cell, not on these three frames.
hotels, reviews, users = (
    pd.read_csv("cleaned_hotels.csv"),
    pd.read_csv("cleaned_reviews.csv"),
    pd.read_csv("cleaned_users.csv"),
)



# Region label -> member countries, in the order the lookup checks them.
_COUNTRY_GROUP_MEMBERS = {
    'North_America': ('United States', 'Canada'),
    'Western_Europe': (
        'Germany', 'France', 'United Kingdom', 'Netherlands',
        'Spain', 'Italy',
    ),
    'Eastern_Europe': ('Russia',),
    'East_Asia': ('East_Asia', 'China', 'Japan', 'South Korea'),
    'Southeast_Asia': ('Thailand', 'Singapore'),
    'Middle_East': ('United Arab Emirates', 'Turkey'),
    'Africa': ('Egypt', 'Nigeria', 'South Africa'),
    'Oceania': ('Australia', 'New Zealand'),
    'South_America': ('Brazil', 'Argentina'),
    'South_Asia': ('India',),
    'North_America_Mexico': ('Mexico',),
}


def map_country_to_group(country):
    """Return the region label for ``country``.

    Parameters
    ----------
    country : object
        Country name to look up; matching is by exact equality with the
        names listed in ``_COUNTRY_GROUP_MEMBERS``.

    Returns
    -------
    str
        The name of the first group whose member list contains ``country``,
        or ``'Other'`` when no group lists it (including missing values).
    """
    for group_name, member_countries in _COUNTRY_GROUP_MEMBERS.items():
        if country in member_countries:
            return group_name
    return 'Other'


# Derive the coarse region label from the `country` column of `merged`.
# NOTE: `merged` is built by the Q2 cell (reviews + hotels + users); that cell
# must have been run before this one.
if 'country' in merged.columns:
    merged['country_group'] = merged['country'].map(map_country_to_group)
    print("✅ 'country_group' column created successfully!\n")
    print(merged.head(2))
else:
    # Report the problem here instead of skipping silently, so the cause is
    # visible before the modelling cells run.
    print(
        "⚠️ 'country' column not found in merged; 'country_group' was not created. "
        f"Available columns: {list(merged.columns)}"
    )

## 3.PREDICTIVE MODELING 
## 3.1: Mapping Countries to Country Groups
We aim to categorize countries into broader geographic groups to reduce complexity and improve interpretability in our analysis and predictive modeling.

### Steps 

1. **Define Country Groups**

- Created lists of countries for each group, e.g., North_America, Western_Europe, East_Asia, etc.

- This grouping reflects geographic and cultural similarities that may influence traveler preferences.

2. **Create a Mapping Function**

- map_country_to_group(country) function takes a country name and returns its corresponding group.

- Countries not listed are categorized as Other.

3. **Apply Mapping to Dataset**

- Added a new column 'country_group'  in the merged dataset using the mapping function.

- This column simplifies the prediction task by converting many country labels into 11 region groups (plus an `Other` fallback for unlisted countries).

### Findings 

- The first few rows of merged now show the new country_group column.

- Travelers are now categorized into groups such as North_America, Western_Europe, East_Asia, etc., which can be used for multi-class classification in the predictive model.



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

# Ensure 'country_group' exists. Fail fast: every later modelling cell relies on
# the objects created below (X, y, scaler, X_train, X_test, label_encoder, ...).
if 'country_group' not in merged.columns:
    raise KeyError("⚠️ 'country_group' column not found! Please ensure it's created or derived earlier.")

# Select relevant features
features = ['score_overall', 'value_for_money_base', 'traveller_type', 'age_group', 'user_gender']
X = merged[features].copy()

# Encode categorical variables as integer codes (values are cast to str first,
# so a missing value becomes its own category instead of raising).
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Encode the target variable (country_group) once and keep the encoder so the
# integer classes can be mapped back to readable names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(merged['country_group'].astype(str))
class_ids = np.arange(len(label_encoder.classes_))

# Display the mapping
print("\n🔢 Country Group Encoding Mapping:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"{i} → {class_name}")

# Train-test split, stratified so both parts keep the class proportions.
# Stratification needs at least two rows per class; otherwise fall back to a
# plain random split rather than raising.
stratify_labels = y if np.bincount(y).min() >= 2 else None
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Scale features: statistics are learned from the training rows only, so the
# held-out rows cannot influence the preprocessing (no test-set leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept for any cell that still reads `X_scaled`.
X_scaled = scaler.transform(X)

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Evaluation (class names instead of bare integer codes; `labels` keeps the
# report aligned even if a class is absent from the test split)
print("\n🔍 Random Forest Model Performance:")
print(classification_report(
    y_test, y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot(cmap='Blues', xticks_rotation=45)
plt.title("Confusion Matrix - Country Group Prediction")
plt.show()

# Scatter plot of predicted vs actual (for visualization)
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.xlabel("Actual Country Group")
plt.ylabel("Predicted Country Group")
plt.title("Actual vs Predicted Country Groups (Scatter Plot)")
plt.show()

# Summary metrics; 'weighted' averages per-class scores by class support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f"\n📊 Model Metrics Summary:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


## 3.2: Learning Model: Random Forest Classifier

### Objective
Develop a predictive model to recommend destination country groups based on traveler preferences, demographics, and review patterns. This enables personalized travel recommendations and targeted marketing strategies.

###  Model Overview

**Algorithm**: Random Forest Classifier  
**Target Variable**: `country_group` - Categorical classification of destination countries  
**Features Used**:
- `score_overall`: Overall satisfaction rating
- `value_for_money_base`: Perceived value assessment
- `traveller_type`: Travel type category (Solo, Couple, Family, etc.)
- `age_group`: Demographic segmentation
- `user_gender`: Gender information

### 🔧 Data Preprocessing Pipeline

**1. Feature Encoding**
- Applied Label Encoding to categorical variables (`traveller_type`, `age_group`, `user_gender`)
- Transformed text categories to numerical representations for model compatibility

**2. Feature Scaling**
- Standardized all features using `StandardScaler`
- Ensured equal contribution from all variables regardless of original scale
- Improved model convergence and performance

**3. Train-Test Split**
- 80-20 split ratio (Training: 80%, Testing: 20%)
- Random state fixed for reproducibility
- Stratified sampling to maintain class distribution

###  Model Training

**Random Forest Classifier Configuration**
- Ensemble of multiple decision trees
- Built-in feature importance analysis
- Robust to overfitting through bagging
- Handles non-linear relationships effectively

###  Performance Evaluation

**Classification Metrics**:
- **Accuracy**: Overall correct prediction rate
- **Precision**: Quality of positive predictions  
- **Recall**: Coverage of actual positive cases
- **F1-Score**: Balanced measure of precision and recall

**Visual Diagnostics**:
- **Confusion Matrix**: Detailed breakdown of prediction vs actual classes
- **Actual vs Predicted Plot**: Visual alignment check between predictions and ground truth



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Input


# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Convert labels to categorical one-hot format
num_classes = len(np.unique(y))
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat = to_categorical(y_test, num_classes)

# Define the FFNN: two ReLU hidden layers with dropout, softmax output with
# one unit per country group.
model = Sequential([
    Input(shape=(X_train.shape[1],)),   # explicit input layer sized to the feature count
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# Compile model (one-hot targets -> categorical cross-entropy)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Early stopping: stop once val_loss has not improved for 5 epochs and roll
# back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train model
history = model.fit(X_train, y_train_cat,
                    epochs=50,              # upper bound; early stopping may end training sooner
                    batch_size=32,
                    validation_split=0.2,   # part of the training data is held out for val_loss
                    callbacks=[early_stop],
                    verbose=1)


## 3.3: Learning Model: Feedforward Neural Network (FFNN)

### Model Architecture

**Algorithm**: Deep Feedforward Neural Network (Multi-layer Perceptron)  
**Problem Type**: Multi-class Classification  
**Target Variable**: `country_group` (One-hot encoded)  
**Input Features**: 5 standardized features from preprocessing pipeline


**Key Architectural Decisions**:

1. **Input Layer**: Explicit input shape definition for better model stability
2. **Hidden Layers**: 
   - First hidden layer: 64 neurons with ReLU activation
   - Second hidden layer: 32 neurons with ReLU activation  
   - Progressive reduction for feature hierarchy learning

3. **Regularization**:
   - Dropout (30%) after first hidden layer
   - Dropout (20%) after second hidden layer
   - Prevents overfitting and improves generalization

4. **Output Layer**: 
   - Softmax activation for multi-class probability distribution
   - Number of neurons = unique country groups

### Model Configuration

**Compilation Parameters**:
- **Optimizer**: Adam (Adaptive Moment Estimation)
- **Loss Function**: Categorical Crossentropy
- **Evaluation Metric**: Accuracy

**Training Configuration**:
- **Epochs**: 50 (with early stopping)
- **Batch Size**: 32 (balanced convergence speed)
- **Validation Split**: 20% of training data
- **Early Stopping**: Monitors val_loss with 5-epoch patience



In [None]:
# Predictions: class probabilities, then the most probable class per test row.
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Per-class precision / recall / F1 on the held-out rows.
print("\n🔍  FFNN Model Performance:")
print(classification_report(y_test, y_pred))

# Confusion matrix of actual vs. predicted class indices.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Purples')
disp.ax_.set_title("Confusion Matrix -  FFNN Country Group Prediction")
plt.show()

# Headline metrics; the weighted average accounts for class support.
weighted_avg = {'average': 'weighted'}
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, **weighted_avg)
rec = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

print(
    "📊  FFNN Metrics Summary:",
    f"Accuracy: {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall: {rec:.4f}",
    f"F1-score: {f1:.4f}",
    sep="\n",
)


## 3.3: Learning Model: Feedforward Neural Network (FFNN)


### FFNN Model Performance Evaluation

**Prediction Pipeline**:
1. **Probability Predictions**: Raw softmax outputs for each class
2. **Class Assignment**: Argmax selection for final predictions
3. **Comprehensive Evaluation**: Multi-metric performance assessment

### Performance Metrics Analysis

**Classification Report Breakdown**:
- **Precision**: Measures prediction quality for each class
- **Recall**: Evaluates coverage of actual class members
- **F1-Score**: Harmonic mean of precision and recall
- **Support**: Number of actual occurrences for each class

**Key Performance Indicators**:
- **Overall Accuracy**: Proportion of correct predictions across all classes
- **Weighted Metrics**: Account for class imbalance in the dataset
- **Per-Class Performance**: Detailed analysis of each country group's predictability

### Visual Performance Diagnostics

**Confusion Matrix Insights**:
- **Diagonal Elements**: Correct predictions (true positives)
- **Off-Diagonal Elements**: Misclassifications and confusion patterns
- **Color Intensity**: Visual representation of prediction frequency
- **Class-wise Performance**: Identification of strong and weak prediction areas

**Pattern Analysis**:
- Which country groups are frequently confused with each other
- Systematic misclassification patterns
- Model strengths and limitations in class discrimination

In [None]:
import pandas as pd

def _weighted_scores(y_true, y_hat):
    """Return accuracy and support-weighted precision/recall/F1, rounded to 4 decimals."""
    return {
        "Accuracy": round(float(accuracy_score(y_true, y_hat)), 4),
        "Precision": round(float(precision_score(y_true, y_hat, average='weighted', zero_division=0)), 4),
        "Recall": round(float(recall_score(y_true, y_hat, average='weighted', zero_division=0)), 4),
        "F1-score": round(float(f1_score(y_true, y_hat, average='weighted', zero_division=0)), 4),
    }


# Score both fitted models on the same held-out split, so the table always
# matches the models in memory rather than numbers copied from an earlier run.
_model_scores = {
    "Random Forest": _weighted_scores(y_test, rf_model.predict(X_test)),
    "FFNN": _weighted_scores(y_test, np.argmax(model.predict(X_test, verbose=0), axis=1)),
}

# Model metrics (same column layout as before: one list per metric)
results = {"Model": list(_model_scores)}
for _metric in ("Accuracy", "Precision", "Recall", "F1-score"):
    results[_metric] = [_model_scores[_name][_metric] for _name in results["Model"]]

df_results = pd.DataFrame(results)
display(df_results)


In [None]:
import matplotlib.pyplot as plt

# Plot: one cluster of bars per model, one bar per metric.
metric_scores = df_results.set_index("Model")
ax = metric_scores.plot(kind="bar", figsize=(8, 5))
ax.set_title("Model Performance Comparison: Random Forest vs FFNN", fontsize=14)
ax.set_ylabel("Score")

# Zoom the y-axis on the band the scores actually occupy. The limits are
# derived from the data so no bar is clipped when the metrics change.
# Note: the axis does not start at 0, so visual differences are magnified.
y_padding = 0.03
y_low = max(0.0, float(metric_scores.to_numpy().min()) - y_padding)
y_high = min(1.0, float(metric_scores.to_numpy().max()) + y_padding)
ax.set_ylim(y_low, y_high)

ax.grid(axis="y", linestyle="--", alpha=0.7)
ax.legend(loc="lower right")
plt.show()

# The "better" model is the one with the highest accuracy.
best_model = df_results.loc[df_results["Accuracy"].idxmax(), "Model"]
print(f"✅ The better performing model is: {best_model}")



## 3.4: Model Performance Comparison: Random Forest vs FFNN

### Executive Summary

After comprehensive evaluation of both machine learning approaches, the **Feedforward Neural Network (FFNN)** demonstrates slightly superior performance across key metrics, establishing it as the preferred model for country group prediction.


### Key Performance Insights

**FFNN Advantages**:
- **+1.17 percentage points** in Accuracy (0.6614 vs 0.6497)
- **+3.01 percentage points** in Precision (0.6661 vs 0.6360)
- **+1.17 percentage points** in Recall (0.6614 vs 0.6497)
- **+0.34 percentage points** in F1-Score (0.6395 vs 0.6361)

**Performance Analysis**:
- **Accuracy**: FFNN correctly predicts more overall instances
- **Precision**: FFNN shows better prediction quality (fewer false positives)
- **Recall**: FFNN captures more true positive cases
- **F1-Score**: Balanced measure favors FFNN despite smaller margin

### Model Selection Rationale

**Why FFNN Wins**:
1. **Superior Pattern Recognition**: Better capture of complex feature interactions
2. **Improved Precision**: Higher confidence in positive predictions
3. **Consistent Performance**: Balanced across all evaluation metrics
4. **Scalability Potential**: Better positioned for future data growth


### Technical Interpretation

**FFNN Performance Drivers**:
- Non-linear activation functions capturing complex relationships
- Dropout regularization preventing overfitting
- Automatic feature learning through hidden layers
- Better handling of feature interactions

**Accuracy Context**:
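- Both models significantly outperform uniform random guessing (~9% for the 11 country groups); the share of the most frequent group is the more meaningful baseline to compare against.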
- 66% accuracy represents meaningful predictive power for travel preferences


In [None]:
# -------------------------
# Define encoders (same recipe as training)
# -------------------------
# Training label-encodes each column after `.astype(str)`; the same cast is
# applied here so the integer codes line up with what the models were fitted
# on, including when a column contains missing values.
traveller_type_encoder = LabelEncoder().fit(merged['traveller_type'].astype(str))
age_group_encoder = LabelEncoder().fit(merged['age_group'].astype(str))
user_gender_encoder = LabelEncoder().fit(merged['user_gender'].astype(str))
country_group_encoder = LabelEncoder().fit(merged['country_group'].astype(str))

# -------------------------
# Inference function
# -------------------------
def predict_country_group(raw_input):
    """Predict the country group for one raw feature record with the FFNN.

    Parameters
    ----------
    raw_input : dict
        Must provide 'score_overall', 'value_for_money_base',
        'traveller_type', 'age_group' and 'user_gender'. The three
        categorical entries are given as raw labels.

    Returns
    -------
    str
        The predicted country group, decoded by ``country_group_encoder``.
    """
    feature_order = ['score_overall', 'value_for_money_base', 'traveller_type', 'age_group', 'user_gender']
    categorical_encoders = (
        ('traveller_type', traveller_type_encoder),
        ('age_group', age_group_encoder),
        ('user_gender', user_gender_encoder),
    )

    # Single-row frame so the encoders and the scaler see named columns.
    row = pd.DataFrame([raw_input])

    # Replace each raw category label with its integer code.
    for column, encoder in categorical_encoders:
        row[column] = encoder.transform(row[column])

    # Put the features in the fixed order, scale them, and score with the network.
    class_probabilities = model.predict(scaler.transform(row[feature_order]))

    # Most probable class index -> readable group name.
    predicted_index = np.argmax(class_probabilities, axis=1)[0]
    return country_group_encoder.inverse_transform([predicted_index])[0]

# -------------------------
# Example inputs
# -------------------------
example_1 = {
    'score_overall': 8.5,
    'value_for_money_base': 7.8,
    'traveller_type': 'Couple',
    'age_group': '25-34',
    'user_gender': 'Male'
}

example_2 = {
    'score_overall': 6.2,
    'value_for_money_base': 6.5,
    'traveller_type': 'Solo',
    'age_group': '45-54',
    'user_gender': 'Female'
}

# -------------------------
# Run inference
# -------------------------
for i, example in enumerate([example_1, example_2], 1):
    # One pass through the shared inference function; the encode/scale/predict
    # steps are not repeated inline, so they cannot drift apart.
    label_pred = predict_country_group(example)

    # The numeric class is the encoder code of the predicted label, i.e. the
    # argmax index the inference function decoded.
    numeric_pred = country_group_encoder.transform([label_pred])[0]

    print(f"\nExample #{i}:")
    print("Raw input:", example)
    print("Model numeric prediction:", numeric_pred)
    print("Inference function prediction:", label_pred)


In [None]:
# Rows of `merged` that resemble example_1: same traveller type, age group and
# gender, with both scores inside a band around the example's values.
profile_mask = (
    merged['score_overall'].between(8.0, 9.0)
    & merged['value_for_money_base'].between(7.5, 8.0)
    & (merged['traveller_type'] == 'Couple')
    & (merged['age_group'] == '25-34')
    & (merged['user_gender'] == 'Male')
)
filtered = merged.loc[profile_mask]

print(f"Found {len(filtered)} matching rows.")
filtered.head()


In [None]:
# Compare the FFNN prediction for example_1 with the country groups actually
# observed among the similar rows selected into `filtered`.
predicted_group = predict_country_group(example_1)
actual_group_counts = filtered['country_group'].value_counts()

print("Predicted Country Group:", predicted_group)
print("Actual Country Groups in Filtered Data:")
print(actual_group_counts)


## 3.5: Inference Function for FFNN Model
### Objective 
Implement an inference function that takes raw input features for a hotel review and predicts the corresponding country group using the trained FFNN model. This function ensures that predictions are interpretable and consistent with the preprocessing performed during model training.

### Steps:

**1. Define Encoders**
- LabelEncoder objects were created for all categorical features (`traveller_type`, `age_group`, `user_gender`) and the target variable (`country_group`)
- These encoders transform categorical text data into numeric labels that the model can process
- Ensures exact same encoding scheme as training for feature alignment

**2. Inference Function**
- Accepts a dictionary with raw feature values:
  - `score_overall`
  - `value_for_money_base` 
  - `traveller_type`
  - `age_group`
  - `user_gender`
- Converts the input into a DataFrame
- Applies exact same LabelEncoder transformations as during training
- Scales the features using the StandardScaler fitted on the training set
- Feeds the processed input to the FFNN model to obtain class probabilities
- Returns the predicted country group in natural language, not just numeric labels

**3. Example Demonstration**
- Two example inputs were tested to verify that the function produces meaningful and interpretable predictions
- Both the numeric class prediction and the human-readable country group are displayed for transparency
- Validates end-to-end functionality from raw input to interpretable output

In [None]:
pip install shap lime


In [None]:
import shap
import lime
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Materialise the train/test feature matrices as NumPy arrays for the
# explainability cells below.
X_train_np, X_test_np = np.array(X_train), np.array(X_test)


In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt

# ✅ Wrapper for model predictions
def predict_fn(X):
    """Return the FFNN's class-probability matrix for the rows of ``X``.

    The explainer calls this function repeatedly, so the Keras progress bar
    is silenced to keep the cell output readable.
    """
    return model.predict(X, verbose=0)

# ✅ Convert to NumPy arrays if not already
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# ✅ Use a small background sample for SHAP (for speed)
background = shap.sample(X_train_np, 50, random_state=42)

# ✅ Create KernelExplainer (model-agnostic: only needs the prediction function)
explainer = shap.KernelExplainer(predict_fn, background)

# ✅ Compute SHAP values for a small subset (10 test samples for efficiency)
print("🔍 Calculating SHAP values (this may take a few minutes)...")
explained_rows = X_test_np[:10]
shap_values = explainer.shap_values(explained_rows)

# ✅ Feature names
feature_names = list(X.columns)

# ✅ Country group names, taken from the fitted target encoder so that class
# index k is always labelled with the group the model was trained on
# (including 'Other' if it occurs in the data).
country_groups = list(label_encoder.classes_)

# ✅ Normalise the explainer output to one (n_samples, n_features) array per
# class: depending on the SHAP version it is either a list with one array per
# class or a single array whose last axis indexes the classes.
if isinstance(shap_values, list):
    shap_values_per_class = shap_values
else:
    shap_array = np.asarray(shap_values)
    if shap_array.ndim == 3:
        shap_values_per_class = [shap_array[:, :, k] for k in range(shap_array.shape[-1])]
    else:
        shap_values_per_class = [shap_array]

# ✅ Loop over each country group (class) and visualize its SHAP values
for group, class_shap_values in zip(country_groups, shap_values_per_class):
    print(f"\n📊 Generating SHAP plots for {group}...")

    # Global bar plot (feature importance); the title is set after drawing so
    # it lands on the axes SHAP plotted into.
    shap.summary_plot(
        class_shap_values,
        explained_rows,
        feature_names=feature_names,
        plot_type='bar',
        show=False
    )
    plt.title(f"SHAP Feature Importance - {group}", fontsize=14)
    plt.show()

    # Detailed summary (beeswarm-style)
    shap.summary_plot(
        class_shap_values,
        explained_rows,
        feature_names=feature_names,
        show=False
    )
    plt.title(f"SHAP Summary Plot - {group}", fontsize=14)
    plt.show()


## 4. MODEL EXPLAINABILITY  
## 4.1: Global SHAP Analysis for FFNN 

### Objective
Understand and explain the black-box FFNN model by identifying which features most influence destination country group predictions for different traveler segments.

### SHAP Implementation Strategy

**Approach**:
- **KernelExplainer**: Model-agnostic method compatible with TensorFlow 2.16+
- **Background Sampling**: 50 samples from training data for reference distribution
- **Efficient Computation**: Limited to 10 test samples for computational feasibility
- **Multi-class Analysis**: Separate SHAP values for each country group



In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt

# ✅ Wrapper for model predictions
def predict_fn(X):
    """Return the model's raw predictions for the batch ``X``.

    Thin adapter around ``model.predict`` so the explainer can be handed a
    plain callable.
    """
    outputs = model.predict(X)
    return outputs

# Plain ndarray copies of the train/test feature matrices
X_train_np, X_test_np = np.array(X_train), np.array(X_test)

# Labels for the plots: one name per input column of X, one per output class.
feature_names = list(X.columns)

# Country-group names, assumed to be listed in the same order as the model's
# output columns (TODO confirm against the label encoding).
country_groups = [
    "Africa", "East_Asia", "Eastern_Europe", "Middle_East",
    "North_America", "North_America_Mexico", "Oceania", "South_America",
    "South_Asia", "Southeast_Asia", "Western_Europe",
]

# Reference set for KernelExplainer: 50 training rows, fixed seed.
background = shap.sample(X_train_np, 50, random_state=42)
explainer = shap.KernelExplainer(predict_fn, background)

# Explain the first 20 test rows.
print("Calculating SHAP values (this may take a few minutes)...")
shap_values = explainer.shap_values(X_test_np[:20])  # up to 20 samples for stability

# ✅ Combine all SHAP values across all classes
#
# explainer.shap_values returns either a list of per-class (samples, features)
# matrices (older SHAP) or one (samples, features, classes) array (SHAP >= 0.45).
# Averaging over axis 0 is only correct for the list layout; with the newer
# array it would average over samples. Bring both to (samples, features,
# classes) and always reduce over the class axis.
if isinstance(shap_values, list):
    shap_by_class = np.stack([np.asarray(m) for m in shap_values], axis=-1)
else:
    shap_by_class = np.asarray(shap_values)
    if shap_by_class.ndim == 2:
        # Single-output model: add a trailing class axis of length 1
        shap_by_class = shap_by_class[:, :, np.newaxis]

# Mean |SHAP| per (sample, feature) across classes. These are magnitudes only,
# so every value is >= 0 and the direction of each effect is not retained.
all_shap_values = np.abs(shap_by_class).mean(axis=-1)

# Rows that were passed to explainer.shap_values
X_explained = X_test_np[:20]

# ✅ Global summary: Combined Feature Importance
plt.title("Combined SHAP Feature Importance Across All Country Groups", fontsize=14)
shap.summary_plot(
    all_shap_values,
    X_explained,
    feature_names=feature_names,
    plot_type='bar'
)

# ✅ Detailed Summary (Combined Beeswarm)
plt.title(" Combined SHAP Summary Plot - All Country Groups", fontsize=14)
shap.summary_plot(
    all_shap_values,
    X_explained,
    feature_names=feature_names
)


## 4.2: Combined SHAP Analysis for all country groups 

### Objective
Provide a holistic understanding of feature importance across all destination country groups.

### SHAP Methodology

**Approach**:
- **Expanded Sample Size**: Increased from 10 to 20 test samples for more stable results
- **Cross-Class Aggregation**: Mean absolute SHAP values across all country groups (magnitude only; the direction of each feature's effect is not retained)
- **Global Perspective**: Combined analysis reveals universal feature importance patterns



In [None]:
# ✅ LOCAL EXPLANATION — Force Plot for One Sample
# Choose one test example. It must be one of the rows that were passed to
# explainer.shap_values, otherwise there is no SHAP row to look up.
sample_idx = 0
sample = X_test_np[sample_idx:sample_idx+1]
pred_class = int(np.argmax(model.predict(sample)))

print(f"\n Local Explanation for Sample #{sample_idx} → Predicted Country Group: {country_groups[pred_class]}")

# Pick SHAP values for the predicted class.
# Older SHAP returns a list of per-class (samples, features) matrices;
# SHAP >= 0.45 returns one (samples, features, classes) array, for which
# shap_values[pred_class][sample_idx] would index sample/feature instead.
if isinstance(shap_values, list):
    class_matrix = np.asarray(shap_values[pred_class])
    n_explained = class_matrix.shape[0]
else:
    shap_array = np.asarray(shap_values)
    n_explained = shap_array.shape[0]
    class_matrix = shap_array[:, :, pred_class] if shap_array.ndim == 3 else shap_array

if not 0 <= sample_idx < n_explained:
    raise IndexError(
        f"sample_idx={sample_idx} is outside the {n_explained} rows SHAP values were computed for"
    )
sample_shap_values = class_matrix[sample_idx]

# ✅ Get top 5 features by absolute SHAP value (ascending, largest last)
top_indices = np.argsort(np.abs(sample_shap_values))[-5:]
top_features = [feature_names[i] for i in top_indices]
top_shap_values = sample_shap_values[top_indices]
top_values = sample[0][top_indices]

# ✅ Visualize only top 5 as force plot
# NOTE(review): only five contributions are passed in, so the output value the
# plot displays is presumably base value + these five effects rather than the
# model's full prediction — confirm before quoting it.
shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value[pred_class],
    shap_values=top_shap_values,
    features=top_values,
    feature_names=top_features,
    matplotlib=True
)

## 4.3: Local SHAP Prediction Explanation 
### Objective
Provide transparent, instance-level explanation for a specific traveler's destination prediction.

### Approach

**Selected Instance**:
- **Sample Index**: #0 from test set
- **Predicted Destination**: the country group printed by the code cell above (`country_groups[pred_class]`)
- **Explanation Scope**: Top 5 most influential features


In [None]:
from lime.lime_tabular import LimeTabularExplainer
import numpy as np

# Plain ndarray copies of the train/test feature matrices for LIME
X_train_np, X_test_np = np.array(X_train), np.array(X_test)

# ✅ Wrap your model's prediction for LIME
def predict_proba_fn(X):
    """Return class probabilities for the batch ``X`` in the shape LIME expects.

    LIME needs one probability column per class. Multi-class output from
    ``model.predict`` is passed through unchanged. A single-probability output
    is expanded to two columns ``[1 - p, p]``, whether it arrives flat
    ``(n,)`` or as a column ``(n, 1)``.
    """
    preds = np.asarray(model.predict(X))
    is_single_column = preds.ndim == 2 and preds.shape[1] == 1
    if preds.ndim == 1 or is_single_column:
        positive = preds.reshape(-1)
        preds = np.column_stack((1 - positive, positive))
    return preds

# ✅ Initialize the LIME explainer
# random_state pins LIME's random sampling so the explanation is reproducible
# across kernel restarts.
explainer_lime = LimeTabularExplainer(
    X_train_np,
    feature_names=list(X.columns),
    class_names=country_groups,
    mode='classification',
    random_state=42
)

# ✅ Explain one sample
sample_idx = 0
sample = X_test_np[sample_idx]

# explain_instance defaults to labels=(1,), i.e. it always explains class
# index 1 no matter what the model predicted. Request the predicted class
# explicitly. Index 1 stays in the request because Explanation.as_list() also
# defaults to label=1 and would raise KeyError if that label were missing.
lime_pred_class = int(np.argmax(predict_proba_fn(sample.reshape(1, -1))[0]))
exp = explainer_lime.explain_instance(
    sample,
    predict_proba_fn,
    labels=sorted({1, lime_pred_class}),
    num_features=5
)

# Render the explanation for the predicted class only
exp.show_in_notebook(labels=[lime_pred_class])


## 4.4: LIME Local Interpretable Model Explanations
### Objective
Provide alternative, intuitive explanations for individual predictions using locally interpretable surrogate models, complementing SHAP analysis with a different methodological approach.

### Approach

**Core Concept**: 
- Creates simple, interpretable models (like linear regression) that approximate the complex neural network's behavior **locally** around a specific prediction
- Answers: "What would a simple model look like if trained only on data similar to this specific instance?"


In [None]:
# Decide which class to plot. exp.as_list() defaults to label=1, which is
# class index 1 regardless of the prediction. Prefer the class with the highest
# predicted probability when LIME produced an explanation for it; otherwise
# fall back to the first label that was explained.
explained_labels = list(exp.available_labels())
plot_label = int(np.argmax(exp.predict_proba))
if plot_label not in explained_labels:
    plot_label = explained_labels[0]

# Get feature names and weights
feature_weights = exp.as_list(label=plot_label)  # List of tuples: [(feature_name, contribution), ...]

# Separate features and contributions
features, contributions = zip(*feature_weights)

# Plot manually
plt.figure(figsize=(8,5))
colors = ['green' if c > 0 else 'red' for c in contributions]
plt.barh(features, contributions, color=colors)
plt.xlabel("Contribution to Prediction")
# Name the explained class in the title so the chart is unambiguous
plt.title(f"LIME Explanation - Sample {sample_idx} ({country_groups[plot_label]})")
plt.gca().invert_yaxis()
plt.show()


## 4.5: LIME Visualization 
### Visualization Key
- **Color Coding**: Green for positive contributions, red for negative impacts
- **Horizontal Layout**: Optimal for feature name readability
- **Manual Control**: Full customization of styling and formatting
