## HIERARCHICAL SEGMENTATION APPROACH ##          

In [0]:
# 1.1 Import Required Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import functions as F
import psycopg2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")


In [0]:
# CSV holding the per-user feature table produced by the previous step.
INPUT_FILENAME = 'traveltide_final_features.csv'

# Read the features straight from disk. No try/except on purpose: a missing
# file surfaces as the built-in FileNotFoundError and halts the run.
eda_filtered_users_pd = pd.read_csv(INPUT_FILENAME)

# Confirm the load and report the (rows, columns) shape of the table.
print(
    "Data successfully loaded from '{}'. Shape: {}".format(
        INPUT_FILENAME, eda_filtered_users_pd.shape
    )
)



Data successfully loaded from 'traveltide_final_features.csv'. Shape: (5998, 34)


In [0]:
# Display the column index of the loaded feature table so the fields used by
# the segmentation rules can be checked by eye.
eda_filtered_users_pd.columns

Index(['user_id', 'total_sessions', 'total_page_clicks', 'avg_page_clicks',
       'total_flights_booked', 'total_hotels_booked', 'total_cancellations',
       'total_joint_bookings', 'trips_with_data', 'avg_flight_discount_usd',
       'avg_hotel_discount_usd', 'avg_flight_fare_usd', 'avg_seats',
       'avg_checked_bags', 'avg_hotel_price_usd', 'avg_hotel_nights',
       'avg_hotel_total_spend_usd', 'unique_airlines_booked', 'user_birthdate',
       'user_age', 'user_age_group', 'user_gender', 'is_married',
       'has_children', 'user_home_country', 'user_home_city',
       'user_home_airport', 'home_airport_lat', 'home_airport_lon',
       'sign_up_date', 'age_group', 'spend_segment', 'weekday_travel_ratio',
       'traveller_type'],
      dtype='object')

In [0]:
# Customer Segmentation

# --- 1. CONFIGURATION AND CONSTANTS ---
# This section defines the key data-derived parameters that control the segmentation.
# They are read as module-level globals by assign_final_segment_v4.

# Inclusive lower bound on `total_page_clicks` for a user to count as highly engaged.
HIGH_ENGAGEMENT_CLICKS = 22.0
# A cancellations-per-booking ratio strictly greater than this marks a user as high risk.
CANCELLATION_RISK_THRESHOLD = 0.10
# `user_age_group` label that maps to the "Core (35-49)" segment names.
CORE_VOLUME_AGE_GROUP = '35-49'
# `user_age_group` labels that map to the "Other (Non-Core Ages)" segment names.
DYNAMIC_AGE_GROUPS = ['25-34', '50-64', '<25', '65+']
# `traveller_type` value that routes a non-Premium user to the 'Family Market' segment.
FAMILY_TRAVELLER_TYPE = 'Family'
# NOTE(review): not referenced by any code visible in this notebook - confirm it is still needed.
PRIMARY_GENDER = 'Female'

# --- 2. CORE SEGMENTATION LOGIC ---

def assign_final_segment_v4(row: pd.Series) -> str:
    """
    Map one user record to its final segment label (first matching rule wins).

    Rules are evaluated in this order:
      1. 'Premium' spenders, split by age band and engagement.
      2. Family travellers (any spend tier other than 'Premium').
      3. 'Mid-Range' spenders, split by age band and engagement.
      4. 'Budget' spenders, split by engagement only.
      5. Users whose cancellation rate exceeds CANCELLATION_RISK_THRESHOLD.
    A record matching none of the rules is labelled
    'Unclassified: Data Integrity Check'.

    Args:
        row: A record providing 'total_flights_booked', 'total_hotels_booked',
            'total_cancellations', 'total_page_clicks', 'traveller_type',
            'spend_segment' and 'user_age_group'.

    Returns:
        The segment label.
    """
    # Cancellation rate = cancellations per booking (flights + hotels);
    # users with no bookings get a rate of 0 instead of dividing by zero.
    booking_count = row['total_flights_booked'] + row['total_hotels_booked']
    if booking_count > 0:
        cancel_share = row['total_cancellations'] / booking_count
    else:
        cancel_share = 0

    # NOTE(review): the threshold is compared against *total* page clicks, and
    # the summary printed by this notebook contains no non-engaged segment -
    # confirm the threshold was not meant for 'avg_page_clicks'.
    engaged = row['total_page_clicks'] >= HIGH_ENGAGEMENT_CLICKS
    family_traveller = row['traveller_type'] == FAMILY_TRAVELLER_TYPE
    risky = cancel_share > CANCELLATION_RISK_THRESHOLD

    spend_tier = row['spend_segment']
    age_band = row['user_age_group']

    # Label sets, ordered as:
    # (core + engaged, core, other ages + engaged, other ages, unknown age band).
    premium_labels = (
        'Premium High Engaged Core (35-49)',
        'Premium Core (35-49)',
        'Premium High Engaged Other (Non-Core Ages)',
        'Premium Other (Non-Core Ages)',
        'Premium: Unspecified Market',
    )
    mid_range_labels = (
        'Mid-Ranged High Engaged Core (35-49)',
        'Mid-Ranged Core (35-49)',
        'Mid-Ranged High Engaged Other  (Non-Core Ages)',
        'Mid-Ranged Other (Non-Core Ages)',
        'Volume: Unspecified Market',
    )

    def _label_by_age_and_engagement(labels):
        # Pick the label for this row's age band and engagement; the core age
        # band takes precedence over the non-core list.
        core_engaged, core_plain, other_engaged, other_plain, unknown_age = labels
        if age_band == CORE_VOLUME_AGE_GROUP:
            return core_engaged if engaged else core_plain
        if age_band in DYNAMIC_AGE_GROUPS:
            return other_engaged if engaged else other_plain
        return unknown_age

    # 1. Premium spenders are classified first, even when they are family travellers.
    if spend_tier == 'Premium':
        return _label_by_age_and_engagement(premium_labels)

    # 2. Remaining family travellers form a single segment.
    if family_traveller:
        return 'Family Market'

    # 3. Mid-range spenders.
    if spend_tier == 'Mid-Range':
        return _label_by_age_and_engagement(mid_range_labels)

    # 4. Budget spenders are split on engagement only.
    if spend_tier == 'Budget':
        return 'Price Sensitive Buyer' if engaged else 'Activation: Dormant User'

    # 5. Cancellation risk.
    # NOTE(review): only reachable when 'spend_segment' is none of
    # 'Premium' / 'Mid-Range' / 'Budget' and the user is not a family traveller.
    if risky:
        return 'At Risk Group'

    return 'Unclassified: Data Integrity Check'

def run_segmentation(eda_filtered_users_pd: pd.DataFrame):
    """
    Label every user with a final segment and build a per-segment summary.

    Args:
        eda_filtered_users_pd: User-level feature table. It is copied, so the
            caller's frame is left unmodified.

    Returns:
        A 2-tuple ``(df_segmented, segment_summary)``:
          * ``df_segmented`` - copy of the input with an added 'Final_Segment'
            column produced by ``assign_final_segment_v4``.
          * ``segment_summary`` - one row per segment with the columns
            'Segment', 'User Count' (int) and 'Percentage' (string such as
            '22.17%'), ordered by 'User Count' descending.
    """
    labelled = eda_filtered_users_pd.copy()
    labelled['Final_Segment'] = labelled.apply(assign_final_segment_v4, axis=1)

    # Tally users per segment; the column names are set explicitly so they do
    # not depend on what reset_index() calls them.
    tally = labelled['Final_Segment'].value_counts().reset_index()
    tally.columns = ['Segment', 'User Count']

    # Share of all users in each segment, as a percentage.
    grand_total = tally['User Count'].sum()
    tally['Percentage'] = (tally['User Count'] / grand_total) * 100

    # Largest segment first, then format the columns for display.
    summary = (
        tally
        .sort_values(by='User Count', ascending=False)
        .reset_index(drop=True)
        .assign(**{
            'Percentage': lambda frame: frame['Percentage'].round(2).astype(str) + '%',
            'User Count': lambda frame: frame['User Count'].astype(int),
        })
    )

    return labelled, summary

# Segment the loaded feature table and print the per-segment user counts and shares.
df_segmented, segment_summary = run_segmentation(eda_filtered_users_pd)
print(segment_summary)

                                          Segment  User Count Percentage
0                                   Family Market        1330     22.17%
1      Premium High Engaged Other (Non-Core Ages)        1180     19.67%
2            Mid-Ranged High Engaged Core (35-49)        1100     18.34%
3  Mid-Ranged High Engaged Other  (Non-Core Ages)         897     14.95%
4                           Price Sensitive Buyer         869     14.49%
5               Premium High Engaged Core (35-49)         622     10.37%



### Executive Summary

The hierarchical segmentation analysis assigned every user (5,998 in total) to one of six segments, applying rules on spend tier, traveller type, age group, and engagement (page clicks) in a fixed order:

- **Family Market**: Largest segment, representing 22.17% of users (1,330 individuals). This group is characterized by family-oriented purchasing patterns.
- **Premium High Engaged Other (Non-Core Ages)**: Comprises 19.67% of users (1,180 individuals), indicating strong engagement among premium buyers outside core age ranges.
- **Mid-Ranged High Engaged Core (35-49)**: Accounts for 18.34% of users (1,100 individuals), highlighting high engagement within the core age group.
- **Mid-Ranged High Engaged Other (Non-Core Ages)**: Represents 14.95% of users (897 individuals), showing notable engagement among mid-range buyers outside the core age group.
- **Price Sensitive Buyer**: Makes up 14.49% of users (869 individuals), characterized by a focus on value and price sensitivity.
- **Premium High Engaged Core (35-49)**: Smallest segment at 10.37% (622 individuals), consisting of highly engaged premium buyers within the core age group.

**Key Insights:**  
- The Family Market and the two Premium High Engaged segments together constitute roughly 52% of the user base (22.17% + 19.67% + 10.37%).
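- Engagement is strong both within and outside the core age group (35-49): no user fell into a non-"High Engaged" Premium or Mid-Range segment or into the dormant Budget segment. This also suggests that the engagement threshold (22 total page clicks) may be too low to discriminate between users and should be reviewed.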
- Price sensitivity remains a significant factor for nearly 15% of users.


In [0]:
# Save the segmented users (input columns plus 'Final_Segment') to CSV,
# leaving out the DataFrame index.
df_segmented.to_csv('df_segmented_export.csv', index=False)