In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import os
import scipy.stats as stats
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from scipy.stats import zscore
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import uuid

In [None]:
# Verify up front that every input file is present so that a missing file is
# reported by name (together with any others) before loading starts.
# Paths are resolved relative to the current working directory.
required_files = [
    "caribbean_ports.csv",
    "cruise_cancellations.csv",
    "voyage_options.csv",
    "cruise_pics.csv",
]
missing_files = [path for path in required_files if not os.path.isfile(path)]
if missing_files:
    raise FileNotFoundError(
        "Missing input file(s) in " + os.getcwd() + ": " + ", ".join(missing_files)
    )

caribbean_ports = pd.read_csv("caribbean_ports.csv")
cruise_cancellations = pd.read_csv("cruise_cancellations.csv")
voyage_df = pd.read_csv("voyage_options.csv")
ab_pics = pd.read_csv("cruise_pics.csv")

## **I. Summary Stats**

### **Dataset Summary**

In [None]:
# Summary statistics for the port data as loaded; the min/max rows are the
# quickest way to spot values outside their plausible range.
caribbean_ports.describe()

In [None]:
# Preview the first five rows (five is head()'s default).
caribbean_ports.head()

### **Handling Missing Data and Impossible Values**

In [None]:
# Number of missing values per column, columns with the most gaps first.
missing_counts = caribbean_ports.isna().sum()
print(missing_counts.sort_values(ascending=False))

In [None]:
# Each check prints the rows whose value falls outside the range tested below;
# an empty frame means the column has no such rows.

# avg_port_fee_usd: negative fees
port_fee = caribbean_ports['avg_port_fee_usd']
print(caribbean_ports.loc[port_fee.lt(0)])

# local_attractions_score: outside 1..10
attractions = caribbean_ports['local_attractions_score']
print(caribbean_ports.loc[attractions.lt(1) | attractions.gt(10)])

# avg_customer_satisfaction: outside 1..10
satisfaction = caribbean_ports['avg_customer_satisfaction']
print(caribbean_ports.loc[satisfaction.lt(1) | satisfaction.gt(10)])

# avg_disembark_rate: outside 0..1
disembark_rate = caribbean_ports['avg_disembark_rate']
print(caribbean_ports.loc[disembark_rate.lt(0) | disembark_rate.gt(1)])

# avg_shore_spend_per_passenger: negative spend
shore_spend = caribbean_ports['avg_shore_spend_per_passenger']
print(caribbean_ports.loc[shore_spend.lt(0)])

In [None]:
def clean_group_mean(df, column, valid_min, valid_max):
    """Per-row country mean of the valid values in ``column``.

    A value is valid when ``valid_min <= value <= valid_max``. Invalid and
    missing values are excluded from the averages.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column`` and ``'country_or_territory'``.
    column : str
        Name of the numeric column to average.
    valid_min, valid_max : number
        Inclusive bounds of the valid range (``float('inf')`` for "no upper
        bound").

    Returns
    -------
    pandas.Series
        Indexed exactly like ``df``: every row, including rows whose own value
        is invalid, receives the mean of the valid values of its
        ``country_or_territory`` group. The entry is NaN when the group has no
        valid value at all.
    """
    is_valid = (df[column] >= valid_min) & (df[column] <= valid_max)
    # Blank out invalid values instead of dropping their rows: the rows that
    # need imputing must stay in the result, otherwise a boolean mask selecting
    # them finds nothing and index-aligned assignment writes NaN.
    valid_only = df[column].where(is_valid)
    return valid_only.groupby(df['country_or_territory']).transform('mean')

# (column, lowest valid value, highest valid value) for every checked column.
# float('inf') means "no upper bound", so only values below the minimum are
# flagged for those columns.
valid_ranges = [
    ('avg_port_fee_usd', 0, float('inf')),               # 1. fee >= 0
    ('local_attractions_score', 1, 10),                  # 2. score in 1–10
    ('avg_customer_satisfaction', 1, 10),                # 3. score in 1–10
    ('avg_disembark_rate', 0, 1),                        # 4. rate in 0–1
    ('avg_shore_spend_per_passenger', 0, float('inf')),  # 5. spend >= 0
]

for port_col, lowest, highest in valid_ranges:
    mask = (caribbean_ports[port_col] < lowest) | (caribbean_ports[port_col] > highest)
    valid_means = clean_group_mean(caribbean_ports, port_col, lowest, highest)
    # NOTE(review): the assignment aligns on index, so a masked row that has no
    # entry in valid_means[mask] is written as NaN — confirm with describe().
    caribbean_ports.loc[mask, port_col] = valid_means[mask]

In [None]:
# Re-run the summary after the replacement step: min/max should now sit inside
# the valid ranges, and a `count` lower than the number of rows would reveal
# values that ended up as NaN.
caribbean_ports.describe()

During the data validation process, no missing (null) values were found across any columns in the caribbean_ports dataset. However, upon further inspection using summary statistics, several variables contained impossible values outside their expected ranges, such as negative port fees, customer satisfaction scores above 10, and disembarkation rates exceeding 1. To address these issues, invalid entries were imputed by replacing them with the mean of valid values from the same country_or_territory. This ensured that the corrections respected the appropriate business logic while maintaining consistency within each country group.

### **Insights**

In [None]:
# Mean of avg_annual_visitors across the ports of each region.
region_visitors = (
    caribbean_ports['avg_annual_visitors']
    .groupby(caribbean_ports['region'])
    .mean()
)
print(region_visitors)

Ports in the Southern Caribbean attract the highest average number of annual visitors (over 812,000), followed by the Eastern Caribbean. In contrast, the Bahamas and Western regions receive significantly fewer visitors on average.

In [None]:
# Mean shore spend per passenger for each port type.
spend_by_port_type = caribbean_ports.groupby('port_type')['avg_shore_spend_per_passenger']
port_type_spend = spend_by_port_type.agg('mean')
print(port_type_spend)

Passengers tend to spend the most at Major ports (123.65 USD on average), followed closely by Boutique and Eco/Nature ports. Private ports show the lowest average shore spending at about 93.09 USD per passenger.

In [None]:
# Region x port-type grid of mean customer satisfaction; a NaN cell means the
# data holds no port for that combination (or none with a usable score).
pivot_cust_sat = pd.pivot_table(
    caribbean_ports,
    index='region',
    columns='port_type',
    values='avg_customer_satisfaction',
    aggfunc='mean',
)
print(pivot_cust_sat)

Customer satisfaction is highest at Boutique ports across all regions, especially in the Bahamas with an average score of 9.21. Major ports generally receive lower satisfaction scores compared to Boutique and Private ports. The missing values in the pivot table occur because there are no Eco/Nature ports in the Bahamas and no Major ports in the Southern region.

In [None]:
# Five highest-rated individual ports. Every row of caribbean_ports is one
# port, not a country average, so the port name is shown next to its country
# to keep the table from being read as a country-level ranking.
top_satisfaction = (
    caribbean_ports[['port_name', 'country_or_territory', 'avg_customer_satisfaction']]
    .sort_values(by='avg_customer_satisfaction', ascending=False)
    .head(5)
)
print(top_satisfaction)

A port in Honduras records the highest customer satisfaction score (9.96), followed by ports in Mexico, the British Virgin Islands, Grenada, and the Bahamas, all with scores above 9.5. Note that these are scores of individual ports rather than country-level averages. These ports appear to offer the most positive experiences for cruise passengers.

In [None]:
# Five ports with the highest local attractions score.
attraction_columns = ['port_name', 'country_or_territory', 'local_attractions_score']
top_attractions = (
    caribbean_ports[attraction_columns]
    .sort_values(by='local_attractions_score', ascending=False)
    .head(5)
)

print(top_attractions)

Progreso and Port Antonio stand out with the highest local attractions scores (9.0), followed closely by Half Moon Cay, St. Lucia, and Bimini with scores of 8.0. This suggests these ports offer a broader range of attractions compared to other Caribbean destinations.

In [None]:
# Per-country totals and averages, ordered from the most to the least
# satisfied country.
country_summary = (
    caribbean_ports
    .groupby('country_or_territory')
    .agg(
        total_annual_visitors=('avg_annual_visitors', 'sum'),
        avg_customer_satisfaction=('avg_customer_satisfaction', 'mean'),
    )
    .reset_index()
    .sort_values(by='avg_customer_satisfaction', ascending=False)
)

print(country_summary)

British Virgin Islands and Grenada report the highest average customer satisfaction scores, each achieving 9.56. Despite not having the highest visitor volumes, these destinations offer experiences that passengers rate extremely highly. In contrast, larger markets such as the Dominican Republic and Venezuela attract significant numbers of visitors but show noticeably lower satisfaction levels, suggesting that higher traffic does not always translate into better passenger experiences.

### **Findings**

The exploratory analysis indicates that cruise passengers are most attracted to ports offering a combination of strong local attractions and specialized experiences, such as Boutique ports. Regions with higher visitor volumes, like the Southern and Eastern Caribbean, do not necessarily correlate with higher customer satisfaction, suggesting that passenger experience is more closely tied to port characteristics than to overall traffic. Ports classified as Major drive the highest shore spending, yet their satisfaction scores tend to be lower compared to Boutique and Private ports, highlighting a potential trade-off between economic gain and visitor experience. At the country level, smaller territories such as the British Virgin Islands and Grenada report the highest customer satisfaction scores, while larger markets like the Dominican Republic and Venezuela show noticeably lower ratings. Furthermore, the identification and correction of invalid data ensured that conclusions were based on consistent and realistic figures. The results suggest that enhancing local attractions and improving specialized experiences could be more effective strategies for ports aiming to increase both visitor satisfaction and economic impact. However, it is important to note that the dataset had some initial inconsistencies, including impossible values for several key variables, which required careful correction. In addition, not all region and port type combinations were represented, potentially limiting the generalizability of some findings across the broader Caribbean cruise market.

Based on the data, Lobster Land should prioritize partnering with or operating in Boutique ports, as they consistently receive the highest customer satisfaction scores across all regions. Although Major ports attract higher spending per passenger, Boutique ports strike a balance between strong spending and a superior passenger experience—an important factor for brand positioning. The Eastern Caribbean stands out as an attractive region because of its high visitor volumes, and the Bahamas because of its strong satisfaction scores (its Boutique ports average 9.21), even though Bahamian ports receive fewer visitors on average. Additionally, ports like Progreso and Port Antonio, with top local attractions scores, suggest strong tourism value and engagement opportunities. Focusing on high-satisfaction, mid-to-high-spend destinations will help Lobster Land deliver a premium yet enjoyable cruise experience from day one.

## **II. Segmentation and Targeting**

### **Data Loading and Exploration**

In [None]:
# Start the segmentation from a fresh read of the port file.
df = pd.read_csv('caribbean_ports.csv')

missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

In [None]:
# Remove impossible values: keep only the rows where every checked column lies
# inside its valid range (rows with a missing value in a checked column fail
# the comparison and are dropped as well).
valid_rows = (
    df['avg_annual_visitors'].ge(0)
    & df['avg_port_fee_usd'].ge(0)
    & df['avg_shore_spend_per_passenger'].ge(0)
    & df['avg_customer_satisfaction'].between(1, 10)
    & df['local_attractions_score'].between(1, 10)
    & df['avg_disembark_rate'].between(0, 1)
)
# .copy() makes df an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
df = df.loc[valid_rows].copy()

In [None]:
# Numeric variables that are summarized below and fed to the scaler.
numeric_cols = [
    'avg_annual_visitors',
    'avg_port_fee_usd',
    'local_attractions_score',
    'avg_customer_satisfaction',
    'excursion_variety_index',
    'avg_disembark_rate',
    'avg_shore_spend_per_passenger',
    'seasonality_score',
    'latitude',
    'longitude',
]
numeric_features = df[numeric_cols]

# Standardize the features: fit the scaler on them, then transform them.
scaler = StandardScaler().fit(numeric_features)
X_scaled = scaler.transform(numeric_features)

print("\nSummary Statistics for Numeric Variables after Cleaning:")
numeric_features.describe()

In [None]:
# Elbow method: fit K-Means for k = 2..10 and record each model's inertia.
k_range = range(2, 11)
inertia = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Elbow curve: inertia against the number of clusters.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_range, inertia, marker='o')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia (Within-Cluster Sum of Squares)')
ax.grid(True)
plt.show()

In [None]:
# Fit the 7-cluster model on the scaled features and store each row's label.
final_kmeans = KMeans(n_clusters=7, random_state=42, n_init=10)
cluster_labels = final_kmeans.fit_predict(X_scaled)
df['cluster'] = cluster_labels

# Visitors vs. shore spend, one color per cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='avg_annual_visitors',
    y='avg_shore_spend_per_passenger',
    hue='cluster',
    palette='Set1',
    ax=ax,
)
ax.set_title('Clusters: Annual Visitors vs Shore Spend per Passenger')
# NOTE(review): the label says "(millions)" but the column is plotted as-is —
# confirm the axis tick scaling matches.
ax.set_xlabel('Average Annual Visitors (millions)')
ax.set_ylabel('Average Shore Spend per Passenger (USD)')
ax.legend(title='Cluster')
ax.grid(True)
plt.show()

*As per the elbow chart, the optimal value of k appears to be 7. However, in the scatter plot of the 7-cluster solution the points do not fill that many distinct clusters: cluster 6 contains too few data points to support any meaningful analysis. Hence, for this analysis, we will use k = 6.*