In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence every warning category so cell output stays uncluttered.
# NOTE(review): a blanket 'ignore' also hides any deprecation or indexing
# warnings raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw booking export; the path is relative to the notebook's working directory.
df = pd.read_csv('hotel_booking.csv')

# Preview only the first rows instead of rendering the whole frame. The
# contact/payment columns (dropped in a later cell) are hidden from the preview
# so they never end up in saved cell output; `df` itself is left untouched.
df.drop(columns=['email', 'phone-number', 'credit_card'], errors='ignore').head()

In [None]:
# (rows, columns) of the freshly loaded frame
(len(df.index), len(df.columns))

In [None]:
# Drop the contact and payment columns, which are not used in the analysis.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df = df.drop(columns=['email', 'phone-number', 'credit_card'], errors='ignore')

In [None]:
# Column dtypes and non-null counts; info() prints its report directly.
df.info()

# A cell only renders its last expression, so the column index goes last
# (placed first, it was evaluated and silently discarded).
df.columns

In [None]:
# Convert the status-date column to datetime values so it can be grouped on
# and queried through the .dt accessor in later cells.
df = df.assign(
    reservation_status_date=pd.to_datetime(df['reservation_status_date'])
)

In [None]:
# Count / unique / top / freq summary for the text (object-dtype) columns
df.select_dtypes(include='object').describe()

In [None]:
# List the distinct values of every text (object-dtype) column, one block per
# column, each followed by a dashed separator line.
separator = '-' * 50
for col in df.select_dtypes(include='object').columns:
    print(col, df[col].unique(), separator, sep='\n')
    

In [None]:
# Missing values per column, before any cleaning
df.isna().sum()

In [None]:
# Drop the 'company' and 'agent' columns, which are not used in the analysis.
# errors='ignore' keeps the cell re-runnable (a second run would otherwise
# raise KeyError because the columns no longer exist).
df = df.drop(columns=['company', 'agent'], errors='ignore')

In [None]:
# Discard every row that still has at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# Re-check missing values per column after the clean-up
df.isna().sum()

In [None]:
# Summary statistics for the cleaned frame, using pandas' default column selection
df.describe()

In [None]:
# The ADR column has heavy outliers; a box plot makes them visible
df['adr'].plot.box()

In [None]:
# Remove the extreme ADR values seen in the box plot; 5000 is a manually chosen
# cut-off. .copy() makes the result an independent frame, so later column
# assignments (e.g. df['month'] = ...) do not write to a slice of the
# unfiltered data and trigger SettingWithCopyWarning.
df = df[df['adr'] < 5000].copy()

## Data Analysis and Visualizations

In [None]:
# Global plotting defaults. The matplotlib style sheet is applied first and the
# seaborn palette second, so the palette is the one that ends up active.
MPL_STYLE_SHEET = 'seaborn-v0_8-darkgrid'
SEABORN_PALETTE = "husl"

plt.style.use(MPL_STYLE_SHEET)
sns.set_palette(SEABORN_PALETTE)

In [None]:
# Project colour scheme as hex codes, plus the same colours wrapped in a
# seaborn palette object.
colors = [
    '#2E8B57',
    '#DC143C',
    '#4169E1',
    '#FF6347',
    '#32CD32',
    '#FF1493',
]
custom_palette = sns.color_palette(palette=colors)

In [None]:
# 1. ENHANCED CANCELLATION RATE ANALYSIS
# Section banner: the title framed by two 60-character rules.
rule = "=" * 60
print(rule, "HOTEL BOOKING CANCELLATION ANALYSIS - PROFESSIONAL DASHBOARD", rule, sep="\n")

In [None]:
# Share of bookings per cancellation flag (label 0 is reported as
# non-cancelled, label 1 as cancelled).
cancelled_perc = df['is_canceled'].value_counts(normalize=True)

print("\nCancellation Statistics:")
kept_share = cancelled_perc.loc[0]
print(f"Non-cancelled bookings: {kept_share:.1%}")
cancelled_share = cancelled_perc.loc[1]
print(f"Cancelled bookings: {cancelled_share:.1%}")


In [None]:
# Booking-status overview: absolute counts as a bar chart (left) and shares as
# a pie chart (right). The figure is built and shown in ONE cell because the
# inline backend renders and closes a figure at the end of the cell that
# created it; spread over several cells, the finished chart never appears.
status_labels = ['Not Cancelled', 'Cancelled']
status_colors = ['#2E8B57', '#DC143C']

# value_counts() orders by frequency, so sort by flag (0, then 1) to guarantee
# the counts line up with status_labels even if cancellations are the majority.
status_counts = df['is_canceled'].value_counts().sort_index()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: number of reservations per status
bars = ax1.bar(status_labels,
               status_counts.values,
               color=status_colors,
               edgecolor='white',
               linewidth=2,
               alpha=0.8)

# Value labels above the bars; padding is in points, so the gap no longer
# depends on the scale of the data (previously a fixed +1000 offset).
ax1.bar_label(bars,
              labels=[f'{int(count):,}' for count in status_counts.values],
              padding=3, fontweight='bold', fontsize=12)

ax1.set_title('Hotel Booking Status Distribution',
              fontsize=16, fontweight='bold', pad=20)
ax1.set_ylabel('Number of Reservations', fontsize=12, fontweight='bold')
ax1.grid(True, alpha=0.3)
# 10% headroom so the value labels stay inside the axes
ax1.set_ylim(0, status_counts.max() * 1.1)

# Right panel: the same counts as percentages
wedges, texts, autotexts = ax2.pie(status_counts.values,
                                   labels=status_labels,
                                   colors=status_colors,
                                   autopct='%1.1f%%',
                                   startangle=90,
                                   explode=(0.05, 0.05),
                                   shadow=True)

ax2.set_title('Cancellation Rate Distribution',
              fontsize=16, fontweight='bold', pad=20)

# Make the percentage labels readable on the coloured wedges
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(12)
    autotext.set_fontweight('bold')

plt.tight_layout()
plt.show()

In [None]:
# 2. ENHANCED HOTEL TYPE COMPARISON
# Built, labelled and shown in ONE cell: the inline backend closes a figure at
# the end of the cell that created it, so a follow-up cell calling plt.gca()
# would get a fresh empty axes with no bars to label.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='hotel', hue='is_canceled',
              palette=['#2E8B57', '#DC143C'],
              alpha=0.8, ax=ax)

ax.set_title('Reservation Status by Hotel Type',
             fontsize=18, fontweight='bold', pad=20, color='#2F4F4F')
ax.set_xlabel('Hotel Type', fontsize=14, fontweight='bold')
ax.set_ylabel('Number of Reservations', fontsize=14, fontweight='bold')

# Relabel the legend seaborn already created instead of calling
# plt.legend(labels=...): without explicit handles matplotlib pairs the labels
# with the first artists on the axes, which can give both entries the same
# colour. Editing the existing texts keeps each colour bound to its hue level
# (levels are ordered 0, 1).
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Booking Status', prop={'size': 12})
    for legend_text, status in zip(legend.get_texts(),
                                   ['Not Cancelled', 'Cancelled']):
        legend_text.set_text(status)
        legend_text.set_fontsize(11)

# Add value labels
for container in ax.containers:
    ax.bar_label(container, fmt='%d', fontweight='bold')

ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Split the bookings by hotel type and report each type's cancellation rate
# (the mean of the 0/1 is_canceled flag).
resort_hotels = df.loc[df['hotel'].eq('Resort Hotel')]
city_hotels = df.loc[df['hotel'].eq('City Hotel')]

resort_rate = resort_hotels['is_canceled'].mean()
print(f"\nResort Hotel Cancellation Rate: {resort_rate:.1%}")
city_rate = city_hotels['is_canceled'].mean()
print(f"City Hotel Cancellation Rate: {city_rate:.1%}")

In [None]:
# 3. ENHANCED ADR COMPARISON WITH PLOTLY
# Mean ADR per reservation-status date, computed separately for each hotel type.
resort_hotels_adr = resort_hotels.groupby('reservation_status_date').agg(adr=('adr', 'mean'))
city_hotels_adr = city_hotels.groupby('reservation_status_date').agg(adr=('adr', 'mean'))

fig = go.Figure()

# One line+marker trace per hotel type; only name, data and colour differ.
hotel_series = (
    ('Resort Hotel', resort_hotels_adr, '#2E8B57'),
    ('City Hotel', city_hotels_adr, '#DC143C'),
)
for trace_name, daily_adr, hex_color in hotel_series:
    fig.add_trace(
        go.Scatter(
            x=daily_adr.index,
            y=daily_adr['adr'],
            mode='lines+markers',
            name=trace_name,
            line=dict(color=hex_color, width=3),
            marker=dict(size=6, color=hex_color),
        )
    )

# Centred title, white template, legend pinned to the top-left corner.
title_settings = dict(
    text='Average Daily Rate (ADR) Trends by Hotel Type',
    x=0.5,
    xanchor='center',
    font=dict(size=20, family='Arial Black'),
)
fig.update_layout(
    title=title_settings,
    xaxis_title='Date',
    yaxis_title='Average Daily Rate ($)',
    font=dict(size=12),
    hovermode='x unified',
    template='plotly_white',
    height=500,
    showlegend=True,
    legend=dict(x=0.02, y=0.98),
)

fig.show()

In [None]:
# 4. ENHANCED MONTHLY ANALYSIS
# assign() returns a new frame, so adding the column is safe even when df is a
# filtered slice (a plain df['month'] = ... would be a chained assignment).
# NOTE(review): the month is taken from reservation_status_date, not from an
# arrival-date column — confirm that this is the intended grouping.
df = df.assign(month=df['reservation_status_date'].dt.month)
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']


def _month_labels(month_numbers):
    """Translate 1-based month numbers into their short names."""
    return [month_names[number - 1] for number in month_numbers]


# Two stacked panels: booking counts per status (top), ADR of cancellations (bottom)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12))

# Monthly reservation status. fill_value=0 turns a month/status combination
# with no bookings into a zero-height bar instead of NaN.
monthly_data = df.groupby(['month', 'is_canceled']).size().unstack(fill_value=0)
monthly_data.index = _month_labels(monthly_data.index)

monthly_data.plot(kind='bar', ax=ax1, color=['#2E8B57', '#DC143C'],
                  alpha=0.8, width=0.7)
ax1.set_title('Monthly Reservation Status Distribution',
              fontsize=16, fontweight='bold', pad=20)
ax1.set_xlabel('Month', fontsize=12, fontweight='bold')
ax1.set_ylabel('Number of Reservations', fontsize=12, fontweight='bold')
# Columns come out of unstack() sorted (0, 1), matching this label order
ax1.legend(['Not Cancelled', 'Cancelled'], title='Status')
ax1.grid(True, alpha=0.3)
ax1.tick_params(axis='x', rotation=0)

# Monthly ADR for cancelled bookings
monthly_adr = df[df['is_canceled'] == 1].groupby('month')['adr'].mean()
monthly_adr.index = _month_labels(monthly_adr.index)

bars = ax2.bar(monthly_adr.index, monthly_adr.values,
               color='#FF6347', alpha=0.8, edgecolor='white', linewidth=2)
ax2.set_title('Average Daily Rate for Cancelled Bookings by Month',
              fontsize=16, fontweight='bold', pad=20)
ax2.set_xlabel('Month', fontsize=12, fontweight='bold')
ax2.set_ylabel('Average Daily Rate ($)', fontsize=12, fontweight='bold')
ax2.grid(True, alpha=0.3)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + 2,
             f'${height:.0f}', ha='center', va='bottom',
             fontweight='bold', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# 5. ENHANCED TOP COUNTRIES ANALYSIS WITH PLOTLY
# Countries ranked by number of cancelled bookings; only the ten largest are
# charted, so the pie percentages are relative to those ten.
cancelled_data = df.loc[df['is_canceled'] == 1]
top_10_country = cancelled_data['country'].value_counts().head(10)

chart_title = 'Top 10 Countries with Cancelled Reservations'
hover_text = '<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>'

# Interactive pie chart
fig = px.pie(
    names=top_10_country.index,
    values=top_10_country.values,
    title=chart_title,
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    hovertemplate=hover_text,
)
fig.update_layout(
    title=dict(
        text=chart_title,
        x=0.5,
        xanchor='center',
        font=dict(size=20, family='Arial Black'),
    ),
    font=dict(size=12),
    height=600,
    showlegend=True,
)

fig.show()

# The first entry of the ranking is the country with the most cancellations
leading_country = top_10_country.index[0]
leading_count = top_10_country.iloc[0]
print(f"\nTop Country for Cancellations: {leading_country} ({leading_count:,} cancellations)")

In [None]:
# 6. MARKET SEGMENT ANALYSIS
# Segment shares across all bookings and across cancelled bookings only; the
# five largest segments of each are printed under their own heading.
market_dist = df['market_segment'].value_counts(normalize=True)
cancelled_market = cancelled_data['market_segment'].value_counts(normalize=True)

segment_reports = (
    ("\nMarket Segment Distribution:", market_dist),
    ("\nCancelled Bookings by Market Segment:", cancelled_market),
)
for heading, shares in segment_reports:
    print(heading)
    for segment, pct in shares.iloc[:5].items():
        print(f"{segment}: {pct:.1%}")


In [None]:
# 7. ENHANCED ADR TREND ANALYSIS
# Mean ADR per reservation-status date, separately for cancelled and
# non-cancelled bookings.
cancelled_df_adr = cancelled_data.groupby('reservation_status_date')[['adr']].mean().reset_index()
not_cancelled_data = df[df['is_canceled'] == 0]
not_cancelled_df_adr = not_cancelled_data.groupby('reservation_status_date')[['adr']].mean().reset_index()

# Restrict both series to the same date window for better visualization.
# Each frame needs a mask built from ITS OWN dates: the two frames have
# different lengths and row indexes, so re-using the cancelled mask on the
# non-cancelled frame either raises an IndexingError or selects rows by
# position rather than by date.
date_mask = (
    (cancelled_df_adr['reservation_status_date'] > '2016')
    & (cancelled_df_adr['reservation_status_date'] < '2017-09')
)
not_cancelled_date_mask = (
    (not_cancelled_df_adr['reservation_status_date'] > '2016')
    & (not_cancelled_df_adr['reservation_status_date'] < '2017-09')
)
cancelled_df_adr = cancelled_df_adr[date_mask]
not_cancelled_df_adr = not_cancelled_df_adr[not_cancelled_date_mask]

# Create interactive comparison chart
fig = go.Figure()

# 'tonexty' on the first trace has no earlier trace to fill towards, so per the
# plotly docs it behaves like 'tozeroy' — both areas are filled down to zero.
fig.add_trace(go.Scatter(x=not_cancelled_df_adr['reservation_status_date'],
                         y=not_cancelled_df_adr['adr'],
                         mode='lines+markers',
                         name='Not Cancelled',
                         line=dict(color='#2E8B57', width=3),
                         marker=dict(size=6),
                         fill='tonexty'))

fig.add_trace(go.Scatter(x=cancelled_df_adr['reservation_status_date'],
                         y=cancelled_df_adr['adr'],
                         mode='lines+markers',
                         name='Cancelled',
                         line=dict(color='#DC143C', width=3),
                         marker=dict(size=6),
                         fill='tozeroy'))

fig.update_layout(
    title={
        'text': 'Average Daily Rate Comparison: Cancelled vs Not Cancelled',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 20, 'family': 'Arial Black'}
    },
    xaxis_title='Date',
    yaxis_title='Average Daily Rate ($)',
    font=dict(size=12),
    hovermode='x unified',
    template='plotly_white',
    height=500,
    showlegend=True
)

fig.show()


In [None]:
# 8. SUMMARY INSIGHTS
# Findings 1, 2, 4 and 5 are computed from df so the summary cannot drift from
# the data when the cleaning steps or the input file change.
# NOTE(review): findings 3, 6 and 7 are narrative statements that no cell in
# this notebook derives directly — confirm them or back them with a computation.
overall_cancel_rate = df['is_canceled'].mean()
hotel_cancel_rates = (
    df.groupby('hotel')['is_canceled'].mean().sort_values(ascending=False)
)
cancelled_bookings = df[df['is_canceled'] == 1]
country_cancellations = cancelled_bookings['country'].value_counts()
# Share of all cancellations per segment (not the per-segment cancellation
# rate), matching what the market-segment section reports.
segment_cancel_share = cancelled_bookings['market_segment'].value_counts(normalize=True)

hotel_ranking = " > ".join(
    f"{hotel} ({rate:.1%})" for hotel, rate in hotel_cancel_rates.items()
)

print("\n" + "="*60)
print("KEY INSIGHTS SUMMARY")
print("="*60)
print(f"1. Cancellation Rate: {overall_cancel_rate:.0%} of all bookings are cancelled")
print(f"2. Cancellation rate by hotel type: {hotel_ranking}")
print("3. Average Daily Rate (ADR) is a major factor influencing cancellations")
print(f"4. {country_cancellations.index[0]} has the highest number of cancellations "
      f"({country_cancellations.iloc[0]:,})")
print(f"5. {segment_cancel_share.index[0]} accounts for the largest share of cancellations "
      f"({segment_cancel_share.iloc[0]:.1%})")
print("6. Higher ADR periods correlate with increased cancellation rates")
print("7. Summer months show both higher rates and higher cancellations")
print("="*60)
