# ***Hypothesis Testing with Simple Data Cleaning***

### Imports

In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import calendar

### Load Data and Prepare Data (run before each hypothesis test)

In [37]:
# Global plot appearance used by every figure below
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)

# Read the three cleaned extracts: customers, usage logs, support tickets
df_customers = pd.read_csv('../data/simple_cleaned_customer_data.csv')
df_usage = pd.read_csv('../data/simple_cleaned_usage_logs.csv')
df_tickets = pd.read_csv('../data/simple_cleaned_tickets.csv')

# Parse the date columns so the .dt accessors below work
df_customers = df_customers.assign(
    contract_start_date=pd.to_datetime(df_customers['contract_start_date']),
    contract_end_date=pd.to_datetime(df_customers['contract_end_date']),
)
df_usage['date'] = pd.to_datetime(df_usage['date'])
df_tickets['created_at'] = pd.to_datetime(df_tickets['created_at'])

# Customer summary: mean session_minutes and logins across each customer's
# usage rows, left-joined so customers without usage rows are kept (as NaN)
user_engagement = df_usage.groupby('customer_id', as_index=False)[['session_minutes', 'logins']].mean()
df_master = pd.merge(df_customers, user_engagement, on='customer_id', how='left')

# Month helpers: ticket month name, and contract-start cohort as a monthly Period
df_tickets['month'] = df_tickets['created_at'].dt.month_name()
df_master['cohort_month'] = df_master['contract_start_date'].dt.to_period('M')

## ***Hypothesis A: Dashboard Speed***

#### "The product is broken; customers are leaving because the new dashboard is too slow."

In [32]:
# Hypothesis A: check whether "slow dashboard" tickets line up with churn or
# with drops in engagement.

# Flag tickets whose text contains any performance keyword. The match is a
# case-insensitive substring search; tickets with missing text count as False.
perf_keywords = ['slow', 'lag', 'load', 'dashboard', 'latency']
df_tickets['is_performance_issue'] = df_tickets['ticket_text'].str.contains('|'.join(perf_keywords), case=False, na=False)
perf_tickets = df_tickets[df_tickets['is_performance_issue']]

# Count performance tickets per customer
perf_counts = perf_tickets.groupby('customer_id').size().reset_index(name='perf_ticket_count')

# Drop a count column left over from an earlier run of this cell; otherwise
# the merge yields perf_ticket_count_x / _y and the fillna raises KeyError.
df_master = df_master.drop(columns='perf_ticket_count', errors='ignore')
df_master = df_master.merge(perf_counts, on='customer_id', how='left')
# Customers with no performance ticket get 0 rather than NaN
df_master['perf_ticket_count'] = df_master['perf_ticket_count'].fillna(0)

# Visualize churn rate vs engagement
plt.figure(figsize=(10, 5))
sns.boxplot(data=df_master, x='is_churned', y='session_minutes')
plt.title('Hypothesis A: Does Low Engagement Drive Churn?')
plt.xlabel('Churned (0=No, 1=Yes)')
plt.ylabel('Avg Daily Session Minutes')
plt.savefig('hyp_a_engagement_churn.png')
plt.close()

# Visualize ticket volume of "slow" issues over time.
# Group on a derived year-month key instead of overwriting df_tickets['month']:
# that column holds the month names set in the load cell, which the
# Hypothesis D cell converts into an ordered categorical.
ticket_period = perf_tickets['created_at'].dt.to_period('M').rename('month')
slow_ticket_trend = perf_tickets.groupby(ticket_period).size()
plt.figure(figsize=(10, 5))
slow_ticket_trend.plot(kind='line', marker='o')
plt.title('Hypothesis A: Trend of "Slow Dashboard" Tickets')
plt.ylabel('Number of Tickets')
plt.savefig('hyp_a_ticket_trend.png')
plt.close()

## ***Hypothesis B: Customer Quality***

#### "Sales is acquiring low-quality customers who go out of business in 3 months. The product is fine."

In [34]:
# Mean initial onboarding score for each contract-start cohort
cohort_quality = df_master.groupby('cohort_month')['initial_onboarding_score'].agg('mean')

# Line chart of the cohort averages with a dashed reference line at 7.0
fig, ax = plt.subplots(figsize=(10, 5))
cohort_quality.plot(ax=ax, kind='line', marker='o', color='green')
ax.set_title('Hypothesis B: Onboarding Score by Cohort')
ax.set_ylabel('Avg Initial Onboarding Score')
# 7.0 is the notebook author's assumed "good" score, not a value taken from the data
ax.axhline(y=7.0, color='r', linestyle='--', label='Target Score')
ax.legend()
fig.savefig('hyp_b_onboarding_trend.png')
plt.close(fig)

## ***Hypothesis C: Market Saturation***

#### "The market is heavily saturated and we are scraping the bottom of the barrel." (not in case study prompt, but addresses regional differences)

In [36]:
# Test whether discounts are being given excessively to win more (low-quality)
# customers: average discount and contract value for each contract-start cohort
cohorts = df_master.groupby('cohort_month')
cohort_discount = cohorts['discount_pct'].mean()
cohort_acv = cohorts['annual_contract_value'].mean()  # computed here but not plotted in this cell

# Line chart of the average discount percentage per cohort
fig, ax = plt.subplots(figsize=(10, 5))
cohort_discount.plot(ax=ax, kind='line', marker='o', color='orange')
ax.set_title('Hypothesis C: Discount % by Cohort')
ax.set_ylabel('Avg Discount Percentage')
fig.savefig('hyp_c_discount_trend.png')
plt.close(fig)

## ***Hypothesis D: Customer Support Collapse***

#### "We are overwhelmed by tickets and nobody listens to the customer."

In [38]:
# Hypothesis D: average ticket resolution time per calendar month.

# Define chronological month order
month_order = list(calendar.month_name[1:])  # ['January', 'February', ..., 'December']

# Build the ordered month categorical directly from created_at rather than
# from whatever df_tickets['month'] currently holds: the Hypothesis A cell
# overwrites that column with monthly Periods, which match none of the
# month-name categories and would turn every value into NaN.
# NOTE(review): month names pool the same calendar month across different
# years - confirm the ticket data spans a single year.
df_tickets['month'] = pd.Categorical(
    df_tickets['created_at'].dt.month_name(),
    categories=month_order,
    ordered=True,
)

# Observe support ticket resolution times.
# observed=False keeps all twelve months on the axis (NaN where a month has
# no tickets) and makes the choice explicit, since the pandas default for
# categorical groupers is changing.
support_perf = df_tickets.groupby('month', observed=False)['resolution_hours'].mean()

# Visualize resolution time trends
plt.figure(figsize=(10, 5))
support_perf.plot(kind='line', marker='o', color='purple')
plt.title('Hypothesis D: Avg Ticket Resolution Time (Hours)')
plt.ylabel('Hours')
plt.xlabel('Month')
plt.savefig('hyp_d_resolution_trend.png')
plt.close()
