In [24]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot

# Load the dataset
# The path is relative to the notebook's working directory.
DATA_PATH = 'data/wppool_growth_data_sample_20k.csv'
df = pd.read_csv(DATA_PATH)


In [25]:

# Data Exploration & Cleaning
# Handle missing values: usage metrics fall back to their column median,
# missing revenue is treated as 0.
df = df.fillna({'total_sessions': df['total_sessions'].median(),
                'page_views': df['page_views'].median(),
                'days_active': df['days_active'].median(),
                'monthly_revenue': 0})

# Remove duplicates.
# The count must be taken BEFORE dropping: measuring df.duplicated() after
# drop_duplicates() always yields 0. (Re-running this cell without reloading
# df will also report 0, because df is already de-duplicated by then.)
duplicates_removed = int(df.duplicated().sum())
df = df.drop_duplicates()

# Summary of the dataset
summary = df.describe(include='all')
# Share of rows per subscription type, in percent.
free_pro_distribution = df['subscription_type'].value_counts(normalize=True) * 100

# Create a list to store all visualizations (HTML fragments, in report order)
visualizations = []

# 1. Data Exploration & Cleaning
# .get(..., 0.0) keeps the report building if a subscription type is absent
# from the data instead of raising KeyError.
visualizations.append("<h1>Data Exploration & Cleaning</h1>")
visualizations.append(f"<p>Total Users: {len(df)}</p>")
visualizations.append(f"<p>Free Users: {free_pro_distribution.get('Free', 0.0):.2f}%</p>")
visualizations.append(f"<p>Pro Users: {free_pro_distribution.get('Pro', 0.0):.2f}%</p>")
visualizations.append(f"<p>{duplicates_removed} duplicates were removed.</p>")



In [26]:
# 2. User Engagement Analysis
# Mean sessions per subscription type, the five users with the most sessions,
# and the five countries with the largest session totals.
avg_sessions = df.groupby('subscription_type')['total_sessions'].mean()
top_users = df.nlargest(5, 'total_sessions')[['user_id', 'total_sessions', 'subscription_type']]
top_countries = df.groupby('country')['total_sessions'].sum().nlargest(5)

# Plot from frames with real column names so the `labels` mapping is keyed by
# those names, rather than depending on the fallback names Plotly Express
# assigns to anonymous array arguments.
avg_sessions_df = avg_sessions.reset_index()
top_countries_df = top_countries.reset_index()

visualizations.append("<h1>User Engagement Analysis</h1>")
visualizations.append("<h2>Average Sessions for Free vs. Pro Users</h2>")
avg_sessions_fig = px.bar(
    avg_sessions_df,
    x='subscription_type',
    y='total_sessions',
    labels={'subscription_type': 'Subscription Type', 'total_sessions': 'Average Sessions'},
    title='Average Sessions by Subscription Type',
)
visualizations.append(plot(avg_sessions_fig, output_type='div', include_plotlyjs='cdn'))

visualizations.append("<h2>Top 5 Most Active Users</h2>")
top_users_fig = go.Figure(data=[go.Table(
    header=dict(values=["User ID", "Total Sessions", "Subscription Type"]),
    cells=dict(values=[top_users['user_id'], top_users['total_sessions'], top_users['subscription_type']])
)])
visualizations.append(plot(top_users_fig, output_type='div', include_plotlyjs='cdn'))

visualizations.append("<h2>Top 5 Countries with Highest Engagement</h2>")
top_countries_fig = px.bar(
    top_countries_df,
    x='country',
    y='total_sessions',
    labels={'country': 'Country', 'total_sessions': 'Total Sessions'},
    title='Top 5 Countries by Engagement',
)
visualizations.append(plot(top_countries_fig, output_type='div', include_plotlyjs='cdn'))



In [27]:
# 3. Churn Analysis
# Mean of the churn flag per subscription type, expressed as a percentage.
churn_rate = df.groupby('subscription_type')['churned'].mean() * 100

# Correlation of every numeric column with `churned`, strongest first.
# Computed for inspection only; it is not added to the report below.
# NOTE(review): requires `churned` to be a numeric (not bool) column - confirm.
correlation = df.select_dtypes(include=['number']).corr()['churned'].sort_values(ascending=False)

# User counts per (subscription type, churn flag). fill_value=0 keeps a
# combination that never occurs as a 0-height bar instead of NaN.
churn_trends = df.groupby(['subscription_type', 'churned']).size().unstack(fill_value=0)

visualizations.append("<h1>Churn Analysis</h1>")
visualizations.append("<h2>Churn Rate by Subscription Type</h2>")
# A pie chart rescales its values to shares of their sum, so the percentages it
# shows would not be the churn rates themselves. A bar chart plots the actual
# rate for each subscription type.
churn_rate_df = churn_rate.rename('churn_rate').reset_index()
churn_rate_fig = px.bar(
    churn_rate_df,
    x='subscription_type',
    y='churn_rate',
    text='churn_rate',
    labels={'subscription_type': 'Subscription Type', 'churn_rate': 'Churn Rate (%)'},
    title='Churn Rate by Subscription Type',
)
churn_rate_fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
visualizations.append(plot(churn_rate_fig, output_type='div', include_plotlyjs='cdn'))

visualizations.append("<h2>Churn Trends: Free vs. Pro Users</h2>")
churn_trends_fig = px.bar(
    churn_trends,
    barmode='group',
    labels={'value': 'Number of Users', 'subscription_type': 'Subscription Type'},
    title='Churn Trends: Free vs. Pro Users',
)
visualizations.append(plot(churn_trends_fig, output_type='div', include_plotlyjs='cdn'))



In [28]:
# 4. Revenue & Upgrade Trends
# All four metrics below are derived from the rows whose subscription type is 'Pro'.
pro_users = df[df['subscription_type'] == 'Pro']

# Distinct Pro user ids as a share of all distinct user ids.
upgrade_percentage = (pro_users['user_id'].nunique() / df['user_id'].nunique()) * 100
total_revenue = pro_users['monthly_revenue'].sum()
revenue_by_plan = pro_users.groupby('plan_type')['monthly_revenue'].sum()
# Mean days_active of Pro users, reported below as the average time to upgrade.
# NOTE(review): this treats days_active as time-until-upgrade - confirm.
upgrade_time = pro_users['days_active'].mean()

visualizations.append("<h1>Revenue & Upgrade Trends</h1>")
visualizations.append("<h2>Percentage of Users Upgraded from Free to Pro</h2>")
visualizations.append(f"<p>{upgrade_percentage:.2f}%</p>")

visualizations.append("<h2>Total Monthly Revenue from Pro Users</h2>")
visualizations.append(f"<p>${total_revenue:,.2f}</p>")

visualizations.append("<h2>Revenue Contribution by Pro Plan</h2>")
revenue_by_plan_fig = px.pie(
    revenue_by_plan,
    values=revenue_by_plan.values,
    names=revenue_by_plan.index,
    title='Revenue by Pro Plan',
    hole=0.4,
)
visualizations.append(plot(revenue_by_plan_fig, output_type='div', include_plotlyjs='cdn'))

visualizations.append("<h2>Average Time to Upgrade (Days)</h2>")
visualizations.append(f"<p>{upgrade_time:.2f} days</p>")

# 5. Market Expansion Opportunities
# Total monthly revenue per country, as a two-column frame for the map.
country_revenue_totals = df.groupby('country')['monthly_revenue'].sum()
revenue_by_country = country_revenue_totals.reset_index()

visualizations.append("<h1>Market Expansion Opportunities</h1>")
visualizations.append("<h2>Total Revenue by Country</h2>")

# Countries are matched to map regions by name.
revenue_map_fig = px.choropleth(
    revenue_by_country,
    locations='country',
    locationmode='country names',
    color='monthly_revenue',
    hover_name='country',
    title='Total Revenue by Country',
    color_continuous_scale=px.colors.sequential.Plasma,
)
visualizations.append(plot(revenue_map_fig, output_type='div', include_plotlyjs='cdn'))



In [29]:
# 6. High-Engagement vs. Underpenetrated Markets
# Total sessions per country, computed once and reused for both rankings.
sessions_by_country = df.groupby('country')['total_sessions'].sum()

# Five countries with the most sessions and five with the fewest.
# With fewer than ten countries the two groups overlap and a country appears
# under both market types.
high_engagement = sessions_by_country.nlargest(5).reset_index()
underpenetrated = sessions_by_country.nsmallest(5).reset_index()
high_engagement['market_type'] = 'High Engagement'
underpenetrated['market_type'] = 'Underpenetrated'
combined_data = pd.concat([high_engagement, underpenetrated])

visualizations.append("<h1>High-Engagement vs. Underpenetrated Markets</h1>")
# `labels` is keyed by column name; the previous 'x'/'y' keys matched no
# column, so the axis titles were never applied.
markets_fig = px.bar(
    combined_data,
    x='country',
    y='total_sessions',
    color='market_type',
    labels={'country': 'Country',
            'total_sessions': 'Total Sessions',
            'market_type': 'Market Type'},
    title='High-Engagement vs. Underpenetrated Markets',
)
visualizations.append(plot(markets_fig, output_type='div', include_plotlyjs='cdn'))



In [30]:
# Combine all visualizations into a single HTML file
# No plotly <script> is placed in <head>: every figure fragment was generated
# with include_plotlyjs='cdn' and therefore already carries its own script tag.
# The "plotly-latest" CDN alias is also no longer updated, so loading it would
# pull in an outdated plotly.js alongside the current one.
html_content = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>WPPOOL Growth Analytics Dashboard</title>
</head>
<body>
    <h1 style="text-align: center; color: #2c3e50;">WPPOOL Growth Analytics Dashboard</h1>
    {}
</body>
</html>
""".format("\n".join(visualizations))

# Save the HTML file
# Write as UTF-8 explicitly (matching the <meta charset> above); the default
# encoding of open() is platform-dependent and can fail on non-ASCII text.
with open("index.html", "w", encoding="utf-8") as f:
    f.write(html_content)

print("Dashboard saved as 'index.html'")

Dashboard saved as 'index.html'
