In [21]:
import kagglehub

# Fetch the latest version of the Kaggle dataset; `path` is the local
# directory that holds the downloaded files and is reused by later cells.
dataset_handle = "rockyt07/social-media-user-analysis"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)


Using Colab cache for faster access to the 'social-media-user-analysis' dataset.
Path to dataset files: /kaggle/input/social-media-user-analysis


**Setup and Loading** (Data Preprocessing)


In [22]:
import polars as pl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.cluster import KMeans
import plotly.express as px

# Load the dataset CSV from the directory returned by kagglehub (`path`,
# defined in the download cell above).
df = pl.read_csv(f"{path}/instagram_usage_lifestyle.csv")

# Preprocessing: convert categorical columns to integer codes.
# `df` keeps the original (un-encoded) values; `df_pd` is the encoded copy.
df_pd = df.to_pandas()  # Convert to Pandas for Scikit-Learn compatibility
cat_cols = ['gender', 'income_level', 'urban_rural', 'employment_status', 'content_type_preference']

# Keep one fitted encoder per column so integer codes can be mapped back to
# their labels later (label_encoders[col].classes_ / .inverse_transform).
# NOTE: LabelEncoder assigns codes in sorted label order, which need not match
# any natural ordering of the categories (e.g. for income_level).
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df_pd[col] = le.fit_transform(df_pd[col])
    label_encoders[col] = le

Revenue & Growth - Which user attributes most accurately predict a "Premium" subscriber?



In [None]:
# Feature selection for subscription prediction
X1 = df_pd[['income_level', 'age', 'linked_accounts_count', 'daily_active_minutes_instagram']]
y1 = df_pd['uses_premium_features']

# Model Training
# n_jobs=-1 builds the trees on all available cores; with the fixed
# random_state the fitted forest is the same as a single-threaded run.
rf_premium = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_premium.fit(X1, y1)

# Business Output: Feature Importance
# NOTE: feature_importances_ are impurity-based and computed on the training
# data; they rank features by how much they were used for splits, not by
# held-out predictive accuracy.
print("Factors driving Premium Subscriptions:")
print(pd.Series(rf_premium.feature_importances_, index=X1.columns).sort_values(ascending=False))

Marketing: Ad-Click Optimization - Can we identify high-value ad targets based on lifestyle habits?

In [None]:
# Predicting ad clicks based on lifestyle
# .copy() gives X2 its own data, so the column assignment below does not write
# into a slice of df_pd (which triggers pandas' SettingWithCopyWarning).
X2 = df_pd[['exercise_hours_per_week', 'diet_quality', 'age', 'income_level']].copy()
# Binary target: 1 when a user's ads_clicked_per_day is above the median.
y2 = (df_pd['ads_clicked_per_day'] > df_pd['ads_clicked_per_day'].median()).astype(int)

# Encode 'diet_quality' as it contains string values (it is not part of
# cat_cols, so the preprocessing cell left it un-encoded).
# LabelEncoder is already imported in the setup cell.
le_diet_quality = LabelEncoder()
X2['diet_quality'] = le_diet_quality.fit_transform(X2['diet_quality'])

# Hold out 20% of the rows: scoring a random forest on the rows it was fitted
# on overstates accuracy, so the reported figure is measured on unseen data.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

clf_ads = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1).fit(X2_train, y2_train)
print(f"Ad-Targeting Model Accuracy: {clf_ads.score(X2_test, y2_test):.2%}")

In [None]:
Product: Feature Cannibalization (Reels vs. Feed) - Is there a negative correlation between Reels consumption and Feed engagement?

In [None]:
# Statistical Correlation (Series.corr defaults to the Pearson coefficient)
cannibalization_corr = df_pd['time_on_reels_per_day'].corr(df_pd['time_on_feed_per_day'])
print(f"Correlation between Reels and Feed usage: {cannibalization_corr:.4f}")

# Visualizing the trade-off on a random subset of rows.
# The seed makes the plotted sample reproducible across runs; min() keeps
# sample() from raising when the frame has fewer than 2000 rows.
reels_feed_sample = df_pd.sample(n=min(2000, len(df_pd)), random_state=42)
fig3 = px.scatter(
    reels_feed_sample,
    x="time_on_reels_per_day",
    y="time_on_feed_per_day",
    trendline="ols",  # OLS trendline (plotly relies on statsmodels for this)
    title="Reels vs. Feed Usage",
)
fig3.show()

Strategy: User Persona Segmentation - Can we group 1 million users into 5 distinct behavioral archetypes?

In [None]:
# Segment users into 5 clusters from standardized engagement metrics.
engagement_cols = ['reels_watched_per_day', 'daily_active_minutes_instagram', 'posts_created_per_week']
scaler = StandardScaler()
X4 = scaler.fit_transform(df_pd[engagement_cols])

kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X4)

# Attach each row's cluster id to the frame as its persona label.
df_pd['User_Persona'] = kmeans.labels_

persona_counts = df_pd['User_Persona'].value_counts()
print("User count per Persona:")
print(persona_counts)

Wellness: The "Burnout" Early Warning System - At what usage threshold does the "Perceived Stress Score" spike?

In [None]:
# Finding the 'Stress Tipping Point'
# Split daily usage into 10 equal-width bins, then average the stress score
# within each bin.
usage_bins = pd.cut(df_pd['daily_active_minutes_instagram'], bins=10)
# observed=False is passed explicitly: grouping by a categorical without it
# emits a FutureWarning on recent pandas, and the default is changing.
# Keeping it False retains empty bins in the result (shown as NaN).
stress_by_usage = df_pd.groupby(usage_bins, observed=False)['perceived_stress_score'].mean()
print("Average Stress Score by Usage Tipping Point:")
print(stress_by_usage)

In [None]:
Retention: Sleep-Engagement Equilibrium - What is the "sweet spot" of Instagram usage that doesn't ruin user sleep?

In [None]:
# Comparing Sleep vs Usage on a random subset of rows.
# The seed makes the plotted sample reproducible across runs; min() keeps
# sample() from raising when the frame has fewer than 5000 rows.
sleep_usage_sample = df_pd.sample(n=min(5000, len(df_pd)), random_state=42)
fig6 = px.density_heatmap(sleep_usage_sample, x="daily_active_minutes_instagram", y="sleep_hours_per_night",
                          title="Sleep vs. Usage Heatmap")
fig6.show()

Security: 2FA & Friction Analysis - Does enabling security features (2FA/Biometrics) lead to lower overall engagement?

In [None]:
# Mean engagement score for each value of the two_factor_auth_enabled flag.
engagement_scores = df_pd['user_engagement_score']
security_impact = engagement_scores.groupby(df_pd['two_factor_auth_enabled']).mean()

print("Engagement Score: 2FA Disabled vs Enabled")
print(security_impact)

Operations: Notification Sensitivity - Does a high notification response rate actually lead to longer sessions?

In [None]:
# Regression analysis on session length
X8 = df_pd[['notification_response_rate']]
y8 = df_pd['average_session_length_minutes']

# Hold out 20% of the rows: a random forest scored on its own training rows
# reports an inflated R2, so the figure below is measured on unseen data.
X8_train, X8_test, y8_train, y8_test = train_test_split(
    X8, y8, test_size=0.2, random_state=42
)

# random_state makes the reported score reproducible across runs.
reg_sessions = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=-1).fit(X8_train, y8_train)
print(f"Notification Impact on Session Length R2: {reg_sessions.score(X8_test, y8_test):.4f}")

Growth: Urban vs. Rural Behavioral Divergence - Do urban and rural users prefer different types of content?

In [None]:
# Content preference by region
# Both grouping columns were replaced by integer codes in df_pd during
# preprocessing, so the counts are built from the original polars frame `df`
# to keep the original values on the chart's axis and legend.
region_source = df.select(['urban_rural', 'content_type_preference']).to_pandas()
region_prefs = (
    region_source
    .groupby(['urban_rural', 'content_type_preference'])
    .size()
    .unstack(fill_value=0)  # a region/content pair with no users counts as 0, not NaN
)
fig9 = px.bar(region_prefs, barmode='group', title="Content Preference by Region")
fig9.show()

PR/Ethics: Happiness-Driven Growth - Can we predict user happiness based on their non-digital lifestyle (books, events)?

In [None]:
# Predictive Happiness Model
X10 = df_pd[['books_read_per_year', 'social_events_per_month', 'exercise_hours_per_week']]
y10 = df_pd['self_reported_happiness']

# random_state fixes the forest's bootstrap/feature sampling so the importance
# ranking below is reproducible; n_jobs=-1 fits the trees on all cores.
happiness_model = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1).fit(X10, y10)

# NOTE: feature_importances_ are impurity-based and computed on the training
# data; they show which features the trees split on most, not how well
# happiness is predicted on unseen users.
print("Top Lifestyle Drivers of Happiness:")
print(pd.Series(happiness_model.feature_importances_, index=X10.columns).sort_values(ascending=False))