Load the Dataset

In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw user data into the working frame.
# NOTE(review): absolute path — consider a configurable data directory.
df = pd.read_csv("/mnt/data/streamworks_user_data.csv")

# Preview the first few rows. A notebook cell only renders its LAST bare
# expression, so mid-cell previews must be passed to display() (available
# as a builtin inside a Jupyter/IPython kernel) or they produce no output.
display(df.head())

# Data types and structure (df.info() prints its report itself).
df.info()

# Summary statistics of the numeric columns.
display(df.describe())


# Categorical columns (plus the churn label) whose level frequencies are inspected.
categorical_cols = ["gender", "country", "subscription_type",
                    "received_promotions", "referred_by_friend", "is_churned"]

for col in categorical_cols:
    # Names that are not in the loaded file are skipped.
    if col in df.columns:
        print(f"\nValue counts for {col}:")
        # dropna=False keeps missing values visible as their own row.
        print(df[col].value_counts(dropna=False))

# Missing values per column. Displayed explicitly: a bare expression in the
# middle of a cell is evaluated and then discarded without any output.
display(df.isnull().sum())


# Pairwise correlations between the int64/float64 columns.
numeric_df = df.select_dtypes(include=["int64", "float64"])
corr = numeric_df.corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap for Numeric Variables")
plt.show()






Clean the Data

In [None]:

# Parse the date columns as day-first; values that cannot be parsed become NaT.
df["signup_date"] = pd.to_datetime(df["signup_date"], dayfirst=True, errors="coerce")
df["last_active_date"] = pd.to_datetime(df["last_active_date"], dayfirst=True, errors="coerce")

# Rows without a churn label are dropped.
df = df.dropna(subset=["is_churned"])

# Tenure in days between sign-up and last activity (NaN where a date is NaT).
df["tenure_days"] = (df["last_active_date"] - df["signup_date"]).dt.days

# Fill numeric gaps with each column's median.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Fill text gaps with an explicit "Unknown" level. This must happen while the
# categorical columns are still raw text: get_dummies ignores NaN, so encoding
# first would turn every missing value into the dropped baseline level.
text_cols = df.select_dtypes(include=["object"]).columns
df[text_cols] = df[text_cols].fillna("Unknown")

# Loyalty flag (tenure above 180 days), derived AFTER imputation so the flag
# agrees with the tenure value stored in the frame (NaN > 180 is False).
df["is_loyal"] = (df["tenure_days"] > 180).astype(int)

# The categorical columns are deliberately left un-encoded here. The Feature
# Engineering section one-hot encodes the same list, and get_dummies raises a
# KeyError when asked to encode columns that were already replaced.

# Remaining missing values per column (displayed explicitly; only the last
# expression of a cell is rendered automatically).
display(df.isnull().sum())

df.head()



Feature Engineering

In [None]:

import numpy as np
from sklearn.preprocessing import MinMaxScaler




# Tenure in days. Rows with an unparseable/missing date yield NaN here, which
# would re-introduce missing values into the model inputs, so they are filled
# with the median tenure.
tenure = (df["last_active_date"] - df["signup_date"]).dt.days
df["tenure_days"] = tenure.fillna(tenure.median())

# Loyalty flag: tenure above 180 days.
df["is_loyal"] = (df["tenure_days"] > 180).astype(int)

# Watch hours per unit of monthly fee. A fee of 0 would produce inf (or NaN for
# 0/0), which scikit-learn estimators reject, so zero fees are treated as
# missing and the resulting gaps are filled with the median ratio.
fee = df["monthly_fee"].replace(0, np.nan)
ratio = df["average_watch_hours"] / fee
df["watch_per_fee_ratio"] = ratio.fillna(ratio.median())

# Heavy mobile user flag: more than 70 in mobile_app_usage_pct.
df["heavy_mobile_user"] = (df["mobile_app_usage_pct"] > 70).astype(int)



# Log-transform watch hours; the +1 shift keeps zero-hour users finite.
shifted_hours = df["average_watch_hours"] + 1
df["log_watch_hours"] = np.log(shifted_hours)

# Min-max scale watch hours, fitted on the whole column.
scaler = MinMaxScaler()
watch_hours_2d = df[["average_watch_hours"]]
df["watch_hours_scaled"] = scaler.fit_transform(watch_hours_2d)


categorical_cols = [
    "gender",
    "country",
    "subscription_type",
    "received_promotions",
    "referred_by_friend"
]

# Encode only the columns that are still in the frame. get_dummies raises a
# KeyError for names it cannot find, which happens when an earlier cell has
# already replaced these columns with their one-hot versions.
cols_to_encode = [c for c in categorical_cols if c in df.columns]

if cols_to_encode:
    # drop_first=True omits the first level of each column as the baseline.
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True)


# Age bands. pd.cut intervals are right-closed, so 24 falls in "<25".
# include_lowest=True keeps a value equal to the first edge (0) in the first
# band, and the open-ended top edge keeps large values in the last band.
# Without these, out-of-range values become NaN and, after one-hot encoding
# with drop_first, are indistinguishable from the dropped baseline band.
age_bins = [0, 24, 34, 44, 54, 64, np.inf]
age_labels = ["<25", "25-34", "35-44", "45-54", "55-64", "65+"]

df["age_group"] = pd.cut(
    df["age"], bins=age_bins, labels=age_labels, include_lowest=True
)

# Watch-time bands, with the same edge handling: users with exactly 0 hours
# belong to "Low" and users above 200 hours belong to "Very High".
watch_bins = [0, 10, 30, 60, np.inf]
watch_labels = ["Low", "Medium", "High", "Very High"]

df["watch_time_category"] = pd.cut(
    df["average_watch_hours"],
    bins=watch_bins,
    labels=watch_labels,
    include_lowest=True,
)

# One-hot encode both banded columns; the first band of each is the baseline.
df = pd.get_dummies(df, columns=["age_group", "watch_time_category"], drop_first=True)



# Flag users whose average watch hours are below 10.
is_low_watch = df["average_watch_hours"] < 10
df["low_watch_time"] = is_low_watch.astype(int)

# Interaction flag: promotion received AND low watch time.
# NOTE(review): relies on the encoded column being named
# "received_promotions_Yes" — confirm against the raw level names.
got_promo = df["received_promotions_Yes"] == 1
low_watch = df["low_watch_time"] == 1
df["promo_and_low_watch"] = (got_promo & low_watch).astype(int)

# Remove the raw date columns; errors="ignore" tolerates them being absent.
cols_to_drop = ["signup_date", "last_active_date"]
df = df.drop(columns=cols_to_drop, errors="ignore")


Statistical Analysis

In [None]:
from scipy import stats
import pandas as pd

def _resolve_categorical(frame, cat_col):
    """Return `cat_col` as a Series of labels, rebuilding it from one-hot columns if needed.

    If `cat_col` is still a column of `frame` it is returned unchanged.
    Otherwise the columns named `<cat_col>_<level>` are collapsed back into a
    single label column. Rows where every indicator is 0 belong to the level
    removed by `drop_first=True`; its name is not recoverable, so those rows
    are labelled "(reference level)".

    Raises:
        KeyError: if neither the raw column nor any one-hot column exists.
    """
    if cat_col in frame.columns:
        return frame[cat_col]

    prefix = f"{cat_col}_"
    dummy_cols = [c for c in frame.columns if str(c).startswith(prefix)]
    if not dummy_cols:
        raise KeyError(f"{cat_col!r} not found as a raw or one-hot encoded column")

    # Cast to int so bool and 0/1 indicator columns are handled alike.
    indicators = frame[dummy_cols].astype(int)
    labels = indicators.idxmax(axis=1).str[len(prefix):]
    has_level = indicators.sum(axis=1) > 0
    return labels.where(has_level, "(reference level)").rename(cat_col)


def chi_square_test(cat_col, target="is_churned"):
    """Print a chi-square test of independence between `cat_col` and `target`.

    Reads the notebook-level frame `df`. `cat_col` may be a raw categorical
    column or one that has already been one-hot encoded (see
    `_resolve_categorical`).

    Args:
        cat_col: Name of the categorical variable.
        target: Name of the column to cross-tabulate against.

    Raises:
        KeyError: if `cat_col` cannot be found in either form, or `target`
            is not a column of `df`.
    """
    categories = _resolve_categorical(df, cat_col)
    contingency = pd.crosstab(categories, df[target])
    # The fourth return value (expected frequencies) is not reported.
    chi2, p, dof, _ = stats.chi2_contingency(contingency)

    print(f"\nChi-Square Test: {cat_col} vs {target}")
    print("Chi2 Statistic:", chi2)
    print("p-value:", p)
    print("Degrees of Freedom:", dof)
    print("\nContingency Table:")
    print(contingency)

# Run the test for each required categorical column. A column that cannot be
# found (for example because an earlier cell one-hot encoded it) is reported
# explicitly; silently skipping it would make this cell print nothing at all
# and hide the fact that no test was run.
for col in ["gender", "received_promotions", "referred_by_friend"]:
    try:
        chi_square_test(col)
    except KeyError as exc:
        print(f"\nSkipping chi-square test for {col}: column not available ({exc})")


# Split average watch hours by churn status.
watch_hours = df["average_watch_hours"]
churn_flag = df["is_churned"]
churned = watch_hours[churn_flag == 1]
retained = watch_hours[churn_flag == 0]

# Two-sample t-test without the equal-variance assumption (Welch).
t_stat, p_val = stats.ttest_ind(churned, retained, equal_var=False)

print("T-test: Watch Time Difference Between Churned & Retained Users")
print("t-statistic:", t_stat)
print("p-value:", p_val)
print("Mean watch hours (Churned):", churned.mean())
print("Mean watch hours (Retained):", retained.mean())



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlations between the int64/float64 columns of the engineered frame.
corr_matrix = df.select_dtypes(include=["int64", "float64"]).corr()

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap (Numeric Variables)")
plt.show()

# Watch-hour distribution for each churn status.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x="is_churned", y="average_watch_hours", ax=ax)
ax.set_title("Watch Hours by Churn Status")
# Replace the 0/1 tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(["Active (0)", "Churned (1)"])
plt.show()


def _category_labels(frame, col):
    """Return `col` as a label Series for plotting.

    The raw column is used when it exists. If it has been replaced by one-hot
    columns named `<col>_<level>`, the labels are rebuilt from those columns;
    rows with no indicator set belong to the level removed by `drop_first=True`
    and are labelled "(reference level)" because its name is not recoverable.

    Raises:
        KeyError: if neither the raw column nor any one-hot column exists.
    """
    if col in frame.columns:
        return frame[col]

    prefix = f"{col}_"
    dummy_cols = [c for c in frame.columns if str(c).startswith(prefix)]
    if not dummy_cols:
        raise KeyError(f"{col!r} not found as a raw or one-hot encoded column")

    # Cast to int so bool and 0/1 indicator columns are handled alike.
    indicators = frame[dummy_cols].astype(int)
    labels = indicators.idxmax(axis=1).str[len(prefix):]
    has_level = indicators.sum(axis=1) > 0
    return labels.where(has_level, "(reference level)").rename(col)


# Churn counts per subscription type. The labels are resolved through the
# helper because the raw column is removed once it has been one-hot encoded.
plt.figure(figsize=(7,4))
sns.countplot(x=_category_labels(df, "subscription_type"), hue=df["is_churned"])
plt.title("Churn by Subscription Type")
plt.show()

# Distribution of average watch hours with a KDE overlay.
plt.figure(figsize=(7,4))
sns.histplot(df["average_watch_hours"], kde=True)
plt.title("Distribution of Average Watch Hours")
plt.xlabel("Average Watch Hours")
plt.show()

# Churn counts per gender, resolved the same way as subscription type.
plt.figure(figsize=(6,4))
sns.countplot(x=_category_labels(df, "gender"), hue=df["is_churned"])
plt.title("Churn Distribution by Gender")
plt.show()



Predictive Modelling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score


# Label and feature matrix: every column except the churn label is a feature.
# NOTE(review): this includes any identifier-like columns present in df — confirm.
y = df["is_churned"]
X = df.drop(columns="is_churned")

# 75/25 train/test split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)

# Standardize using statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier on the standardized training data.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Hard predictions and the predicted probability of the second class column.
y_pred = log_reg.predict(X_test_scaled)
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]


# Confusion matrix of actual vs. predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix — Logistic Regression")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Point metrics computed from the hard predictions.
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


# ROC curve points and area under the curve from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
# Dashed diagonal as the chance-level reference line.
ax.plot([0, 1], [0, 1], "--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve — Logistic Regression")
ax.legend()
plt.show()

# Pair each feature with its fitted coefficient.
coef_df = pd.DataFrame(
    {"feature": X.columns, "coefficient": log_reg.coef_[0]}
)
# Order by absolute coefficient size, largest first.
coef_df = coef_df.sort_values(by="coefficient", key=abs, ascending=False)

coef_df.head(15)


In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

target = "average_watch_hours"

# Columns that the Feature Engineering section computes directly from the
# target (log, min-max scaled copy, fee ratio, watch-time bands, low-watch
# flags). Leaving them in the feature set leaks the answer to the model:
# watch_hours_scaled alone is a linear function of the target, so the fit
# would be near-perfect and the metrics meaningless.
target_derived = [
    "log_watch_hours",
    "watch_hours_scaled",
    "watch_per_fee_ratio",
    "low_watch_time",
    "promo_and_low_watch",
]
target_derived += [
    c for c in df.columns if str(c).startswith("watch_time_category_")
]

# Remove the target and the churn label (both must exist), then whichever
# target-derived columns are present.
features = (
    df.drop([target, "is_churned"], axis=1)
      .drop(columns=target_derived, errors="ignore")
)

X_reg = features
y_reg = df[target]

# 75/25 train/test split with a fixed seed.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.25, random_state=42
)

# Standardize using statistics learned from the training split only.
scaler_reg = StandardScaler()
X_train_r_scaled = scaler_reg.fit_transform(X_train_r)
X_test_r_scaled = scaler_reg.transform(X_test_r)

lin_reg = LinearRegression()
lin_reg.fit(X_train_r_scaled, y_train_r)

y_pred_r = lin_reg.predict(X_test_r_scaled)

# Test-split metrics.
print("R²:", r2_score(y_test_r, y_pred_r))
print("RMSE:", np.sqrt(mean_squared_error(y_test_r, y_pred_r)))
print("MAE:", mean_absolute_error(y_test_r, y_pred_r))

# Residual = observed minus predicted watch hours on the test split.
residuals = y_test_r - y_pred_r

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_pred_r, residuals, alpha=0.5)
# Zero line as the reference for over- and under-prediction.
ax.axhline(0, linestyle="--", color="red")
ax.set_xlabel("Predicted Watch Hours")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot — Linear Regression")
plt.show()

# Pair each feature with its fitted coefficient, largest magnitude first.
coef_reg_df = pd.DataFrame(
    {"feature": X_reg.columns, "coefficient": lin_reg.coef_}
)
coef_reg_df = coef_reg_df.sort_values(by="coefficient", key=abs, ascending=False)

coef_reg_df.head(15)
