# 📊 Revenue Assurance & Fraud Detection in Ed-Tech

This notebook walks through Exploratory Data Analysis (EDA), churn prediction with XGBoost, and fraud detection with an Isolation Forest, using a synthetic Ed-Tech dataset loaded from the CSV files in `data/`.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load CSVs
# Read every raw table in a single pass; the names on the left line up
# positionally with the paths on the right, and files are read in that order.
(
    users,
    sessions,
    subscriptions,
    transactions,
    interactions,
    support_logs,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/users.csv',
        'data/sessions.csv',
        'data/subscriptions.csv',
        'data/transactions.csv',
        'data/content_interactions.csv',
        'data/support_logs.csv',
    )
)

## 🔍 EDA: User Overview

In [None]:
# Parse the signup timestamp and derive a monthly period column from it.
users['signup_date'] = pd.to_datetime(users['signup_date'])
users['signup_month'] = users['signup_date'].dt.to_period('M')

# Bar chart of user counts per region, most populous region first.
region_order = users['region'].value_counts().index
ax = sns.countplot(data=users, x='region', order=region_order)
ax.set_title('User Distribution by Region')
ax.tick_params(axis='x', labelrotation=45)  # tilt labels so region names do not overlap
plt.show()

## 📉 Churn Prediction: Feature Engineering, Training & Evaluation

In [None]:
# Simulated churn label: a user is marked as churned when they requested a
# refund on more than one transaction OR their summed `drop_out` over all
# content interactions is positive.
refund_counts = transactions.groupby('user_id')['refund_requested'].sum().reset_index()
dropouts = interactions.groupby('user_id')['drop_out'].sum().reset_index()

# Outer merge keeps users present in only one of the two tables; the missing
# side is treated as a count of zero.
user_churn = refund_counts.merge(dropouts, on='user_id', how='outer').fillna(0)

many_refunds = user_churn['refund_requested'].gt(1)
any_dropout = user_churn['drop_out'].gt(0)
user_churn['churn'] = (many_refunds | any_dropout).astype(int)

# Join features
# Start from the static user attributes and attach the churn label.
features = users[['user_id', 'age', 'gender', 'region', 'device_count']].copy()

# Left join (not the default inner join) so that users who appear in neither
# `transactions` nor `interactions` are kept instead of silently dropped.
# Under the labelling rule they have zero refunds and zero drop-outs, so they
# get churn = 0 -- the same default the outer merge + fillna(0) applies to
# users missing from only one of the two tables.
# NOTE(review): if fully inactive users should be excluded from the model,
# filter them out explicitly rather than relying on the join type.
features = features.merge(user_churn[['user_id', 'churn']], on='user_id', how='left')
features['churn'] = features['churn'].fillna(0).astype(int)

# One-hot encode the categorical columns; drop_first avoids a redundant
# indicator column per category.
features = pd.get_dummies(features, columns=['gender', 'region'], drop_first=True)

# Train-test split
# Model inputs are every column except the identifier and the label.
X = features.drop(columns=['user_id', 'churn'])
y = features['churn']

# Stratify on the label so the train and test sets keep the same churn rate;
# otherwise the reported metrics depend on how the random split happened to
# distribute the positive class. Stratification needs at least two rows per
# class, so fall back to a plain random split when that does not hold.
stratify_on = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_on
)

# Train model
# `use_label_encoder` is intentionally not passed: it is deprecated in recent
# XGBoost releases, where supplying it only produces a "parameter not used"
# warning, and the 0/1 churn labels need no encoding.
# random_state is fixed to match the seed used for the train/test split so the
# whole cell is reproducible.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# Evaluation
# Per-class precision / recall / F1 on the held-out set, from hard predictions.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# ROC AUC is computed from the predicted probability of the positive class
# (second column of predict_proba), not from the hard labels.
churn_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, churn_proba)
print("ROC AUC:", auc)

## 🚨 Fraud Detection with Isolation Forest

In [None]:
# Create features from transactions
# One row per user: mean / standard deviation / count of `amount`, plus the
# summed `refund_requested`. Missing values are replaced with 0 -- e.g. the
# sample standard deviation is NaN for a user with a single transaction.
fraud_df = (
    transactions
    .groupby('user_id')
    .agg(
        avg_amount=('amount', 'mean'),
        std_amount=('amount', 'std'),
        num_txns=('amount', 'count'),
        refunds=('refund_requested', 'sum'),
    )
    .reset_index()
    .fillna(0)
)

# Fit Isolation Forest
# contamination=0.05 tells the model to flag roughly 5% of users as outliers.
fraud_features = fraud_df[['avg_amount', 'std_amount', 'num_txns', 'refunds']]
iso = IsolationForest(contamination=0.05, random_state=42)

# fit_predict returns hard labels: -1 for outliers, 1 for inliers.
fraud_df['fraud_score'] = iso.fit_predict(fraud_features)
fraud_df['fraudulent'] = (fraud_df['fraud_score'] == -1).astype(int)

# Continuous anomaly score from the fitted model: lower means more abnormal,
# and negative values correspond to the rows labelled -1 above. Needed because
# the binary flag alone cannot rank users within the flagged group.
fraud_df['anomaly_score'] = iso.decision_function(fraud_features)

# Show top suspicious users
# Flagged users first, most anomalous at the top.
fraud_df.sort_values(by=['fraudulent', 'anomaly_score'], ascending=[False, True]).head(10)