# MovieLens 1M + Simulated Subscription Data with Missingness
This notebook extends the MovieLens 1M dataset simulation by introducing realistic missing data in demographic and engagement fields and includes basic imputation strategies.

In [None]:
# Install dependencies
!pip install pandas numpy

In [None]:
import pandas as pd
import numpy as np
import os
# Seed NumPy's global random state so the np.random.* draws in later cells are repeatable
# when the notebook is run top to bottom.
np.random.seed(42)

In [None]:
# Load data (ensure ratings.dat, users.dat and movies.dat exist in the current directory).
# The files use the multi-character delimiter '::', which requires the Python parser engine.
# movies.dat is ISO-8859-1 (Latin-1) encoded: some titles contain accented characters that
# are not valid UTF-8, so reading it with the default encoding raises UnicodeDecodeError.
# Latin-1 decodes plain ASCII bytes identically, so it is passed to all three reads for
# consistency.
ratings = pd.read_csv('ratings.dat', sep='::', engine='python', encoding='latin-1',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
users = pd.read_csv('users.dat', sep='::', engine='python', encoding='latin-1',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
movies = pd.read_csv('movies.dat', sep='::', engine='python', encoding='latin-1',
                     names=['MovieID', 'Title', 'Genres'])

In [None]:
# Build one row per rating, enriched with the movie's and the user's attributes.
# Left joins keep every rating even if a movie or user lookup finds no match.
df = (
    ratings
    .merge(movies, on='MovieID', how='left')
    .merge(users, on='UserID', how='left')
)

In [None]:
# Interpret the raw Timestamp values as seconds since the Unix epoch, then keep both
# the full datetime and its calendar-date part.
watched_at = pd.to_datetime(df['Timestamp'], unit='s')
df['Timestamp'] = watched_at
df['WatchDate'] = watched_at.dt.date

In [None]:
# Simulate watch time: every rating row gets a random factor drawn uniformly from
# [15, 30), and the simulated minutes are the rating multiplied by that factor.
minutes_per_rating_point = np.random.uniform(15, 30, size=len(df))
df['WatchTimeMin'] = df['Rating'] * minutes_per_rating_point

In [None]:
# Simulate tenure: whole months elapsed between a user's first rating and each rating.

# Average Gregorian month length in days (365.2425 / 12 = 30.436875). A calendar month is
# not a fixed duration, and recent pandas versions reject np.timedelta64(1, 'M') as an
# ambiguous timedelta unit, so the divisor is spelled out as an explicit number of days.
AVG_DAYS_PER_MONTH = 365.2425 / 12

# Earliest rating timestamp per user, indexed by UserID.
first_watch = df.groupby('UserID')['Timestamp'].min().rename('FirstWatch')
# Map the per-user value back onto every rating row. Plain column assignment overwrites
# FirstWatch on a re-run, whereas df.join(...) would raise on the overlapping column name.
df['FirstWatch'] = df['UserID'].map(first_watch)

# Elapsed time expressed in average months, truncated to a whole number of months.
month_length = pd.Timedelta(days=AVG_DAYS_PER_MONTH)
df['TenureMonths'] = ((df['Timestamp'] - df['FirstWatch']) / month_length).astype(int)

In [None]:
# Introduce missing data: blank out a random fraction of rows in each listed column.
# NOTE: df has one row per rating, so missingness is applied per rating row; the same
# user can have Age/Occupation missing on some rows and present on others.
#
# Column order is significant: each df.sample(...) call draws from the random stream in
# turn, so iterating in this order reproduces the same sequence of draws.
MISSING_FRACTIONS = {
    'Age': 0.1,
    'Occupation': 0.05,
    'WatchTimeMin': 0.03,
    'TenureMonths': 0.02,
}
for column, frac in MISSING_FRACTIONS.items():
    # NaN cannot be stored in an integer column. Cast to float explicitly instead of
    # relying on implicit upcasting during assignment, which pandas has deprecated.
    df[column] = df[column].astype(float)
    df.loc[df.sample(frac=frac).index, column] = np.nan

In [None]:
# Impute missing values with one summary statistic per column:
# median for Age and TenureMonths, most frequent value for Occupation, mean for WatchTimeMin.
fill_values = {
    'Age': df['Age'].median(),
    'Occupation': df['Occupation'].mode()[0],
    'WatchTimeMin': df['WatchTimeMin'].mean(),
    'TenureMonths': df['TenureMonths'].median(),
}
for column, fill_value in fill_values.items():
    df[column] = df[column].fillna(fill_value)

In [None]:
# Collapse the rating-level frame into one row per user.
df['WatchDate'] = pd.to_datetime(df['WatchDate'])
agg = df.groupby('UserID').agg(
    TotalWatchTimeMin=('WatchTimeMin', 'sum'),      # total simulated minutes watched
    TenureMonths=('TenureMonths', 'max'),           # largest tenure value seen for the user
    UniqueMoviesWatched=('MovieID', 'nunique'),     # distinct movies rated
    ActiveDays=('WatchDate', 'nunique'),            # distinct calendar days with a rating
)

# Derived engagement ratios.
active_days = agg['ActiveDays']
agg['AvgWatchTimePerDay'] = agg['TotalWatchTimeMin'] / active_days
# The +1 keeps the denominator non-zero for users whose tenure is 0 months.
agg['WatchFrequencyPerMonth'] = active_days / (agg['TenureMonths'] + 1)

# Preview result
agg.reset_index().head()

## Simulate Treatment Assignment and Renewal Outcome
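We simulate a binary treatment flag (e.g., promotion sent) and a binary renewal outcome (e.g., user renewed or not). Treatment is assumed to have a positive causal effect only for engaged users, defined here as users whose total watch time is above the median; for all other users the renewal probability stays at the base rate.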

In [None]:
# Simulate treatment: each user independently receives the promotion with probability 0.5
# (randomized controlled trial design).
np.random.seed(42)
user_ids = agg.index.to_series()
treatment = pd.Series(
    np.random.binomial(1, 0.5, size=user_ids.shape[0]),
    index=user_ids,
)
# Assignment aligns on the UserID index shared by `treatment` and `agg`.
agg['treatment'] = treatment

In [None]:
# Simulate outcome: every user renews with probability 0.2; users who are both treated
# and engaged (total watch time above the median) get 0.15 added to that probability.
base_rate = 0.2
is_engaged = agg['TotalWatchTimeMin'] > agg['TotalWatchTimeMin'].median()
is_treated = agg['treatment'] == 1
uplift = 0.15 * (is_engaged & is_treated).astype(float)
# One Bernoulli draw per user, using that user's own renewal probability.
agg['renewed'] = np.random.binomial(1, base_rate + uplift)

In [None]:
# Check distribution: share of users for each value of the renewed flag
renewed_share = agg['renewed'].value_counts(normalize=True)
renewed_share