In [151]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## EDA

Start and stop times are provided as integers and represent periods of 10 minutes. Stream ID could be used to retrieve a single broadcast segment from a streamer (not used in our work). \
User ID (anonymized)
Stream ID
Streamer username
Time start
Time stop

In [2]:
df = pd.read_csv('./data/100k_a.csv', names=['UserID', 'StreamID', 'StreamerUsername', 'Time start', 'Time stop'])
df

Unnamed: 0,UserID,StreamID,StreamerUsername,Time start,Time stop
0,1,33842865744,mithrain,154,156
1,1,33846768288,alptv,166,169
2,1,33886469056,mithrain,587,588
3,1,33887624992,wtcn,589,591
4,1,33890145056,jrokezftw,591,594
...,...,...,...,...,...
3051728,100000,34167930576,mckytv,3582,3583
3051729,100000,34168288656,natehill,3582,3583
3051730,100000,34169594512,fortnite,3582,3584
3051731,100000,34180223072,highdistortion,3709,3711


### basic statistics

In [None]:
# number of unique users
len(df['UserID'].unique())

In [None]:
# number of unique streamers
len(df['StreamerUsername'].unique())

In [None]:
# number of streams
len(df['StreamID'].unique())

In [None]:
watchDuration = df.describe()['WatchDuration']

In [None]:
# interactions for users
userInteraction = df.groupby('UserID').size().describe()

In [None]:
# interactions for streamers
streamerInteraction = df.groupby('StreamerUsername').size().describe()

In [None]:
streamInteraction = df.groupby('StreamID').size().describe()

In [None]:
basicStatistics = pd.DataFrame([watchDuration, userInteraction, streamerInteraction, streamInteraction])
basicStatistics.index = ['Watch Duration (minutes)', 'User - num Interaction', 'Streamer - num Interaction', 'Stream - num Interaction']
basicStatistics

In [None]:
user_item_rep = df.groupby(['UserID', 'StreamerUsername']).size()
user_item_rep_plot = sns.histplot(user_item_rep, bins=range(1, 15), stat='percent')
user_item_rep_plot.set(title='User-Streamer Repetitions')
sns.despine()

In [None]:
pd.options.display.max_rows = 1000

In [None]:
user_item_rep.sort_values(ascending=False).iloc[:100]

In [None]:
df.describe()

In [None]:
duration = sns.histplot(df, x='WatchDuration', bins=16, stat='density')
duration.set(xlabel='Watch Duration in Minutes', title='Distribution of Watch Duration')
sns.despine()

In [None]:
streamer_interactions = df.groupby('StreamerUsername').size().sort_values(ascending=False).reset_index(name='InteractionCount')
streamer_interactions

In [None]:
interaction = sns.histplot(streamer_interactions['InteractionCount'], bins=10, stat='density', log_scale=True)
interaction.set(xlabel='Interaction Count (log scaled)', title='Distribution of Interaction Count for streamers')
sns.despine()

In [None]:
# big stream avg watch duration vs mid vs small

avgdurationbystreamer = df.groupby('StreamerUsername')['WatchDuration'].mean().reset_index(name='AvgWatchDuration').sort_values('AvgWatchDuration', ascending=False)
avgdurationbystreamer

## Model

### Baseline (always predict user mean)

In [147]:
df = pd.read_csv('./data/100k_a.csv', names=['UserID', 'StreamID', 'StreamerUsername', 'Time start', 'Time stop'])
df['WatchDuration'] = (df['Time stop'] - df['Time start']) * 10 
df.head()

Unnamed: 0,UserID,StreamID,StreamerUsername,Time start,Time stop,WatchDuration
0,1,33842865744,mithrain,154,156,20
1,1,33846768288,alptv,166,169,30
2,1,33886469056,mithrain,587,588,10
3,1,33887624992,wtcn,589,591,20
4,1,33890145056,jrokezftw,591,594,30


In [148]:
train, test = train_test_split(df, test_size=0.3, random_state=158)

In [149]:
# training rmse
user_mean = train.groupby('UserID')['WatchDuration'].mean()
prediction = train['UserID'].map(user_mean)
np.sqrt(mean_squared_error(train['WatchDuration'], prediction))

38.565396425916525

In [150]:
# test rmse
user_mean = test.groupby('UserID')['WatchDuration'].mean()
prediction = test['UserID'].map(user_mean)
np.sqrt(mean_squared_error(test['WatchDuration'], prediction))

37.40724564089123

### Decition Tree

In [133]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.3, random_state=158)

train = train.drop(columns=['Time start', 'Time stop'])
# average watch duration for user
user_avg_watch_duration = train.groupby('UserID')['WatchDuration'].mean()
train = train.assign(UseravgWatchDuration = train['UserID'].map(user_avg_watch_duration))
# number of streams watched by user
user_watched_stream_count = train.groupby('UserID').size()
train = train.assign(numWatchStream=train['UserID'].map(user_watched_stream_count))
# standard deviation of watch duration
variance_watch_duration = train.groupby('UserID')['WatchDuration'].var()
train = train.assign(varWatchStream=train['UserID'].map(variance_watch_duration))
# avg watch duration per streamer
streamer_avg_watch_duration = train.groupby('StreamerUsername')['WatchDuration'].mean()
train = train.assign(StreameravgWatchDuration = train['StreamerUsername'].map(streamer_avg_watch_duration))
# pouplarity of streamer (total number of unique users watching the streamer)
streamer_popularity = train.groupby('StreamerUsername')['UserID'].nunique()
train = train.assign(StreamerUniqueViewer = train['StreamerUsername'].map(streamer_popularity))
# User-Streamer Watch Count
user_streamer_watch_count = train.groupby(['UserID', 'StreamerUsername']).size().reset_index().rename(columns={0: 'UserStreamerWatchCount'})
train = train.merge(user_streamer_watch_count, on=['UserID', 'StreamerUsername'])
# Average Duration for User-Streamer Pairs
user_streamer_avg_duration = train.groupby(['UserID', 'StreamerUsername'])['WatchDuration'].mean().reset_index().rename(columns={'WatchDuration': 'UserStreamerAvgWatchDuration'})
train = train.merge(user_streamer_avg_duration, on=['UserID', 'StreamerUsername'])

In [134]:
Xtrain, Ytrain = train.drop(columns='WatchDuration'), train['WatchDuration']
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OrdinalEncoder
dt = DecisionTreeRegressor(max_depth=15)
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(train[['StreamerUsername']])
Xtrain['StreamerUsername'] = encoder.transform(Xtrain[['StreamerUsername']])
dt.fit(Xtrain, Ytrain)

In [135]:
# training mse
np.sqrt(mean_squared_error(dt.predict(Xtrain), Ytrain))

25.576405318367772

In [136]:
test = test.drop(columns=['Time start', 'Time stop'])
# average watch duration for user
user_avg_watch_duration = test.groupby('UserID')['WatchDuration'].mean()
test = test.assign(UseravgWatchDuration = test['UserID'].map(user_avg_watch_duration))
# number of streams watched by user
user_watched_stream_count = test.groupby('UserID').size()
test = test.assign(numWatchStream=test['UserID'].map(user_watched_stream_count))
# standard deviation of watch duration
variance_watch_duration = test.groupby('UserID')['WatchDuration'].var()
test = test.assign(varWatchStream=test['UserID'].map(variance_watch_duration))
# avg watch duration per streamer
streamer_avg_watch_duration = test.groupby('StreamerUsername')['WatchDuration'].mean()
test = test.assign(StreameravgWatchDuration = test['StreamerUsername'].map(streamer_avg_watch_duration))
# pouplarity of streamer (total number of unique users watching the streamer)
streamer_popularity = test.groupby('StreamerUsername')['UserID'].nunique()
test = test.assign(StreamerUniqueViewer = test['StreamerUsername'].map(streamer_popularity))
# User-Streamer Watch Count
user_streamer_watch_count = test.groupby(['UserID', 'StreamerUsername']).size().reset_index().rename(columns={0: 'UserStreamerWatchCount'})
test = test.merge(user_streamer_watch_count, on=['UserID', 'StreamerUsername'])
# Average Duration for User-Streamer Pairs
user_streamer_avg_duration = test.groupby(['UserID', 'StreamerUsername'])['WatchDuration'].mean().reset_index().rename(columns={'WatchDuration': 'UserStreamerAvgWatchDuration'})
test = test.merge(user_streamer_avg_duration, on=['UserID', 'StreamerUsername'])

In [137]:
Xtest, Ytest = test.drop(columns='WatchDuration'), test['WatchDuration']
Xtest['StreamerUsername'] = encoder.transform(Xtest[['StreamerUsername']])

In [138]:
# test rmse
np.sqrt(mean_squared_error(dt.predict(Xtest), Ytest))

24.170479555160554

### LightGBM

In [93]:
from lightgbm import LGBMRegressor
df = pd.read_csv('./data/100k_a.csv', names=['UserID', 'StreamID', 'StreamerUsername', 'Time start', 'Time stop'])
df['WatchDuration'] = (df['Time stop'] - df['Time start']) * 10 
df.head()

Unnamed: 0,UserID,StreamID,StreamerUsername,Time start,Time stop,WatchDuration
0,1,33842865744,mithrain,154,156,20
1,1,33846768288,alptv,166,169,30
2,1,33886469056,mithrain,587,588,10
3,1,33887624992,wtcn,589,591,20
4,1,33890145056,jrokezftw,591,594,30


In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.3, random_state=158)


train = train.drop(columns=['Time start', 'Time stop'])
# average watch duration for user
user_avg_watch_duration = train.groupby('UserID')['WatchDuration'].mean()
train = train.assign(UseravgWatchDuration = train['UserID'].map(user_avg_watch_duration))
# number of streams watched by user
user_watched_stream_count = train.groupby('UserID').size()
train = train.assign(numWatchStream=train['UserID'].map(user_watched_stream_count))
# standard deviation of watch duration
variance_watch_duration = train.groupby('UserID')['WatchDuration'].var()
train = train.assign(varWatchStream=train['UserID'].map(variance_watch_duration))
# avg watch duration per streamer
streamer_avg_watch_duration = train.groupby('StreamerUsername')['WatchDuration'].mean()
train = train.assign(StreameravgWatchDuration = train['StreamerUsername'].map(streamer_avg_watch_duration))
# pouplarity of streamer (total number of unique users watching the streamer)
streamer_popularity = train.groupby('StreamerUsername')['UserID'].nunique()
train = train.assign(StreamerUniqueViewer = train['StreamerUsername'].map(streamer_popularity))
# User-Streamer Watch Count
user_streamer_watch_count = train.groupby(['UserID', 'StreamerUsername']).size().reset_index().rename(columns={0: 'UserStreamerWatchCount'})
train = train.merge(user_streamer_watch_count, on=['UserID', 'StreamerUsername'])
# Average Duration for User-Streamer Pairs
user_streamer_avg_duration = train.groupby(['UserID', 'StreamerUsername'])['WatchDuration'].mean().reset_index().rename(columns={'WatchDuration': 'UserStreamerAvgWatchDuration'})
train = train.merge(user_streamer_avg_duration, on=['UserID', 'StreamerUsername'])

In [95]:
model = LGBMRegressor(n_estimators=100, random_state=158, reg_alpha=0.1, reg_lambda=0.1)
Xtrain, Ytrain = train.drop(columns='WatchDuration'), train['WatchDuration']
Xtrain['StreamerUsername'] = Xtrain['StreamerUsername'].astype('category')
model.fit(Xtrain, Ytrain)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004951 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11171
[LightGBM] [Info] Number of data points in the train set: 2136213, number of used features: 10
[LightGBM] [Info] Start training from score 31.438040


In [96]:
# training rmse
from sklearn.metrics import mean_squared_error
prediction = model.predict(Xtrain)
np.sqrt(mean_squared_error(Ytrain, prediction))

26.60898296908332

In [97]:

test = test.drop(columns=['Time start', 'Time stop'])
# average watch duration for user
user_avg_watch_duration = test.groupby('UserID')['WatchDuration'].mean()
test = test.assign(UseravgWatchDuration = test['UserID'].map(user_avg_watch_duration))
# number of streams watched by user
user_watched_stream_count = test.groupby('UserID').size()
test = test.assign(numWatchStream=test['UserID'].map(user_watched_stream_count))
# standard deviation of watch duration
variance_watch_duration = test.groupby('UserID')['WatchDuration'].var()
test = test.assign(varWatchStream=test['UserID'].map(variance_watch_duration))
# avg watch duration per streamer
streamer_avg_watch_duration = test.groupby('StreamerUsername')['WatchDuration'].mean()
test = test.assign(StreameravgWatchDuration = test['StreamerUsername'].map(streamer_avg_watch_duration))
# pouplarity of streamer (total number of unique users watching the streamer)
streamer_popularity = test.groupby('StreamerUsername')['UserID'].nunique()
test = test.assign(StreamerUniqueViewer = test['StreamerUsername'].map(streamer_popularity))
# User-Streamer Watch Count
user_streamer_watch_count = test.groupby(['UserID', 'StreamerUsername']).size().reset_index().rename(columns={0: 'UserStreamerWatchCount'})
test = test.merge(user_streamer_watch_count, on=['UserID', 'StreamerUsername'])
# Average Duration for User-Streamer Pairs
user_streamer_avg_duration = test.groupby(['UserID', 'StreamerUsername'])['WatchDuration'].mean().reset_index().rename(columns={'WatchDuration': 'UserStreamerAvgWatchDuration'})
test = test.merge(user_streamer_avg_duration, on=['UserID', 'StreamerUsername'])


Xtest, Ytest = test.drop(columns='WatchDuration'), test['WatchDuration']
Xtest['StreamerUsername'] = Xtest['StreamerUsername'].astype('category')

In [98]:
# test rmse
prediction = model.predict(Xtest)
np.sqrt(mean_squared_error(Ytest, prediction))

22.391810988244575

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
data = pd.read_csv('./data/100k_a.csv',header=None)
data.columns= ['UserID','StreamID','StreamerName','TimeStart','TimeStop']
data['WatchDuration'] = (data['TimeStop'] - data['TimeStart']) * 10 

In [37]:
train, test = train_test_split(data, test_size=0.3)

# training
user_features = train.groupby('UserID').agg(
    UserAvgWatchDuration=('WatchDuration', 'mean'),
    NumWatchStream=('StreamID', 'count'),
    VarWatchStream=('WatchDuration', 'var')
).reset_index()
streamer_features = train.groupby('StreamID').agg(
    StreamerAvgWatchDuration=('WatchDuration', 'mean'),
    StreamerUniqueViewer=('UserID', 'nunique')
).reset_index()
user_streamer_features = train.groupby(['UserID', 'StreamID']).agg(
    UserStreamerWatchCount=('WatchDuration', 'count'),
    UserStreamerAvgWatchDuration=('WatchDuration', 'mean')
).reset_index()
train = train.merge(user_features, on='UserID', how='left').merge(streamer_features, on='StreamID', how='left').merge(user_streamer_features, on=['UserID', 'StreamID'], how='left')

le_streamer = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
le_streamer.fit(train[['StreamerName']])
train['EncodedStreamerName'] = le_streamer.transform(train[['StreamerName']])

In [38]:
Xtrain = train.drop(columns=['WatchDuration', 'StreamerName'])  
ytrain = train['WatchDuration']

In [40]:
rf_model_with_direct_ids = RandomForestRegressor(n_estimators=100)
rf_model_with_direct_ids.fit(Xtrain, ytrain)

In [41]:
# test
user_features = test.groupby('UserID').agg(
    UserAvgWatchDuration=('WatchDuration', 'mean'),
    NumWatchStream=('StreamID', 'count'),
    VarWatchStream=('WatchDuration', 'var')
).reset_index()
streamer_features = test.groupby('StreamID').agg(
    StreamerAvgWatchDuration=('WatchDuration', 'mean'),
    StreamerUniqueViewer=('UserID', 'nunique')
).reset_index()
user_streamer_features = test.groupby(['UserID', 'StreamID']).agg(
    UserStreamerWatchCount=('WatchDuration', 'count'),
    UserStreamerAvgWatchDuration=('WatchDuration', 'mean')
).reset_index()
test = test.merge(user_features, on='UserID', how='left').merge(streamer_features, on='StreamID', how='left').merge(user_streamer_features, on=['UserID', 'StreamID'], how='left')

test['EncodedStreamerName'] = le_streamer.transform(test[['StreamerName']])

In [43]:
Xtest = test.drop(columns=['WatchDuration', 'StreamerName'])  
ytest = test['WatchDuration']

In [44]:
y_pred_with_direct_ids = rf_model_with_direct_ids.predict(Xtest)
rmse_with_direct_ids = np.sqrt(mean_squared_error(ytest, y_pred_with_direct_ids))
print(rmse_with_direct_ids)

0.06630896075815305
