In [2]:
import pandas as pd
import numpy as np

#For plotting
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from datetime import datetime

# Import the SimpleExpSmoothing object
from statsmodels.tsa.api import SimpleExpSmoothing
# Import the ARIMA object
from statsmodels.tsa.arima.model import ARIMA

import random

In [3]:
# Load the SteamCharts time-series table.
timedata = pd.read_csv('SteamCharts.csv', low_memory=False)

# Treat infinities (the float value and the literal string '+Inf') as missing.
for inf_marker in (np.inf, '+Inf'):
    timedata = timedata.replace({inf_marker: np.nan})

In [4]:
# Keep only the rows flagged as average-player series.
avg_players = timedata.loc[timedata['Avg_Players'] == 1].reset_index(drop=True)

# Strip the descriptive columns so that only the monthly values remain, as floats.
meta_columns = ['App_id', 'Name', 'Avg_Players', 'Gain', 'Perc_Gain', 'Peak_Players',
                'Last 30 Days']
avg_pure = avg_players.drop(columns=meta_columns).astype(float)

# Largest monthly value of each game.
avg_players['max'] = avg_pure.max(axis=1)

# Discard games whose maximum is non-positive or missing
# (NaN > 0 evaluates to False, so a single mask covers both cases).
has_players = avg_players['max'] > 0
avg_pure = avg_pure.loc[has_players].reset_index(drop=True)
avg_players = avg_players.loc[has_players].reset_index(drop=True)

# 'birth' is the label of the last non-NaN column of each row.
birth = [row.last_valid_index() for _, row in avg_pure.iterrows()]
avg_players['birth'] = birth

Select games whose maximum monthly average player count is at least 10 and whose birth month lies strictly between July 2012 and March 2021 (births in July 2012, March 2021 and April 2021 are excluded). There are 6471 such games.

In [5]:
# Keep games whose maximum is at least 10 and whose birth month is none of the
# excluded labels; the same row mask is applied to both frames so they stay aligned.
excluded_births = ['July 2012', 'March 2021', 'April 2021']
selected = (avg_players['max'] >= 10) & ~avg_players['birth'].isin(excluded_births)

avg1 = avg_players.loc[selected].reset_index(drop=True)
avg_pure1 = avg_pure.loc[selected].reset_index(drop=True)

Select games with no NaN values between their first and last recorded months.

In [6]:
# Collect the row positions of games that have a NaN somewhere between their
# first and last valid columns.
incomplete_data = []
for pos in range(len(avg1)):
    series = avg_pure1.iloc[pos]
    lifetime = series[series.first_valid_index():series.last_valid_index()]
    if lifetime.isna().any():
        incomplete_data.append(pos)
len(incomplete_data)

768

In [7]:
# Remove the flagged rows from both frames and renumber the remaining rows.
avg2, avg_pure2 = (
    frame.drop(index=incomplete_data).reset_index(drop=True)
    for frame in (avg1, avg_pure1)
)

Let's focus on predicting a 6-month horizon given the first 12 months of data of each game we want to predict.

Note that a different horizon or a different amount of given data may change the accuracy, or the best smoothing level/metric for prediction.

In [8]:
# Drop games whose span from first to last valid column covers fewer than 18 months.
X = avg_pure2.copy()
list1 = []
for pos in range(len(X)):
    row = X.iloc[pos]
    span = row[row.first_valid_index():row.last_valid_index()]
    if len(span) < 18:
        list1.append(pos)
X = X.drop(index=list1).reset_index(drop=True)

In [9]:
# Train Test Split: hold out a quarter of the games for testing.
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without altering the global `random` state.
split_rng = random.Random(440)
test_index = split_rng.sample(range(len(X)), int(len(X)/4))
X_test = X.iloc[test_index]

# Every row that was not held out goes to training, in its original order.
# Set membership replaces repeated list.remove calls, which were quadratic.
held_out = set(test_index)
train_index = [pos for pos in range(len(X)) if pos not in held_out]
X_train = X.iloc[train_index].reset_index(drop = True)

In [10]:
# Display the training frame (one row per game, one column per month).
X_train

Unnamed: 0,April 2021,March 2021,February 2021,January 2021,December 2020,November 2020,October 2020,September 2020,August 2020,July 2020,...,April 2013,March 2013,February 2013,January 2013,December 2012,November 2012,October 2012,September 2012,August 2012,July 2012
0,186110.65,193114.18,198957.52,201247.19,189233.58,179520.26,162585.64,169093.71,192492.61,211700.30,...,,,,,,,,,,
1,45629.18,53350.61,59394.05,44864.92,67000.69,99899.97,50809.40,53312.41,64263.74,68439.75,...,,,,,,,,,,
2,77590.25,86369.82,117742.27,142117.25,61171.65,55408.03,48550.80,46430.53,53287.65,58238.72,...,,,,,,,,,,
3,46416.38,42951.59,39209.96,36124.14,35990.05,39914.02,41298.79,56330.88,59629.02,39385.15,...,12329.65,1960.13,0.0,0.0,,,,,,
4,36127.13,38513.38,41441.43,44647.85,40570.33,36057.62,32493.44,33944.49,35429.43,33497.40,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3212,6.39,12.99,15.77,9.66,9.03,9.70,7.80,12.50,20.95,15.24,...,,,,,,,,,,
3213,0.80,1.06,1.54,1.24,1.16,0.84,1.37,0.76,1.74,2.15,...,,,,,,,,,,
3214,3.33,4.01,2.72,2.30,2.45,2.91,2.92,3.05,2.79,2.31,...,,,,,,,,,,
3215,3.57,4.25,4.73,5.31,5.30,5.15,4.75,4.54,5.91,5.87,...,,,,,,,,,,


With the data prepared, we write the pieces of our model one by one.

In [11]:
#Exponential smoothing:

def smooth_values(data, i_game, smoothing_level=.2):
    """Return the exponentially smoothed curve of one game.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per game; the column labels must be parseable by
        ``pd.to_datetime``.
    i_game : int
        Positional index of the row to smooth.
    smoothing_level : float, default 0.2
        Fixed smoothing level handed to the fit (no optimisation is run).

    Returns
    -------
    The ``fittedvalues`` of the simple exponential smoothing fit, in
    increasing time order, covering the span from the first to the last
    non-NaN observation of the row.
    """
    row = data.iloc[i_game]

    # Put the observations in chronological order using the parsed column labels.
    chronological = pd.DataFrame({
        'Month': pd.to_datetime(row.index),
        'Data': row.astype(float),
    }).sort_values(by=['Month'])

    # Trim the leading and trailing NaNs (label slice, both ends inclusive).
    values = chronological['Data']
    observed = values[values.first_valid_index():values.last_valid_index()]

    # Fit exponential smoothing with the requested, fixed smoothing level.
    model = SimpleExpSmoothing(observed.values)
    return model.fit(smoothing_level=smoothing_level, optimized=False).fittedvalues

Define the metric.

In [12]:
def rootmse(a, b):
    """Return the l2 (Euclidean) norm of ``a - b``.

    Despite the name, no mean is taken: the result is the square root of the
    *sum* of squared differences. ``b`` may be a scalar such as ``0``, in
    which case the result is the l2 norm of ``a``.
    """
    residual = a - b
    return np.sqrt(np.sum(residual**2))

def Gauss_weight(a, b, epsilon = 20):
    """Gaussian similarity weight ``exp(-epsilon * rootmse(a, b)**2)``.

    The weight equals 1 when ``a`` and ``b`` coincide and decays towards 0 as
    their l2 distance grows; a larger ``epsilon`` makes the decay faster.
    """
    distance = rootmse(a, b)
    return np.exp(-epsilon * distance**2)

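To compare the shapes of two curves $a$ and $b$ independently of their magnitudes, we compute the scalar $c$ that brings $a$ as close as possible to $b$ by scaling alone, i.e. the minimiser of $\lVert c\,a - b \rVert_2$, which is $c = \langle a, b \rangle / \langle a, a \rangle$.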

In [13]:
def mini_scaler(a,b):
    """Return the scalar ``c`` that minimises the l2 distance between ``c*a`` and ``b``.

    This is the least-squares solution ``sum(a*b) / sum(a**2)``. No guard is
    applied for an all-zero ``a``; callers are expected to check that first.
    """
    cross_term = np.sum(a*b)
    squared_norm = np.sum(a**2)
    return cross_term/squared_norm

With the weight function and the minimizing scalar defined above, we define the weighted-average function that acts as our prediction.

In [14]:
def wt_avg(game, data, metric = Gauss_weight, epsilon = 20, threshold=0.4, horizon = 6):
    """Predict the continuation of ``game`` from similarly shaped curves in ``data``.

    Parameters
    ----------
    game : array
        Known values of the curve to extend.
    data : indexable collection of arrays
        Reference curves; only those with at least ``len(game) + horizon``
        points are considered.
    metric : callable, default ``Gauss_weight``
        Called as ``metric(a, b, epsilon=epsilon)``; returns the similarity weight.
    epsilon : number, default 20
        Forwarded to ``metric``.
    threshold : float, default 0.4
        Minimum weight a reference curve needs in order to contribute.
    horizon : int, default 6
        Number of points to predict beyond ``len(game)``.

    Returns
    -------
    pred : numpy.ndarray of length ``len(game) + horizon``
        Weighted sum of the rescaled contributing curves, itself rescaled so
        that its first ``len(game)`` points best match ``game``. It stays all
        zeros when no curve contributes.
    close_index : numpy.ndarray of shape ``(k, 2)``
        Rows ``[position in data, weight]`` of the contributing curves, in
        order of decreasing weight.
    """
    n_known = len(game)
    n_total = n_known + horizon

    # Bring the query to unit l2 norm unless it is identically zero.
    if np.max(np.abs(game)) != 0:
        game_scaled = game / rootmse(game, 0)
    else:
        game_scaled = game

    pred = np.zeros(n_total)
    neighbours = []  # [position, weight] of every curve that contributed
    for pos in range(len(data)):
        curve = data[pos]
        if len(curve) < n_total:
            continue  # too short to supply the horizon

        # Rescale the whole reference curve so that its first n_known points
        # are as close as possible to the normalised query.
        head = curve[:n_known]
        if np.max(np.abs(head)) != 0:
            curve_scaled = curve * mini_scaler(head, game_scaled)
        else:
            curve_scaled = curve

        weight = metric(game_scaled, curve_scaled[:n_known], epsilon = epsilon)
        if weight >= threshold:
            pred = pred + weight * curve_scaled[:n_total]
            neighbours.append([pos, weight])

    # Map the accumulated curve back to the magnitude of the original query.
    if np.max(np.abs(pred[:n_known])) != 0:
        pred = pred * mini_scaler(pred[:n_known], game)

    # Sort the contributors by weight, largest first.
    close_index = np.array(neighbours, dtype=float).reshape(-1, 2)
    close_index = close_index[np.argsort(close_index[:, 1])][::-1]
    return pred, close_index

Now we define the error measure we want to minimise. Since the calculations above use the l2 norm (our `rootmse` function), we use it here as well.

In order to calculate average 'percentage error' over the test set, we normalize the error by the l2 norm of the test data. Equivalently, we are normalizing the l2 norm of the predicted values and the test data before computing the error.

Here percentage error refers to the error relative to the l2 norm of test data

In [15]:
def Get_error(train, test, threshold = 0.4, epsilon = 20, n_known = 12, horizon = 6):
    """Mean relative l2 forecast error of ``wt_avg`` over a set of test curves.

    For each test curve the first ``n_known`` points are given to ``wt_avg``
    and the following ``horizon`` points are compared with the prediction.
    The l2 norm of the prediction error is divided by the l2 norm of the true
    values, so that each curve contributes a relative ("percentage") error.

    Parameters
    ----------
    train : indexable collection of arrays
        Reference curves handed to ``wt_avg``.
    test : indexable collection of arrays
        Curves to score; each needs at least ``n_known + horizon`` points.
    threshold, epsilon :
        Forwarded to ``wt_avg``.
    n_known : int, default 12
        Number of leading points treated as known.
    horizon : int, default 6
        Number of points predicted and scored.

    Returns
    -------
    float
        Average relative error over the scored curves. Curves whose true
        values over the horizon are all zero are skipped, because their
        relative error is undefined; ``nan`` is returned when no curve could
        be scored.
    """
    n_total = n_known + horizon
    total_error = 0.0
    n_scored = 0
    for i in range(len(test)):
        target = test[i][n_known:n_total]
        # Normalise by the l2 norm itself, not by its square.
        target_norm = rootmse(target, 0)
        if target_norm == 0:
            # 0/0 would turn the whole average into NaN.
            continue
        pred, _ = wt_avg(test[i][:n_known], train, threshold = threshold,
                         epsilon = epsilon, horizon = horizon)
        total_error = total_error + rootmse(pred[n_known:], target) / target_norm
        n_scored = n_scored + 1
    if n_scored == 0:
        return np.nan
    return total_error / n_scored

Cross Validation to get the best parameters for the model.

In [16]:
# CV 5 fold split
# NOTE(review): this import lives outside the notebook's top import cell.
from sklearn.model_selection import KFold
# Shuffled 5-fold splitter with a fixed random_state; Get_best_eps_thres reads
# this module-level object.
kfold = KFold(n_splits = 5, shuffle = True, random_state = 440)

In [17]:
def Get_best_eps_thres(data, epsilons = None, thresholds = None):
    """Grid-search ``epsilon`` and ``threshold`` by k-fold cross-validation.

    Every ``(epsilon, threshold)`` pair of the grid is scored by summing
    ``Get_error`` over the folds of the module-level ``kfold`` splitter; the
    pair with the smallest total is returned.

    Parameters
    ----------
    data : list of arrays
        Smoothed curves to cross-validate on.
    epsilons : iterable of numbers, optional
        Candidate ``epsilon`` values; defaults to ``np.arange(5, 25, 5)``.
    thresholds : iterable of floats, optional
        Candidate ``threshold`` values; defaults to ``np.arange(0.1, 0.55, 0.1)``.

    Returns
    -------
    best_eps, best_thres, error
        The winning parameters and their mean error per fold. When no pair
        yields a comparable (non-NaN) error, the parameters are ``nan`` and
        the error is ``inf``.
    """
    if epsilons is None:
        epsilons = np.arange(5,25,5)
    if thresholds is None:
        thresholds = np.arange(0.1,0.55,0.1)

    n_folds = kfold.get_n_splits()
    # The folds do not depend on the grid point, so build them only once.
    folds = [
        ([data[index] for index in train_index], [data[index] for index in test_index])
        for train_index, test_index in kfold.split(data)
    ]

    # Start from +inf so that the first valid candidate is always accepted.
    best_total = np.inf
    best_eps = np.nan
    best_thres = np.nan
    for epsilon in epsilons:
        for threshold in thresholds:
            fold_total = 0
            for fold_train, fold_test in folds:
                fold_total = fold_total + Get_error(fold_train, fold_test,
                                                    threshold = threshold, epsilon = epsilon)
            # Compare inside the threshold loop so that every pair is considered,
            # not only the last threshold of each epsilon.
            if fold_total < best_total:
                best_total = fold_total
                best_eps = epsilon
                best_thres = threshold
    return best_eps, best_thres, best_total / n_folds

With all pieces defined, we write our main function to compute average CV error and find the best parameters for our model.

In [None]:
# Smoothing levels to compare.
alphas = [0.2, 0.4, 0.6, 0.8]

# One row per smoothing level: [alpha, best_eps, best_thres, error].
ult_result = np.zeros((len(alphas),4))

# smooth_values does not modify its input, so a single copy is enough.
X1 = X_train.copy()

for i, alpha in enumerate(alphas):

    # getting the smoothed curve for each game
    # Ordered in increasing time order.
    smooth_timedata = []
    broken_data=[]
    # The inner loop uses its own index so the result row `i` is not overwritten.
    for i_game in range(len(X1)):
        try:
            temp = smooth_values(X1,i_game, smoothing_level=alpha)
            if np.isnan(temp).any():
                broken_data.append(i_game)
            else:
                smooth_timedata.append(temp)
        except Exception:
            # Best effort: a game that cannot be smoothed is recorded and skipped.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt through.
            broken_data.append(i_game)

    [best_eps, best_thres, error] = Get_best_eps_thres(smooth_timedata)
    ult_result[i]=[alpha, best_eps, best_thres, error]

