# Data Preparation

In this notebook, we clean our time series data and get it ready for analysis.

In [10]:
import pandas as pd
import numpy as np
import random

In [3]:
# Load the raw SteamCharts table, then map infinities to NaN.
# Both the float value `inf` and the literal string '+Inf' are replaced.
timedata = (
    pd.read_csv('SteamCharts.csv', low_memory=False)
    .replace({np.inf: np.nan})
    .replace({'+Inf': np.nan})
)

In [4]:
# Keep only the rows whose 'Avg_Players' indicator equals 1.
avg_players = timedata.loc[timedata['Avg_Players'] == 1].reset_index(drop=True)

# Everything except these metadata/indicator columns is treated as the
# numeric time series and cast to float.
non_series_columns = ['App_id', 'Name', 'Avg_Players', 'Gain', 'Perc_Gain',
                      'Peak_Players', 'Last 30 Days']
avg_pure = avg_players.drop(columns=non_series_columns).astype(float)

# Row-wise maximum of each series (pandas skips NaN by default).
avg_players['max'] = avg_pure.max(axis=1)

# Keep rows whose maximum is strictly positive. A NaN maximum compares False,
# so rows without any valid value are removed by the same mask.
has_positive_max = avg_players['max'] > 0
avg_pure = avg_pure.loc[has_positive_max].reset_index(drop=True)
avg_players = avg_players.loc[has_positive_max].reset_index(drop=True)

# For each row, record the label of its last non-NaN column.
birth = [series.last_valid_index() for _, series in avg_pure.iterrows()]
avg_players['birth'] = birth

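Select games that satisfy all of the following:
- the maximum of avg_players is at least 10,
- the release month lies strictly between July 2012 and March 2021 (games whose first recorded month is July 2012, March 2021 or April 2021 are excluded), and
- there are no 'NaN' values in the middle of the time series.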

In [5]:
# Select games with max of avg_players >= 10 whose birth month is none of
# 'July 2012', 'March 2021' or 'April 2021'.
#
# A single boolean mask is built on `avg_players` and applied to both frames.
# `avg_players` and `avg_pure` share the same 0..n-1 index, so the mask lines
# up with each of them directly instead of relying on implicit re-alignment
# against an already-filtered frame.
excluded_birth_months = ['July 2012', 'March 2021', 'April 2021']
selected = (avg_players['max'] >= 10) & ~avg_players['birth'].isin(excluded_birth_months)

avg1 = avg_players.loc[selected].reset_index(drop=True)
avg_pure1 = avg_pure.loc[selected].reset_index(drop=True)


# Select games with no NaN data in between
def _has_gap(series):
    """Return True if `series` has a NaN between its first and last valid entries."""
    observed = series[series.first_valid_index():series.last_valid_index()]
    return observed.isna().any()


# Row positions equal index labels here because both frames were just
# re-indexed, so the positions can be passed straight to `drop`.
incomplete_data = [i for i in range(len(avg_pure1)) if _has_gap(avg_pure1.iloc[i])]

avg2 = avg1.drop(incomplete_data).reset_index(drop=True)
avg_pure2 = avg_pure1.drop(incomplete_data).reset_index(drop=True)

In [6]:
# Number of games left after the three selection filters.
avg2.shape[0]

5703

There are 5703 such games, and we save them into the following csv files.

In [7]:
# Write the cleaned frames to CSV (the row index is written as well).
avg2.to_csv(path_or_buf='Average_PLayers_Cleaned.csv')
avg_pure2.to_csv(path_or_buf='Average_PLayers_Pure_Cleaned.csv')

To predict a **6**-month horizon from **12** months of data, we select games with at least 18 months of data.

In [7]:
# Drop games with fewer than 18 months of data
data = avg2.copy()
data_pure = avg_pure2.copy()


def _span_length(series):
    """Count the entries from the first to the last valid value, inclusive."""
    return len(series[series.first_valid_index():series.last_valid_index()])


# Positions of the rows whose span is too short.
list1 = [i for i in range(len(data)) if _span_length(data_pure.iloc[i]) < 18]

data = data.drop(index=list1).reset_index(drop=True)
data_pure = data_pure.drop(index=list1).reset_index(drop=True)

In [13]:
# Write the 18-month subset to CSV (the row index is written as well).
data.to_csv(path_or_buf='data18m.csv')
data_pure.to_csv(path_or_buf='data18m_pure.csv')

In [8]:
# Report how many games remain after the 18-month filter.
print(f'There are in total {len(data)} games with 18-month data.')

There are in total 4289 games with 18-month data.


### Train Test Split

In [11]:
# Train test split: hold out a quarter of the games as the test set.
#
# The sample is drawn from a seeded, local generator so that re-running the
# notebook reproduces the same split. NOTE: a seeded split will generally
# differ from any split produced earlier by the unseeded version of this cell.
SPLIT_SEED = 42
test_index = random.Random(SPLIT_SEED).sample(range(len(data)), len(data) // 4)

# `test_index` holds row positions, so select with `iloc` ...
test = data.iloc[test_index]
test_pure = data_pure.iloc[test_index]

# ... and translate positions to index labels before dropping, so the split
# stays correct even if the frames do not carry a 0..n-1 index.
train = data.drop(index=data.index[test_index])
train_pure = data_pure.drop(index=data_pure.index[test_index])

# Sanity check: train and test must partition the data.
assert len(train) + len(test) == len(data)
assert len(train_pure) + len(test_pure) == len(data_pure)

In [12]:
# Save the train/test split so that the basic methods can be evaluated on
# exactly the same games for the accuracy comparison.
train.to_csv(path_or_buf='Training_data.csv')
train_pure.to_csv(path_or_buf='Training_data_pure.csv')
test.to_csv(path_or_buf='Test_data.csv')
test_pure.to_csv(path_or_buf='Test_data_pure.csv')