In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import KNNImputer

In [2]:
# Load both raw Steam exports. low_memory=False turns off pandas' chunked parsing,
# which can otherwise infer mixed dtypes within a single column.
df_infobase = pd.read_csv("../../data/raw/info_base_games.csv", low_memory=False)
df_gamalytic = pd.read_csv("../../data/raw/gamalytic_steam_games.csv", low_memory=False)

# These two expressions are evaluated but show nothing: a cell only renders its
# last expression, and this cell ends with a print call.
df_infobase.dtypes
df_gamalytic.dtypes

# (rows, columns) of each source table, one per line.
print(df_infobase.shape, df_gamalytic.shape, sep="\n")

(99167, 10)
(93338, 6)


In [3]:
# Join key shared by both tables; the gamalytic export names it 'steamId'.
common_column = 'appid'
df_gamalytic = df_gamalytic.rename(columns={'steamId': common_column})

# Parse appid once and reuse the parsed values: calling astype(int) on the raw
# object column would re-parse the strings and raise on values such as "10.0"
# that pd.to_numeric accepts.
appid_numeric = pd.to_numeric(df_infobase[common_column], errors='coerce')
# Keep only rows whose appid is a whole number (NaN and inf fail the test), so
# the integer cast below never truncates or overflows.
appid_is_valid = appid_numeric.notna() & appid_numeric.mod(1).eq(0)
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the original (no SettingWithCopyWarning).
df_infobase = df_infobase.loc[appid_is_valid].copy()
# Positional assignment (to_numpy) avoids any index re-alignment surprises.
df_infobase[common_column] = appid_numeric[appid_is_valid].astype(int).to_numpy()
print(df_infobase.dtypes)
print(df_infobase.shape)
def merge_steam_games(x, y):
    """Inner-join two game tables on the shared key column.

    The key name is read from the notebook-level ``common_column`` variable.
    Rows whose key is absent from either table are not part of the result.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Left and right tables; both must contain ``common_column``.

    Returns
    -------
    pandas.DataFrame
        The inner join of ``x`` and ``y``.
    """
    return x.merge(y, how='inner', on=common_column)


# NOTE(review): this merge runs before the release_date / achievements_total
# preprocessing applied to df_infobase in later cells, so `df` keeps the raw
# versions of those columns unless it is rebuilt afterwards -- confirm intended.
df = merge_steam_games(df_infobase, df_gamalytic).copy()

appid                   int64
name                   object
metacritic             object
steam_achievements       bool
steam_trading_cards      bool
workshop_support         bool
genres                 object
achievements_total     object
release_date           object
supported_platforms    object
dtype: object
(99166, 10)


In [4]:
# Sanity-check the merged frame: print its shape, then render summary statistics
# of its numeric columns as the cell's output.
df  # evaluated but not rendered: only the last expression of a cell is displayed
print(df.shape)
df.describe()

(69428, 15)


Unnamed: 0,appid,price,copiesSold,reviewScore,aiContent
count,69428.0,69428.0,69428.0,69428.0,0.0
mean,1742130.0,7.940826,85849.33,71.903843,
std,927345.5,15.53368,1820168.0,28.682102,
min,10.0,0.0,0.0,0.0,
25%,962625.0,0.99,60.0,60.0,
50%,1612815.0,4.99,457.0,80.0,
75%,2581562.0,9.99,3955.0,94.0,
max,3515040.0,1900.0,302158000.0,100.0,


In [None]:
# Summary statistics for df_infobase. The commented-out lines are alternative
# quick inspections (first/last rows, dtypes, shape, info) kept for reference.
#df_infobase.head(10)
#df_infobase.tail(10)
#df_infobase.dtypes
#df_infobase.shape 
#df_infobase.info()
df_infobase.describe()


In [None]:
# Number of missing values in each df_infobase column.
df_infobase.isnull().sum()
# Same counts expressed as a percentage of all rows:
#print(((df_infobase.isnull().sum())/df_infobase.shape[0]) * 100)

In [None]:
# FOR preprocessing release date 
#df_infobase['release_date'].isnull().sum()
#df_infobase[df_infobase['release_date'].isnull()]
#df_infobase['release_date'] = pd.to_datetime(df_infobase['release_date'])
#df_infobase['release_date']
#df_infobase['release_date'] = df_infobase['release_date'].apply(lambda x: pd.to_datetime(x, infer_datetime_format=True).year)
# Formats seen in release_date: "To be announced", "Coming soon", yyyy, dd-Mon-yy, "Q1 yyyy", dd-Mon

In [None]:
# Baseline: how many release dates are missing before any normalisation.
print("Before Preprocessing: ", df_infobase['release_date'].isna().sum())

# Full month names, used to validate and canonicalise the month word of a date.
_MONTH_NAMES = (
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)

# First day of each quarter, used for "Q<n> YYYY" values.
_QUARTER_START = {
    'Q1': '01 Jan',
    'Q2': '01 Apr',
    'Q3': '01 Jul',
    'Q4': '01 Oct',
}

# NOTE: whitespace inside a pattern is literal unless re.VERBOSE is set, so the
# separators are spelled out explicitly with \s / [\s-].
_QUARTER_RE = re.compile(r'\b(Q[1-4])\s+(\d{4})\b')
# "21 Oct, 2008", "21 Oct 2008", "21-Oct-08"; \d{4} is tried before \d{2} so a
# four-digit year is never cut down to its first two digits.
_DAY_MONTH_YEAR_RE = re.compile(r'\b(\d{1,2})[\s-]+([A-Za-z]{3,9})\.?,?[\s-]+(\d{4}|\d{2})\b')
# "Oct 21, 2008"
_MONTH_DAY_YEAR_RE = re.compile(r'\b([A-Za-z]{3,9})\.?\s+(\d{1,2}),?\s+(\d{4})\b')
# "Oct 2008"
_MONTH_YEAR_RE = re.compile(r'\b([A-Za-z]{3,9})\.?,?\s+(\d{4})\b')
# A lone four-digit year anywhere in the text.
_YEAR_RE = re.compile(r'\b(\d{4})\b')


def _canonical_month(word):
    """Return the 3-letter abbreviation for a month word, or None if it is not a month."""
    lowered = word.lower()
    for name in _MONTH_NAMES:
        # Accepts "Oct", "Sept" and "October" alike, but rejects e.g. "Marble".
        if name.lower().startswith(lowered):
            return name[:3]
    return None


def _four_digit_year(digits):
    """Expand a 2-digit year with the POSIX strptime pivot (69-99 -> 19xx, 00-68 -> 20xx)."""
    if len(digits) == 4:
        return digits
    two_digit = int(digits)
    return str(1900 + two_digit) if two_digit >= 69 else str(2000 + two_digit)


def preprocess_release_date(x):
    """Normalise one raw ``release_date`` value to a ``"D Mon YYYY"`` string.

    Every recognised input is emitted in the same day / abbreviated-month /
    four-digit-year layout, so the downstream date parser sees a single format.
    Rules are tried in this order and the first match wins:

    * ``"Q1 2025"``        -> first day of the quarter, ``"01 Jan 2025"``
    * ``"21 Oct, 2008"``, ``"21 Oct 2008"``, ``"21-Oct-08"`` -> ``"21 Oct 2008"``
    * ``"Oct 21, 2008"``   -> ``"21 Oct 2008"``
    * ``"Oct 2008"``       -> ``"1 Oct 2008"``
    * a lone 4-digit year  -> ``"1 Jan YYYY"``

    Anything else -- ``"To be announced"``, ``"Coming soon"``, a day-month value
    without a year such as ``"21-Oct"``, empty strings, missing values -- gives
    ``np.nan``.

    Parameters
    ----------
    x : object
        Raw cell value; it is converted with ``str`` before matching, so NaN
        and None are handled like any other unrecognised text.

    Returns
    -------
    str or float
        The normalised date string, or ``np.nan`` when no date can be recovered.
    """
    text = str(x).strip()

    found = _QUARTER_RE.search(text)
    if found:
        quarter, year = found.groups()
        return f"{_QUARTER_START[quarter]} {year}"

    found = _DAY_MONTH_YEAR_RE.search(text)
    if found:
        day, word, year = found.groups()
        month = _canonical_month(word)
        if month:
            return f"{day} {month} {_four_digit_year(year)}"

    found = _MONTH_DAY_YEAR_RE.search(text)
    if found:
        word, day, year = found.groups()
        month = _canonical_month(word)
        if month:
            return f"{day} {month} {year}"

    found = _MONTH_YEAR_RE.search(text)
    if found:
        word, year = found.groups()
        month = _canonical_month(word)
        if month:
            return f"1 {month} {year}"

    found = _YEAR_RE.search(text)
    if found:
        # Use only the matched year, never the surrounding text.
        return f"1 Jan {found.group(1)}"

    return np.nan

# Normalise the column element-wise: every raw value goes through preprocess_release_date.
df_infobase['release_date'] = df_infobase['release_date'].map(preprocess_release_date)

#print("After Preprocessing: ",df_infobase['release_date'].isnull().sum())
#print(df_infobase.shape[0])
#df_infobase['release_date'].sample(100)


In [None]:
# Turn the normalised release_date strings into model features:
#   is_release_date_known  1 if a date could be parsed, else 0
#   year                   calendar year, 0 as the "unknown" sentinel
#   fraction_sin / _cos    cyclical encoding of the position within the year

# Unparseable values become NaT instead of raising.
df_infobase['release_date'] = pd.to_datetime(df_infobase['release_date'], errors='coerce')
df_infobase['is_release_date_known'] = df_infobase['release_date'].notna().astype(int)
df_infobase['year'] = df_infobase['release_date'].dt.year.fillna(0).astype(int)

# Divide by the real length of the year: with a fixed 365, 31 Dec of a leap year
# (day 366) would get fraction 1.0 and encode exactly like 1 Jan.
days_in_year = 365 + df_infobase['release_date'].dt.is_leap_year.astype(int)
# Unknown dates get fraction 0 (sin = 0, cos = 1); is_release_date_known tells
# them apart from a genuine 1 Jan.
fraction_of_year = np.where(
    df_infobase['is_release_date_known'].astype(bool),
    (df_infobase['release_date'].dt.dayofyear - 1) / days_in_year,
    0,
)
df_infobase['fraction_sin'] = np.sin(2 * np.pi * fraction_of_year)
df_infobase['fraction_cos'] = np.cos(2 * np.pi * fraction_of_year)

# The raw timestamp is fully represented by the derived columns above.
df_infobase = df_infobase.drop(columns='release_date')


In [None]:
# Display df_infobase after the release-date feature step.
# NOTE(review): the commented-out lines inspect 'release_date', which the
# preceding cell drops, so they would raise KeyError if re-enabled here.
#df_infobase['release_date'].isnull().sum()
#df_infobase['release_date'].describe().T
#df_infobase['release_date'].info()
#df_infobase['release_date'].head(1000)
df_infobase
#df_infobase.head(100)

There are multiple possible approaches for encoding the release date:

First Approach 
Use only the "year" column and ignore the day and month.

second approach 

Create new column for binary feature is_release_date_known
1 indicates a specific date or quarter is known.
0 indicates the entry is "to be announced" or "coming soon".
The main release-date feature is then `fractional_year = year + (day_of_year - 1) / days_in_year`.
```py
quarter_map = {
    'Q1': 0.125,
    'Q2': 0.375,
    'Q3': 0.625,
    'Q4': 0.875
}

df['release_date'] = pd.to_datetime(df['release_date'])

df['fractional_year'] = df['release_date'].dt.year + (df['release_date'].dt.dayofyear - 1) / (365 + df['release_date'].dt.is_leap_year.astype(int))

df['fractional_year'] = df['release_date'].apply(lambda x: int(x[:4]) + quarter_map[x[5:]] if isinstance(x, str) and x.startswith('Q') else x)
```

Third Approach 
guide : https://medium.com/%40paghadalsneh/handling-date-and-time-data-in-machine-learning-a-comprehensive-guide-5d30141cbfec 

splitting date into 3 features (day,month,year)

Fourth Approach 
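Cyclical encoding: map the fraction of the year onto `sin(2π · fraction)` and `cos(2π · fraction)` so that late December and early January end up close together (this is what the `fraction_sin` / `fraction_cos` columns hold).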



In [None]:
# achievements_total: coerce to a number; missing or unparseable counts become 0.
# (A KNN-based imputation was considered as an alternative; the zero fill is what is used.)
df_infobase['achievements_total'] = (
    pd.to_numeric(df_infobase['achievements_total'], errors='coerce')
    .fillna(0)
    .astype(int)
)

# Evaluated but not rendered, since it is not the cell's last expression.
df_infobase['achievements_total'].isnull().sum()

# Re-derive the flag from the cleaned count: 1 when the count is non-zero, else 0.
df_infobase['steam_achievements'] = df_infobase['achievements_total'].ne(0).astype(int)

df_infobase