In [1052]:
import os
import collections
import time 
import tqdm
from functools import partial
train_on_gpu = True

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import preprocessing
from torch.autograd import Variable

import torch
from torch.utils.data import TensorDataset, DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR

import segmentation_models_pytorch as smp

In [1053]:
path = 'files'
os.listdir(path)

['samplesubmission.csv', 'test.csv', 'train.csv']

In [1054]:
train = pd.read_csv(f'{path}/train.csv')
test = pd.read_csv(f'{path}/test.csv')
sub = pd.read_csv(f'{path}/samplesubmission.csv')
train.head()

Unnamed: 0,id,playtime_forever,is_free,price,genres,categories,tags,purchase_date,release_date,total_positive_reviews,total_negative_reviews
0,0,0.0,False,3700.0,"Adventure,Casual,Indie","Single-player,Steam Trading Cards,Steam Cloud","Indie,Adventure,Story Rich,Casual,Atmospheric,...","Jul 2, 2018","10 Dec, 2013",372.0,96.0
1,1,0.016667,True,0.0,RPG,"Single-player,Partial Controller Support","Mod,Utilities,RPG,Game Development,Singleplaye...","Nov 26, 2016","12 Aug, 2015",23.0,0.0
2,2,0.0,False,5000.0,"Adventure,Casual,Indie","Single-player,Full controller support,Steam Tr...","Point & Click,Adventure,Story Rich,Comedy,Indi...","Jul 2, 2018","28 Jan, 2014",3018.0,663.0
3,3,1.533333,False,9900.0,"Action,RPG","Single-player,Multi-player,Steam Achievements,...","Medieval,RPG,Open World,Strategy,Sandbox,Actio...","Nov 28, 2016","31 Mar, 2010",63078.0,1746.0
4,4,22.333333,False,4800.0,"Action,Indie,Strategy","Single-player,Co-op,Steam Achievements,Full co...","Tower Defense,Co-op,Action,Strategy,Online Co-...","Mar 4, 2018","30 Jul, 2012",8841.0,523.0


In [1055]:
n_train = len(train)
n_test = len(test)
print(f'There are {n_train} records in train dataset')
print(f'There are {n_test} records in test dataset')

There are 357 records in train dataset
There are 90 records in test dataset


# Preprocessing Data (Apply to both Train and Test)

In [1056]:
def divide_col_to_dict(l):
    d = {}
    for s in l:
        vl = s.split(',')
        for v in vl:
            if v in d.keys():
                d[v] += 1
            else:
                d[v] = 1
    return d
category_d = divide_col_to_dict(train.categories)
genre_d = divide_col_to_dict(train.genres)

In [1057]:
mlb = MultiLabelBinarizer()

# Fit-Transform Train
s = train.pop('categories').str.split(',')
s_test = test.pop('categories').str.split(',')

df1 = pd.DataFrame(mlb.fit_transform(s),columns=mlb.classes_).add_prefix('Category-')
df1_test = pd.DataFrame(mlb.transform(s_test),columns=mlb.classes_).add_prefix('Category-')

train = train.join(df1)
test = test.join(df1_test)


s = train.pop('genres').str.split(',')
s_test = test.pop('genres').str.split(',')

df2 = pd.DataFrame(mlb.fit_transform(s),columns=mlb.classes_).add_prefix('Genre-')
df2_test = pd.DataFrame(mlb.transform(s_test),columns=mlb.classes_).add_prefix('Genre-')

train = train.join(df2)
test = test.join(df2_test)

s = train.pop('tags').str.split(',')
s_test = test.pop('tags').str.split(',')

df3 = pd.DataFrame(mlb.fit_transform(s_test),columns=mlb.classes_).add_prefix('Tag-')
df3_test = pd.DataFrame(mlb.transform(s_test),columns=mlb.classes_).add_prefix('Tag-')

train = train.join(df3)
test = test.join(df3_test)

# # Transform Test
# s = test.pop('categories').str.split(',')
# df1 = pd.DataFrame(mlb.transform(s),columns=mlb.classes_).add_prefix('Category-')
# test = test.join(df1)
# s = test.pop('genres').str.split(',')
# df2 = pd.DataFrame(mlb.transform(s),columns=mlb.classes_).add_prefix('Genre-')
# test = test.join(df2)
# s = test.pop('tags').str.split(',')
# df3 = pd.DataFrame(mlb.transform(s),columns=mlb.classes_).add_prefix('Tag-')
# test = test.join(df3)

In [1058]:
def preprocess(df_x):
    
    # Remove ID - Not Useful
    df_x.pop('id')
    # Change is_free to numeric
    df_x.is_free = df_x.is_free.astype(int)
    
    # Replace NaNs with default values for each Column
    df_x['purchase_date'] = df_x['purchase_date'].fillna('Jan 1, 2019')
    df_x['release_date'] = df_x['release_date'].fillna('Jan 1, 2018')
    df_x['total_positive_reviews'] = df_x['total_positive_reviews'].fillna('0')
    df_x['total_negative_reviews'] = df_x['total_negative_reviews'].fillna('0')
    
    df_x['purchase_date'] = df_x['purchase_date'].str[-2:].astype(int)
    df_x['release_date'] = df_x['release_date'].str[-2:].astype(int)
    df_x['wait_till_purchase'] = df_x['purchase_date'] - df_x['release_date']
    df_x['far_from_2019'] = 19 - df_x['purchase_date']
    df_x['review_diff'] = df_x['total_positive_reviews'].astype(int) - df_x['total_negative_reviews'].astype(int)
    
    df_x = df_x.fillna(0)
    
    df_x.drop(columns=['purchase_date', 'release_date'])
    return df_x

In [1059]:
pp_train = preprocess(train)
pp_test = preprocess(test)

In [1060]:
x_without_y = pp_train.loc[:, pp_train.columns != 'playtime_forever']
x = x_without_y.values # Returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
train_normalized = pd.DataFrame(x_scaled, columns=x_without_y.columns)

x_test = pp_test.values
x_test_scaled = min_max_scaler.transform(x_test) # Do NOT "FIT"
test_normalized = pd.DataFrame(x_test_scaled, columns=pp_test.columns)

In [1061]:
y_scaler = preprocessing.MinMaxScaler()
y = pp_train.loc[:, pp_train.columns == 'playtime_forever']
y_scaled = y_scaler.fit_transform(y)
y_normalized = pd.DataFrame(y_scaled, columns=y.columns)


In [1062]:
train_normalized = pd.concat([y_normalized, train_normalized], axis=1)

In [1063]:
train_normalized

Unnamed: 0,playtime_forever,is_free,price,purchase_date,release_date,total_positive_reviews,total_negative_reviews,Category-Captions available,Category-Co-op,Category-Commentary available,...,Tag-War,Tag-Warhammer 40K,Tag-Western,Tag-World War I,Tag-World War II,Tag-Zombies,Tag-eSports,wait_till_purchase,far_from_2019,review_diff
0,0.000000,0.0,0.000231,0.75,0.538462,0.000844,0.000220,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500000,0.25,0.004189
1,0.000146,1.0,0.000000,0.25,0.692308,0.000052,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214286,0.75,0.003370
2,0.000000,0.0,0.000313,0.75,0.615385,0.006845,0.001520,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428571,0.25,0.010920
3,0.013474,0.0,0.000619,0.25,0.307692,0.143066,0.004004,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429,0.75,0.201853
4,0.196251,0.0,0.000300,0.75,0.461538,0.020052,0.001199,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429,0.25,0.030225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,0.006883,0.0,0.000550,0.50,0.769231,0.000340,0.000209,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214286,0.50,0.003487
353,0.000000,0.0,0.000425,0.75,0.692308,0.043112,0.011120,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.357143,0.25,0.049134
354,0.000000,0.0,0.000519,0.75,0.692308,0.011565,0.003942,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.357143,0.25,0.014238
355,0.000000,0.0,0.000425,0.50,0.769231,0.001628,0.000365,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214286,0.50,0.005105


In [1064]:
device = "cuda" if torch.cuda.is_available() else "cpu"

class customData(Dataset):
    def __init__(self, data):
            self.data = torch.FloatTensor(data.values.astype('float'))

    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
            target = self.data[index][0]
            data_val = self.data[index] [1:]
            return data_val,target
train_dataset = customData(train_normalized)
# test_dataset = customData(test_normalized)


train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=Test_Batch_Size, shuffle=False, **kwargs)

## Training

In [1073]:
y_raw = train_normalized.pop('playtime_forever')
y = Variable(torch.tensor(y.values))
x = Variable(torch.tensor(train_normalized.values))
x_sub = Variable(torch.tensor(test_normalized.values))

In [1065]:
len(train_normalized)

357

In [1066]:
len(test_normalized)

90

In [1067]:
train_normalized.shape

(357, 288)

In [1071]:
n_input = 287

model = nn.Sequential(nn.Linear(n_input, 512),
                      nn.ReLU(),
                      nn.Linear(512, 256),
                      nn.ReLU(),
                      nn.Linear(256, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 16),
                      nn.ReLU(),
                      nn.Linear(16, 8),
                      nn.ReLU(),
                      nn.Linear(8, 1),
                      nn.ReLU())
# Define the loss
criterion = nn.MSELoss()
# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.SGD(model.parameters(), lr=0.003)
epochs = 1000
for e in range(epochs):
    running_loss = 0
    for data, labels in train_loader:
#         print(data.shape)
        data = data.reshape(n_input,)
        
        # Training pass
        optimizer.zero_grad()
        
        output = model(data)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        
#         print(labels)
#         print(loss.item())
#         print(loss)
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss/len(train_loader)}")

Training loss: 0.010250518253436671
Training loss: 0.009750240263772796
Training loss: 0.009741475006796846
Training loss: 0.009740915809117331
Training loss: 0.00974081331424289
Training loss: 0.0097407444445972
Training loss: 0.009740679778236004
Training loss: 0.009740615258497218
Training loss: 0.009740550815191214
Training loss: 0.009740486591317262
Training loss: 0.009740423219107864
Training loss: 0.009740360826236954
Training loss: 0.009740299351332348
Training loss: 0.00974023749115135
Training loss: 0.009740175450063084
Training loss: 0.009740114284507682
Training loss: 0.009740052903108701
Training loss: 0.009739992430842406
Training loss: 0.009739932489410309
Training loss: 0.009739871527524166
Training loss: 0.009739810902290027
Training loss: 0.009739750677753312
Training loss: 0.009739691295199982
Training loss: 0.00973963336927842
Training loss: 0.009739574562934481
Training loss: 0.009739515850855382
Training loss: 0.009739456789782469
Training loss: 0.0097393981854727

Training loss: 0.009728754672257927
Training loss: 0.009728701934372858
Training loss: 0.009728649482650101
Training loss: 0.009728598206099037
Training loss: 0.009728545011331748
Training loss: 0.00972849227172605
Training loss: 0.00972844067481077
Training loss: 0.009728386747732885
Training loss: 0.009728334856843556
Training loss: 0.009728282202702998
Training loss: 0.00972822887166091
Training loss: 0.009728177093558672
Training loss: 0.009728123035994978
Training loss: 0.00972807029758842
Training loss: 0.009728017680976728
Training loss: 0.009727963774813685
Training loss: 0.009727910534770269
Training loss: 0.009727858157786876
Training loss: 0.009727803329219579
Training loss: 0.009727749943095972
Training loss: 0.009727695774452589
Training loss: 0.009727641905844625
Training loss: 0.009727588168222572
Training loss: 0.00972753349266066
Training loss: 0.009727478805263128
Training loss: 0.009727425141812527
Training loss: 0.009727370014416281
Training loss: 0.0097273152350157

Training loss: 0.009712701846884842
Training loss: 0.009712602155409998
Training loss: 0.009712496380876352
Training loss: 0.009712393326229996
Training loss: 0.009712292718820072
Training loss: 0.009712185268707606
Training loss: 0.009712082344237856
Training loss: 0.009711973524711353
Training loss: 0.00971187320262574
Training loss: 0.009711763050428148
Training loss: 0.009711658118533688
Training loss: 0.009711545911503605
Training loss: 0.009711442861610005
Training loss: 0.009711331990361538
Training loss: 0.009711226215018832
Training loss: 0.009711114482906019
Training loss: 0.009711002660872213
Training loss: 0.009710899969193347
Training loss: 0.009710785475437892
Training loss: 0.009710676611625644
Training loss: 0.009710565240991302
Training loss: 0.009710449868865687
Training loss: 0.009710342436685638
Training loss: 0.009710227143785043
Training loss: 0.009710114620087832
Training loss: 0.009709996434714327
Training loss: 0.009709889116772558
Training loss: 0.009709768678

Training loss: 0.009669167064060256
Training loss: 0.009668812676211948
Training loss: 0.009668446958230038
Training loss: 0.009668089978465043
Training loss: 0.009667723515967144
Training loss: 0.009667354126478381
Training loss: 0.009666987837756222
Training loss: 0.009666615450994287
Training loss: 0.009666244762946884
Training loss: 0.009665861972940771
Training loss: 0.009665493181174793
Training loss: 0.009665102877277629
Training loss: 0.009664726618948074
Training loss: 0.009664336062570465
Training loss: 0.009663948855484735
Training loss: 0.009663559041280379
Training loss: 0.009663159287608058
Training loss: 0.009662769979439715
Training loss: 0.009662362009035122
Training loss: 0.009661965450846553
Training loss: 0.00966155562267148
Training loss: 0.009661151683515305
Training loss: 0.009660733903880418
Training loss: 0.009660327469133286
Training loss: 0.009659901531569365
Training loss: 0.00965948950466266
Training loss: 0.009659059678743448
Training loss: 0.0096586399182

Training loss: 0.009390521601044479
Training loss: 0.009386897183278564
Training loss: 0.009383207688559556
Training loss: 0.009379455434701724
Training loss: 0.009375654119657661
Training loss: 0.009371786911236786
Training loss: 0.009367892636616785
Training loss: 0.009363905593175058
Training loss: 0.009359864072323533
Training loss: 0.00935577154900664
Training loss: 0.009351612377992315
Training loss: 0.009347401880012587
Training loss: 0.009343121718160197
Training loss: 0.009338773584634034
Training loss: 0.009334358233502631
Training loss: 0.009329868974544514
Training loss: 0.009325311026980861
Training loss: 0.009320702658664502
Training loss: 0.009315992950249425
Training loss: 0.009311230571420092
Training loss: 0.009306306153509756
Training loss: 0.009301467447642764
Training loss: 0.009296430427954293
Training loss: 0.009291418269762033
Training loss: 0.00928618579826498
Training loss: 0.009281019279775106
Training loss: 0.009275695932039685
Training loss: 0.0092701888015

In [1074]:
with torch.no_grad(): # we don't need gradients in the testing phase
    predicted = model(x.float()).data.numpy()
    print(predicted)

[[0.01487361]
 [0.01977256]
 [0.02386915]
 [0.03869469]
 [0.04628681]
 [0.0168243 ]
 [0.01419752]
 [0.02420133]
 [0.02415469]
 [0.02431733]
 [0.01408175]
 [0.01800091]
 [0.02109153]
 [0.0630459 ]
 [0.02124502]
 [0.03265345]
 [0.0369392 ]
 [0.01691914]
 [0.01863002]
 [0.02037854]
 [0.01350024]
 [0.03195705]
 [0.02221208]
 [0.01912282]
 [0.02835802]
 [0.01523714]
 [0.02178399]
 [0.01629552]
 [0.01983758]
 [0.0204764 ]
 [0.0329297 ]
 [0.04727734]
 [0.01298501]
 [0.01208562]
 [0.01257144]
 [0.0352212 ]
 [0.08402866]
 [0.01564727]
 [0.02401432]
 [0.05974607]
 [0.01896749]
 [0.01564269]
 [0.03299399]
 [0.02309116]
 [0.01503069]
 [0.01252675]
 [0.02195788]
 [0.03050249]
 [0.01721708]
 [0.02633187]
 [0.01325379]
 [0.03095495]
 [0.02999999]
 [0.01466636]
 [0.0311795 ]
 [0.01681894]
 [0.02281575]
 [0.01890623]
 [0.02240198]
 [0.01260576]
 [0.03667091]
 [0.01974936]
 [0.01763544]
 [0.02193528]
 [0.02803263]
 [0.01434928]
 [0.02726138]
 [0.04016424]
 [0.01965771]
 [0.02169222]
 [0.01517744]
 [0.02

In [1075]:
with torch.no_grad(): # we don't need gradients in the testing phase
    predicted_test = model(x_sub.float()).data.numpy()
    print(predicted_test)

[[0.01740786]
 [0.03032091]
 [0.01594885]
 [0.03014366]
 [0.05331788]
 [0.01626626]
 [0.03207823]
 [0.01983525]
 [0.04217457]
 [0.03751401]
 [0.0275722 ]
 [0.01771516]
 [0.01594556]
 [0.01457845]
 [0.01719477]
 [0.01691267]
 [0.04648752]
 [0.02303597]
 [0.02085268]
 [0.037814  ]
 [0.01679219]
 [0.01663066]
 [0.02156051]
 [0.01937419]
 [0.0186654 ]
 [0.01894936]
 [0.01331186]
 [0.01542701]
 [0.01691868]
 [0.02294255]
 [0.02535539]
 [0.01672294]
 [0.0136435 ]
 [0.0150495 ]
 [0.01490079]
 [0.02547451]
 [0.03765033]
 [0.05400706]
 [0.02033476]
 [0.01515894]
 [0.01457579]
 [0.01714767]
 [0.0393141 ]
 [0.02072759]
 [0.01440368]
 [0.01284483]
 [0.02475997]
 [0.02028128]
 [0.01520062]
 [0.02235178]
 [0.03993414]
 [0.0153775 ]
 [0.01514895]
 [0.02370428]
 [0.02267293]
 [0.01418394]
 [0.01697118]
 [0.02170886]
 [0.01489286]
 [0.01658599]
 [0.016198  ]
 [0.02707622]
 [0.01940226]
 [0.02119032]
 [0.01794192]
 [0.01337121]
 [0.01619164]
 [0.0261727 ]
 [0.03384973]
 [0.01326247]
 [0.01790935]
 [0.01

In [1076]:
np.set_printoptions(suppress=True)
scaler = preprocessing.MinMaxScaler()
scaler.min_, scaler.scale_ = y_scaler.min_[0], y_scaler.scale_[0]

In [1077]:
pred_scaled = scaler.inverse_transform(predicted)
true_scaled = y.data.numpy()
# true_scaled = scaler.inverse_transform(y_normalized.data.numpy().reshape(357,1))
print("MSE on training set is" , mean_squared_error(true_scaled,pred_scaled))

MSE on training set is 112.47352353273475


In [1078]:
pred_test_scaled = scaler.inverse_transform(predicted_test)


In [1079]:
temp = pd.DataFrame({'true': true_scaled.reshape(357), 'pred': pred_scaled.reshape(357)}, columns=['true', 'pred'])


In [1080]:
temp.to_csv('training_result.csv')

In [1081]:
pd.DataFrame(pred_test_scaled).to_csv("result.csv")