In [43]:
import torch.nn as nn
import torch.autograd as autograd
import torch
import pandas as pd
from pathlib import Path
import numpy as np
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import random
from sklearn.metrics import roc_auc_score
# Project-local model definitions (model.py). A module is imported by its name,
# without the ".py" suffix; `from model.py import *` would look for a submodule
# named `py` inside a package named `model` and fail.
from model import *

In [44]:
# Base directory for the notebook: the current working directory.
PATH = Path(".")

**reading the dataset**

In [45]:
# Positive interactions used for training.
df_train = pd.read_csv("training.csv")
# Kaggle test rows; the `id` column is dropped right after loading.
df_test = pd.read_csv("test_kaggle.csv").drop(columns=["id"])
df_train.head(3)

Unnamed: 0,user_id,item_id,context_feature_id
0,0,28366,2
1,0,16109,2
2,0,11500,3


**The context id is a feature related to the user. It can be thought of as, for example, the user's platform (Android users, iPhone users, etc.).**

In [47]:
# Distinct context ids present in the training interactions.
df_train["context_feature_id"].unique()

array([2, 3, 1, 0])

**The item feature id is a feature of the item. It can be thought of as, for example, a genre.**

In [48]:
# Item-level feature table; it is joined to the interactions on `item_id` below.
df_item_f = pd.read_csv("item_feature.csv")

**Join the item_feature to items**

In [49]:
# Left joins keep every interaction row, even when an item has no entry in the feature table.
df_train = pd.merge(df_train, df_item_f, how="left", on="item_id")
df_test = pd.merge(df_test, df_item_f, how="left", on="item_id")

In [50]:
# Distinct item and user ids that occur in the training interactions.
uniq_items = df_train["item_id"].unique()
uniq_user = df_train["user_id"].unique()

In [None]:
# Most frequent context id per user; on ties the smallest id wins because
# `mode()` returns its values sorted and the first one is taken.
context_id_mode = (
    df_train
    .groupby("user_id")["context_feature_id"]
    .agg(lambda contexts: contexts.mode().iloc[0])
    .reset_index()
)
context_id_mode.head()

In [10]:
# user_id -> number of training interactions of that user.
dict_user_cnt = dict(df_train.groupby("user_id")["item_id"].count())

In [11]:
# Candidate item ids for negative sampling: 0 .. max item id, inclusive.
# `range(max_id)` alone stops one short, so the largest item id could never be drawn.
max_item_new = list(range(max(uniq_items) + 1))

**Generating negative samples (user–item pairs)**

In [12]:
# For every user draw as many negative (user, item) pairs as the user has
# positive interactions. Items the user actually interacted with are removed
# from the candidate pool first, so a known positive is never labelled negative.
user_positive_items = df_train.groupby("user_id")["item_id"].apply(set)
candidate_items = np.asarray(max_item_new)

negative_users = []
for user, n_positive in dict_user_cnt.items():
    unseen_items = np.setdiff1d(candidate_items, list(user_positive_items[user]))
    # Sampling without replacement cannot return more items than are available.
    n_negative = min(n_positive, len(unseen_items))
    items = np.random.choice(unseen_items, size=n_negative, replace=False)
    negative_users.extend((user, item) for item in items)

df_neg = pd.DataFrame(negative_users, columns=['user_id', 'item_id'])

In [14]:
# Give the sampled negatives the same columns as the positives: the user's most
# frequent context id and the item's feature id.
df_neg = (
    df_neg
    .merge(context_id_mode, how="left", on="user_id")
    .merge(df_item_f, how="left", on="item_id")
)
df_neg.head(5)

Unnamed: 0,user_id,item_id,context_feature_id,item_feature_id
0,0,17385,2,187
1,0,16121,2,62
2,0,9438,2,2
3,0,2433,2,142
4,0,24425,2,18


In [15]:
# Label observed interactions 1 and sampled negatives 0. Plain assignment appends
# the column at the end (same position `insert` used) and, unlike `insert`, does
# not raise when the cell is run a second time.
df_train["rating"] = 1
df_neg["rating"] = 0

**Concatenating the training data with the generated negative samples**

In [16]:
# Stack positives and negatives into one frame. `ignore_index=True` gives the
# result a fresh 0..n-1 index instead of keeping both inputs' labels, which
# would otherwise each appear twice.
df = pd.concat([df_train, df_neg], ignore_index=True)

In [17]:
# Class balance check: number of rows per label (0 = sampled negative, 1 = observed).
df.groupby('rating').count()

Unnamed: 0_level_0,user_id,item_id,context_feature_id,item_feature_id
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,970245,970245,970245,970245
1,970245,970245,970245,970245


In [19]:
# Rows per label once more (same check as the previous cell).
df.groupby('rating').count()

Unnamed: 0_level_0,user_id,item_id,context_feature_id,item_feature_id
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,970245,970245,970245,970245
1,970245,970245,970245,970245


In [None]:
# Persist the combined positive/negative data set. The row index carries no
# information, so it is not written; otherwise it comes back as an extra
# "Unnamed: 0" column when the file is read again.
df.to_csv("new_date_with_dup_repeat.csv", index=False)

**Read the saved DATA and the test Data**

In [38]:
import torch.nn as nn
import torch.autograd as autograd
import torch
import pandas as pd
from pathlib import Path
import numpy as np
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import random
from sklearn.metrics import roc_auc_score

In [39]:
# Reload the combined positive/negative data set.
# NOTE(review): this machine-specific relative path differs from the location the
# save cell writes to ("new_date_with_dup_repeat.csv" in the working directory) — confirm.
df = pd.read_csv("../../Spring/module_2/advanced_ml/Final_Project/new_date_with_dup_repeat.csv")

In [40]:
# Rebuild the test frame the same way as before: drop `id`, then attach item features.
df_item_f = pd.read_csv("../../Spring/module_2/advanced_ml/Final_Project/item_feature.csv")
df_test = (
    pd.read_csv("../../Spring/module_2/advanced_ml/Final_Project/test_kaggle.csv")
    .drop(columns=["id"])
    .merge(df_item_f, how="left", on="item_id")
)

**Splitting the Data into train and validation set**

In [41]:
# Random row-level split, roughly 80% train / 20% validation. A dedicated seeded
# generator makes the split reproducible across kernel restarts without touching
# NumPy's global random state.
SPLIT_SEED = 42
msk = np.random.RandomState(SPLIT_SEED).rand(len(df)) < 0.8
train = df[msk].reset_index(drop=True).copy()
valid = df[~msk].reset_index(drop=True).copy()

**encoding the ids**

In [42]:
def encode_feature(train, valid, test, encode_dict, feature):
    """Replace the raw ids in column `feature` by their encoded index, in place.

    Args:
        train, valid, test: DataFrames that each contain the column `feature`.
        encode_dict: mapping raw id -> encoded index.
        feature: name of the column to encode.

    Returns:
        The same three (mutated) frames as a tuple ``(train, valid, test)``.

    Raises:
        KeyError: if a value in ``train[feature]`` is missing from `encode_dict`.
    """
    # Every training id must be known to the mapping.
    train[feature] = train[feature].map(lambda raw_id: encode_dict[raw_id])
    # Ids that are not in the mapping fall back to index 0 for validation and test.
    for frame in (valid, test):
        frame[feature] = frame[feature].map(lambda raw_id: encode_dict.get(raw_id, 0))
    return train, valid, test

In [43]:
# All vocabularies are built from the training split only. `np.unique` already
# returns its values sorted. This cell overwrites the raw ids in train / valid /
# df_test, so it must be run exactly once per freshly loaded data set.

# User ids: indices start at 1; index 0 is what encode_feature assigns to ids
# that do not occur in the training split.
train_user_ids = np.unique(train["user_id"].to_numpy())
userid2idx = dict(zip(train_user_ids, range(1, len(train_user_ids) + 1)))
train, valid, df_test = encode_feature(train, valid, df_test, userid2idx, "user_id")

# Item ids: same 1-based scheme as the users.
train_item_ids = np.unique(train["item_id"].to_numpy())
itemid2idx = dict(zip(train_item_ids, range(1, len(train_item_ids) + 1)))
train, valid, df_test = encode_feature(train, valid, df_test, itemid2idx, "item_id")

# Item feature ids: indices start at 0, so an unseen value shares index 0 with
# the smallest feature id seen in training.
train_item_feature_ids = np.unique(train["item_feature_id"].to_numpy())
featureid2idx = dict(zip(train_item_feature_ids, range(len(train_item_feature_ids))))
train, valid, df_test = encode_feature(train, valid, df_test, featureid2idx, "item_feature_id")

# Context ids: 0-based as well.
train_context_feature_ids = np.unique(train["context_feature_id"].to_numpy())
contextid2idx = dict(zip(train_context_feature_ids, range(len(train_context_feature_ids))))
train, valid, df_test = encode_feature(train, valid, df_test, contextid2idx, "context_feature_id")

## Create DataLoader

In [44]:
class Dataset():
    """Map-style dataset holding four integer index columns and one float target.

    Defining this class rebinds the name `Dataset` imported from
    `torch.utils.data` at the top of the notebook; it does not subclass it.

    Args:
        x1, x2, x3, x4: integer sequences/arrays, stored as LongTensors.
        y: target sequence/array, stored as a FloatTensor.
    """

    def __init__(self, x1, x2,x3,x4, y):
        self.x1, self.x2, self.x3, self.x4 = (
            torch.LongTensor(column) for column in (x1, x2, x3, x4)
        )
        self.y = torch.FloatTensor(y)

    def __len__(self):
        # Number of samples equals the number of targets.
        return self.y.size(0)

    def __getitem__(self,idx):
        # One sample as the tuple (x1, x2, x3, x4, y).
        return tuple(
            column[idx] for column in (self.x1, self.x2, self.x3, self.x4, self.y)
        )

In [45]:
# Model inputs in the order the Dataset expects them: user, item, item feature, context.
_FEATURE_COLUMNS = ["user_id", "item_id", "item_feature_id", "context_feature_id"]

train_ds = Dataset(*(train[column].values for column in _FEATURE_COLUMNS), train.rating.values)

valid_ds = Dataset(*(valid[column].values for column in _FEATURE_COLUMNS), valid.rating.values)

In [46]:
# Same batch size for both loaders; only the training batches are reshuffled each epoch.
BATCH_SIZE = 15000
train_dl = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
valid_dl = DataLoader(dataset=valid_ds, batch_size=BATCH_SIZE, shuffle=False)

## Creating training loop

In [47]:
def train_epocs(model,train_dl, valid_dl, epochs, optimizer,scheduler, user_keep_prob=0.70):
    """Train `model` on mini-batches from `train_dl`, validating after every epoch.

    Each batch is a 5-tuple ``(users, items, item_feature, context_id, ratings)``.
    Before the forward pass a random subset of the user indices in the batch is
    replaced by 0, the index the encoding step assigns to users not seen in
    training, so the model also gets trained on that fallback index.

    Args:
        model: module called as ``model(users, items, item_feature, context_id)``
            and returning one logit per row.
        train_dl: iterable of training batches.
        valid_dl: iterable of validation batches, passed to `val_metrics`.
        epochs: number of passes over `train_dl`.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per epoch.
        user_keep_prob: probability that a row keeps its real user index
            (default 0.70, i.e. about 30% of the rows are trained with index 0).

    Returns:
        None. Per-epoch train loss, validation loss and ROC-AUC are printed.
    """
    for i in range(epochs):
        print(f"epoch no: {i}")
        losses = []
        model.train()
        for users, items, item_feature, context_id, ratings in train_dl:
            keep = np.random.rand(users.shape[0]) < user_keep_prob
            # Work on a copy so the tensor handed in by the caller is never modified.
            users = users.clone()
            users[~keep] = 0
            y_hat = model(users, items, item_feature, context_id)
            # Targets get a trailing dimension to line up with the model output.
            loss = F.binary_cross_entropy_with_logits(y_hat.float(), ratings.unsqueeze(1).float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        scheduler.step()

        # Unweighted mean of the per-batch losses of this epoch.
        train_loss = np.mean(losses)
        valid_loss,valid_auc = val_metrics(model, valid_dl)
        print("train loss %.3f valid loss %.3f auc-roc %.3f" % (train_loss, valid_loss, valid_auc))

## Create Validation loop

In [48]:
def val_metrics(model, valid_dl):
    """Evaluate `model` on `valid_dl`.

    The model is switched to eval mode (and left in it) and all forward passes
    run without gradient tracking, so no autograd graph is built during
    evaluation.

    Args:
        model: module called as ``model(users, items, item_feature, context_id)``
            and returning one logit per row.
        valid_dl: iterable of 5-tuples
            ``(users, items, item_feature, context_id, ratings)``.

    Returns:
        Tuple ``(loss, auc)``: the unweighted mean of the per-batch binary
        cross-entropy losses, and the ROC-AUC of the raw logits against the labels.
    """
    model.eval()
    losses = []
    y_hats = []
    ys = []
    with torch.no_grad():
        for users, items, item_feature, context_id, ratings in valid_dl:
            y_hat = model(users, items, item_feature, context_id)
            loss = F.binary_cross_entropy_with_logits(y_hat.float(), ratings.unsqueeze(1).float())
            # .cpu() is a no-op for CPU tensors and keeps .numpy() valid on other devices.
            y_hats.append(y_hat.detach().cpu().numpy())
            ys.append(ratings.cpu().numpy())
            losses.append(loss.item())
    ys = np.concatenate(ys)
    y_hats = np.concatenate(y_hats)
    return np.mean(losses), roc_auc_score(ys, y_hats)

In [49]:
# Users and items are counted on the training vocabularies; item features and
# contexts are counted on the full (train + validation) frame `df`.
num_users, num_items = len(train_user_ids), len(train_item_ids)
num_feature = df["item_feature_id"].nunique(dropna=False)
num_context = df["context_feature_id"].nunique(dropna=False)

In [50]:
# Build the model. `MF` is not defined in this notebook; presumably it comes from
# the star import of the project-local `model` module — confirm.
# The +1 on users/items makes room for index 0, which the encoding step assigns
# to ids that were not seen in training.
# NOTE(review): the meaning of emb_size, emb_extra, layer_size_* and frac is
# defined by MF itself — check model.py before tuning.
model = MF(num_users+1, num_items+1,num_feature,num_context, emb_size = 75, emb_extra = 50,
           layer_size_1 = 25,layer_size_2 = 5,frac = 0.15)

In [36]:
from torch.optim.lr_scheduler import StepLR,ExponentialLR

# Optimisation hyper-parameters.
learning_rate, wd, epochs = 0.005, 0.5, 6

# AdamW with decoupled weight decay; the learning rate is multiplied by 0.9 on
# every scheduler step.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=wd)
scheduler = ExponentialLR(optimizer=optimizer, gamma=0.9)

In [37]:
# Run the training loop; per-epoch metrics are printed below.
train_epocs(
    model,
    train_dl,
    valid_dl,
    epochs=epochs,
    optimizer=optimizer,
    scheduler=scheduler,
)

epoch no: 0
train loss 0.448 valid loss 0.321 auc-roc 0.935
epoch no: 1
train loss 0.332 valid loss 0.314 auc-roc 0.938
epoch no: 2
train loss 0.299 valid loss 0.314 auc-roc 0.940
epoch no: 3
train loss 0.260 valid loss 0.316 auc-roc 0.944
epoch no: 4
train loss 0.235 valid loss 0.317 auc-roc 0.947
epoch no: 5
train loss 0.217 valid loss 0.325 auc-roc 0.948


In [38]:
# Encoded test columns as index tensors (kept on the CPU; the GPU transfer stays disabled).
user_ = torch.LongTensor(df_test.user_id.values) # .cuda()
item_ = torch.LongTensor(df_test.item_id.values) #.cuda()
feature_ = torch.LongTensor(df_test.item_feature_id.values) # .cuda()
context_ = torch.LongTensor(df_test.context_feature_id.values) #.cuda()

# Predict in eval mode and without gradient tracking: inference must not depend
# on whichever mode the model was left in, and no autograd graph is needed for
# a forward pass over the whole test set.
model.eval()
with torch.no_grad():
    y_hat = model(user_, item_, feature_, context_)

# Turn logits into probabilities; the positional row index becomes the submission `id`.
prob = pd.Series(torch.sigmoid(y_hat).flatten().detach().numpy()).reset_index().rename(columns={"index":'id',0:"rating"})
prob.to_csv("submission_27",index=False)