In [1]:
# package
%matplotlib inline
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'
import torch
import torch.autograd as autograd 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# data
train=pd.read_csv('training.csv')
train['flag'] = 1
train.head()

Unnamed: 0,user_id,item_id,context_feature_id,flag
0,0,28366,2,1
1,0,16109,2,1
2,0,11500,3,1
3,0,20750,2,1
4,0,8759,2,1


# Sample strategy

#### Randomly choose negative sample + use possibility choose context_feature_id

In [54]:
# possibility of context_feature_id
from collections import Counter
context_freq = Counter(train.context_feature_id)
for k in context_freq:
    context_freq[k] = (context_freq[k]/len(train.context_feature_id))

    
# random choose from whole data
n_sample = 2 * len(train)

seen = set([(u, i) for u, i in zip(train.user_id, train.item_id)])
d = []
while n_sample > 0:
    u_id = int(np.random.randint(num_users, size=1))
    i_id = int(np.random.randint(num_items, size=1))
    if (u_id, i_id) not in seen:
        c_id = int(np.random.choice(list(context_freq.keys()), 1, p=list(context_freq.values())))
        df2 = {'user_id': u_id, 'item_id': i_id, 'context_feature_id': c_id , 'flag': 0}
        d.append(df2)
        n_sample -= 1

train = train.append(d,ignore_index=True)
train.to_csv('train_ram_sample_2.csv')

#### Calculate item frequency and assign sampling probability based on inverse of frequency

In [5]:
from collections import Counter
item_freq = Counter(train.item_id)
num_items = len(train.item_id)

# get the iverse freq
for k in item_freq:
    item_freq[k] = (item_freq[k]/num_items)

item_freq = sorted(item_freq.items(), key=lambda item: (item[1]), reverse=True)
item_freq = dict(item_freq)
keys = list(item_freq.keys())
values = list(item_freq.values())[::-1]

item_freq_up = dict(zip(keys, values))

In [6]:
unq_user_0 = list(train.user_id.unique())
negative_picks = []

for _ in unq_user_0:
    pick = np.random.choice(list(item_freq_up.keys()), 5, p=list(item_freq_up.values()))
    negative_picks.append(pick)

negative_picks_dict = dict(zip(unq_user_0, negative_picks))

# negative sampling
d = []
for k in negative_picks_dict:
    for i in negative_picks_dict[k]:
        df2 = {'user_id': k, 'item_id': i, 'flag': 0}
        d.append(df2)

train = train.append(d,ignore_index=True)
train.to_csv('train_sample.csv')

# Training-1: using all 4 features

In [3]:
data = pd.read_csv('train_ram_sample.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)
data.head()

Unnamed: 0,user_id,item_id,context_feature_id,flag
0,0,28366,2,1
1,0,16109,2,1
2,0,11500,3,1
3,0,20750,2,1
4,0,8759,2,1


In [4]:
item_feature = pd.read_csv('item_feature.csv')
data = data.merge(item_feature, on = 'item_id')
data.head()

Unnamed: 0,user_id,item_id,context_feature_id,flag,item_feature_id
0,0,28366,2,1,7
1,1731,28366,1,1,7
2,10168,28366,2,1,7
3,18883,28366,1,1,7
4,19763,28366,2,1,7


In [5]:
data.isna().sum()

user_id               0
item_id               0
context_feature_id    0
flag                  0
item_feature_id       0
dtype: int64

In [6]:
Counter(data.flag)

Counter({1: 970245, 0: 1940490})

##### encode user_id and item_id for embedding

In [7]:
data['user_encoding']=data['user_id'].astype('category').cat.codes
data['item_encoding']=data['item_id'].astype('category').cat.codes

data.head()

Unnamed: 0,user_id,item_id,context_feature_id,flag,item_feature_id,user_encoding,item_encoding
0,0,28366,2,1,7,0,28366
1,1731,28366,1,1,7,1731,28366
2,10168,28366,2,1,7,10168,28366
3,18883,28366,1,1,7,18883,28366
4,19763,28366,2,1,7,19763,28366


In [8]:
encoding_user_dict=dict(zip(data.user_id, data.user_encoding))
encoding_item_dict=dict(zip(data.item_id, data.item_encoding))

##### embedding based on encoding

In [9]:
uni_user = train.user_id.unique()
uni_item = train.item_id.unique()
uni_context = train.context_feature_id.unique()

num_users = int(max(data.user_encoding)) + 2
num_items = int(max(data.item_encoding)) + 2
num_context = int(max(data.context_feature_id.unique()) + 1)
num_features = len(data.item_feature_id.unique()) 
print('For embedding (num_users, num_items, num_context,num_features): ', num_users, num_items, num_context,num_features)



For embedding (num_users, num_items, num_context,num_features):  200152 39902 4 195


In [10]:
int(max(data.user_encoding))

200150

In [11]:
change1 = data.sample(100).index
data.loc[change1,'user_encoding'] = 200151

In [12]:
change2 = data.sample(100).index
data.loc[change2,'item_encoding'] = 39901

In [13]:
def split_sets(data, train_val_ratio, seed=4):
    if isinstance(data, pd.DataFrame):
        train, val = train_test_split(data, 
                                      train_size=train_val_ratio[0], 
                                      random_state=seed,
                                      shuffle=True)
        val = val.reset_index(drop=True)
        train.reset_index(drop=True)
        return train, val

df_train, df_val = split_sets(data, train_val_ratio=(0.85, 0.15))

##### model

In [14]:
# model with context
class model_logistic_mf(nn.Module):
    def __init__(self, num_users, num_items, num_context,num_features, emb_size=40, drop=0.5):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        
        self.item_feature_emb = nn.Embedding(num_features, emb_size)
        self.context_emb = nn.Embedding(num_context, emb_size)
        
        
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.context_emb.weight.data.uniform_(0,0.05)
        self.item_feature_emb.weight.data.uniform_(0,0.05)
        
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
        self.dropout = nn.Dropout(drop)
        self.relu = nn.ReLU()

    def forward(self, u, v, w,z):
        U = self.user_emb(u)
        V = self.item_emb(v)
        W = self.context_emb(w)
        Z = self.item_feature_emb(z)
        X = torch.cat((U, V, W, Z), 1)
        X = self.relu(X)
        b = self.dropout(self.user_bias(u).squeeze())
        c = self.dropout(self.item_bias(v).squeeze())
        X = torch.sigmoid((X).sum(1) + b + c)
        #print('U.shape:', U.shape, 'V.shape', V.shape, 'W.shape: ', W.shape, 'Z.shape: ', Z.shape)
        return X

In [15]:
# dataset and dataloader
from torch.utils.data import Dataset, DataLoader

class MFDataset(Dataset):
    def __init__(self, df):
        self.u = torch.LongTensor(df.user_encoding.values)
        self.v = torch.LongTensor(df.item_encoding.values)
        self.w = torch.LongTensor(df.context_feature_id.values)
        self.z = torch.LongTensor(df.item_feature_id.values)
        self.y = torch.FloatTensor(df.flag.values)
        # x = torch.from_numpy(x).unsqueeze(1)
        # y = torch.from_numpy(y).unsqueeze(1)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.u[idx], self.v[idx], self.w[idx], self.z[idx],self.y[idx]


mf_train = MFDataset(df_train)
mf_val = MFDataset(df_val)

train_dl = DataLoader(mf_train, 
                      batch_size=10000, 
                      shuffle=True)

In [16]:
# validation
def valid_loss(model, val):
    model.eval()
    users = torch.LongTensor(val.user_encoding.values)
    items = torch.LongTensor(val.item_encoding.values)
    contexts = torch.LongTensor(val.context_feature_id.values)
    item_features = torch.LongTensor(val.item_feature_id.values)
    ratings = torch.FloatTensor(val.flag.values)
    y_hat = model(users, items, contexts,item_features)
    loss = F.binary_cross_entropy(y_hat, ratings)
    y_pred_class = (y_hat>0.5).float()
    acc = (y_pred_class.eq(ratings).sum())/float(ratings.shape[0])
    return loss.item(), acc

In [17]:
# training
def train_epocs(model, epochs=10, lr=0.01,wd=1e-5):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,weight_decay=wd)
    
    for i in range(epochs):
        # add cross validation here if need
        
        model.train()
        train_loss = []
        n_b = 0
        for users, items, contexts,item_features,ratings in train_dl:
            y_hat = model(users, items, contexts,item_features)
            loss = F.binary_cross_entropy(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()   
            optimizer.step()
            n_b += 1
            train_loss.append(loss.item())
            
        val_loss, acc = valid_loss(model, df_val)
        print("train loss %.3f" % (np.sum(train_loss) / n_b)) 
        print("val loss %.3f" % (val_loss))
        print("val acc %.3f" % (acc))

##### Hyperparams Exploration

In [18]:
# best 1: model_mf, 15, 0.001,1e-6,  emb_size=70, drop=0.5

In [19]:
#best 2: (model_mf, 17, 0.001,1e-6) emb_size=70, drop=0.5

In [20]:
#best 3: model_mf = model_logistic_mf(num_users, num_items, num_context,num_features, emb_size=50, drop=0.4)
#train_epocs(model_mf, 10, 0.002,1e-6)

##### Best hyperparameters: 
* emb_size=50
* drop=0.4
* epochs=10
* learnig rate =0.002
* weight decay= 1e-6

In [22]:
model_mf = model_logistic_mf(num_users, num_items, num_context,num_features, emb_size=50, drop=0.4)
train_epocs(model_mf, 2, 0.002,1e-6)

train loss 0.722
val loss 0.547
val acc 0.863
train loss 0.524
val loss 0.508
val acc 0.881


# Testing Model 1

In [20]:
# submit
test = pd.read_csv('test_kaggle.csv')
test.head()

Unnamed: 0,id,user_id,item_id,context_feature_id
0,0,4,16835,2
1,1,4,22590,3
2,2,4,1978,1
3,3,4,28916,1
4,4,4,14427,2


In [21]:
item_feature = pd.read_csv('item_feature.csv')
test = test.merge(item_feature, on = 'item_id')
test.head()

Unnamed: 0,id,user_id,item_id,context_feature_id,item_feature_id
0,0,4,16835,2,142
1,434,188,16835,1,142
2,48540,25437,16835,1,142
3,51161,26834,16835,0,142
4,73056,38465,16835,2,142


In [22]:
num_users = int(max(data.user_encoding)) 
num_items = int(max(data.item_encoding)) 
print(num_users,num_items)

200151 39901


In [23]:
test['user_encoding']=test['user_id'].map(encoding_user_dict)

In [24]:
test['user_encoding']=test['user_encoding'].fillna(num_users)

In [25]:
test['item_encoding']=test['item_id'].map(encoding_item_dict)

In [26]:
test['item_encoding']=test['item_encoding'].fillna(num_items)

In [27]:
test.isna().sum()

id                    0
user_id               0
item_id               0
context_feature_id    0
item_feature_id       0
user_encoding         0
item_encoding         0
dtype: int64

In [28]:
test.head()

Unnamed: 0,id,user_id,item_id,context_feature_id,item_feature_id,user_encoding,item_encoding
0,0,4,16835,2,142,4.0,16835
1,434,188,16835,1,142,188.0,16835
2,48540,25437,16835,1,142,25437.0,16835
3,51161,26834,16835,0,142,26834.0,16835
4,73056,38465,16835,2,142,38465.0,16835


In [29]:
test.head(40)

Unnamed: 0,id,user_id,item_id,context_feature_id,item_feature_id,user_encoding,item_encoding
0,0,4,16835,2,142,4.0,16835
1,434,188,16835,1,142,188.0,16835
2,48540,25437,16835,1,142,25437.0,16835
3,51161,26834,16835,0,142,26834.0,16835
4,73056,38465,16835,2,142,38465.0,16835
5,75595,39971,16835,2,142,39971.0,16835
6,86022,45643,16835,2,142,45643.0,16835
7,86798,46083,16835,1,142,46083.0,16835
8,87547,46442,16835,1,142,46442.0,16835
9,144782,76237,16835,0,142,76237.0,16835


In [30]:
test[test['user_encoding']==200151]

Unnamed: 0,id,user_id,item_id,context_feature_id,item_feature_id,user_encoding,item_encoding
9410,289010,152378,34572,2,142,200151.0,34572
10749,289007,152378,400,2,142,200151.0,400
44649,163797,86372,12977,2,36,200151.0,12977
45306,163800,86372,16114,1,36,200151.0,16114
47593,289009,152378,5619,2,148,200151.0,5619
108455,289018,152378,10732,2,142,200151.0,10732
108853,289014,152378,34941,2,142,200151.0,34941
132435,289016,152378,30330,2,142,200151.0,30330
140924,289008,152378,15608,2,180,200151.0,15608
142332,289013,152378,39335,2,126,200151.0,39335


In [32]:
users = torch.LongTensor(test.user_encoding.values)  
items = torch.LongTensor(test.item_encoding.values)
contexts = torch.LongTensor(test.context_feature_id.values)
item_features = torch.LongTensor(test.item_feature_id.values)
y_hat = model_mf(users, items, contexts,item_features)
output_dict = {'id':pd.Series(test.id.values),'rating':pd.Series(y_hat.detach().numpy())}
output = pd.DataFrame(output_dict).set_index('id')
y_hat

tensor([0.5908, 0.6356, 0.7094,  ..., 0.2071, 0.2200, 0.2322],
       grad_fn=<SigmoidBackward0>)

In [33]:
y_hat[:40]

tensor([0.5908, 0.6356, 0.7094, 0.6114, 0.5971, 0.5916, 0.5735, 0.6189, 0.6220,
        0.6089, 0.6000, 0.6250, 0.6532, 0.6168, 0.6200, 0.5748, 0.6361, 0.7260,
        0.5900, 0.5856, 0.5836, 0.6415, 0.6000, 0.6213, 0.5910, 0.2774, 0.3093,
        0.2918, 0.2933, 0.2670, 0.2952, 0.2828, 0.3014, 0.3023, 0.2696, 0.2944,
        0.3547, 0.2713, 0.8501, 0.8740], grad_fn=<SliceBackward0>)

In [34]:
output.to_csv('sumission_26.csv')

# Training-2: using user_id,item_id,context_feature_id

In [27]:
train=pd.read_csv('training.csv')
train['flag'] = 1
train.head()

Unnamed: 0,user_id,item_id,context_feature_id,flag
0,0,28366,2,1
1,0,16109,2,1
2,0,11500,3,1
3,0,20750,2,1
4,0,8759,2,1


In [28]:
uni_user = train.user_id.unique()
uni_item = train.item_id.unique()
uni_context = train.context_feature_id.unique()

num_users = max(train.user_id.unique()) + 1
num_items = max(train.item_id.unique()) + 1
num_context = max(train.context_feature_id.unique()) + 1
print('For embedding (num_users, num_items, num_context): ', num_users, num_items, num_context)

For embedding (num_users, num_items, num_context):  200153 39901 4


In [29]:
def split_sets(data, train_val_ratio, seed=4):
    if isinstance(data, pd.DataFrame):
        train, val = train_test_split(data, 
                                      train_size=train_val_ratio[0], 
                                      random_state=seed,
                                      shuffle=True)
        val = val.reset_index(drop=True)
        train.reset_index(drop=True)
        return train, val

df_train, df_val = split_sets(data, train_val_ratio=(0.85, 0.15))

##### model

In [30]:
# model with context
class model_logistic_mf(nn.Module):
    def __init__(self, num_users, num_items, num_context, emb_size=40, drop=0.5):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        self.context_emb = nn.Embedding(num_context, emb_size)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.context_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        self.dropout = nn.Dropout(drop)
        self.relu = nn.ReLU()

    def forward(self, u, v, w):
        U = self.user_emb(u)
        V = self.item_emb(v)
        W = self.context_emb(w)
        X = torch.cat((U, V, W), 1)
        X = self.relu(X)
        b = self.dropout(self.user_bias(u).squeeze())
        c = self.dropout(self.item_bias(v).squeeze())
        X = torch.sigmoid((X).sum(1) + b + c)
        return X

In [31]:
# dataset and dataloader
from torch.utils.data import Dataset, DataLoader

class MFDataset(Dataset):
    def __init__(self, df):
        self.u = torch.LongTensor(df.user_id.values)
        self.v = torch.LongTensor(df.item_id.values)
        self.w = torch.LongTensor(df.context_feature_id.values)
        self.y = torch.FloatTensor(df.flag.values)
        # x = torch.from_numpy(x).unsqueeze(1)
        # y = torch.from_numpy(y).unsqueeze(1)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.u[idx], self.v[idx], self.w[idx], self.y[idx]


mf_train = MFDataset(df_train)
mf_val = MFDataset(df_val)

train_dl = DataLoader(mf_train, 
                      batch_size=10000, 
                      shuffle=True)

In [32]:
# validation
def valid_loss(model, val):
    model.eval()
    users = torch.LongTensor(val.user_id.values)
    items = torch.LongTensor(val.item_id.values)
    contexts = torch.LongTensor(val.context_feature_id.values)
    ratings = torch.FloatTensor(val.flag.values)
    y_hat = model(users, items, contexts)
    loss = F.binary_cross_entropy(y_hat, ratings)
    y_pred_class = (y_hat>0.5).float()
    acc = (y_pred_class.eq(ratings).sum())/float(ratings.shape[0])
    return loss.item(), acc

In [36]:
# training
def train_epocs(model, epochs=10, lr=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    for i in range(epochs):
        # add cross validation here if need
        
        model.train()
        train_loss = []
        n_b = 0
        for users, items, contexts, ratings in train_dl:
            y_hat = model(users, items, contexts)
            loss = F.binary_cross_entropy(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()   
            optimizer.step()
            n_b += 1
            train_loss.append(loss.item())
            
        val_loss, acc = valid_loss(model, df_val)
        print("train loss %.3f" % (np.sum(train_loss) / n_b)) 
        print("val loss %.3f" % (val_loss))
        print("val acc %.3f" % (acc))

In [37]:
model_mf = model_logistic_mf(num_users, num_items, num_context, emb_size=40, drop=0.5)
train_epocs(model_mf, 5, 0.01)

train loss 0.656
val loss 0.453
val acc 0.882
train loss 0.439
val loss 0.390
val acc 0.886
train loss 0.408
val loss 0.358
val acc 0.886
train loss 0.394
val loss 0.339
val acc 0.886
train loss 0.385
val loss 0.327
val acc 0.886


# Testing Model 2

In [None]:
test = pd.read_csv('test_kaggle.csv')
test.head()

In [None]:
users = torch.LongTensor(test.user_id.values)  
items = torch.LongTensor(test.item_id.values)
y_hat = model_final(users, items)
output_dict = {'id':pd.Series(test.id.values),'rating':pd.Series(y_hat.detach().numpy())}
output = pd.DataFrame(output_dict).set_index('id')
y_hat

In [None]:
output.to_csv('sumission_18.csv')