In [1]:
import os 
import pandas as pd 
import torch 
import torch.nn as nn 
from sklearn import preprocessing

import numpy as np 
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset , DataLoader

dpath = '../ml-100k/'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()

In [3]:
def age_map(x):
    x = int(x)
    if x < 20:
        return '10'
    elif x >= 20 and x < 30:
        return '20'
    elif x >= 30 and x < 40:
        return '30'
    elif x >= 40 and x < 50:
        return '40'
    elif x >= 50 and x < 60:
        return '50'
    else:
        return '60'

df = pd.read_csv(os.path.join(dpath,'u.data'), sep='\t', header=None)
df.columns = ['user_id', 'item_id', 'rating', 'timestamp']
user2idx = {j:i for i,j in enumerate(df.user_id.unique())}
item2idx = {j:i for i,j in enumerate(df.item_id.unique())}

df['user_id'] = df['user_id'].map(user2idx)
df['item_id'] = df['item_id'].map(item2idx)

movies_df = pd.read_csv(os.path.join(dpath,'u.item'), sep='|', header=None, encoding='latin-1')
movies_df.columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
                    'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 
                    'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                    'Thriller', 'War', 'Western']

users_df = pd.read_csv(os.path.join(dpath,'u.user'), sep='|', encoding='latin-1', header=None)
users_df.columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

users_df['age'] = users_df['age'].apply(age_map)

movies_df.drop(['movie_title', 'release_date', 'video_release_date', 'IMDb_URL'], axis=1, inplace=True)
movies_df['movie_id'] = movies_df['movie_id'].map(item2idx)
users_df['user_id'] = users_df['user_id'].map(user2idx)

df.rename(columns={'item_id':'movie_id'}, inplace=True)

df = pd.merge(df, movies_df,how='left', on = 'movie_id')
df = pd.merge(df, users_df, how='left',on = 'user_id')

df.drop(['timestamp', 'zip_code'], axis=1, inplace=True)
le = preprocessing.LabelEncoder() 
df['gender'] = le.fit_transform(df['gender'])
df['age'] = le.fit_transform(df['age'])
df['occupation'] = le.fit_transform(df['occupation'])
df['rating'] = [int(i/4) for i in df.rating]

In [4]:
class Wide(nn.Module):
    def __init__(self, wide_dim, output_dim):
        super(Wide, self).__init__()
        self.linear = nn.Linear(wide_dim, output_dim)
        
    def forward(self, x):
        output = self.linear(x)
        return output

In [5]:
class Deep(nn.Module):
    def __init__(self, embedding_input, factor_dim, layer_num, hidden_dim, output_dim):
        super(Deep, self).__init__()
        self.embedding_input = embedding_input
        self.factor_dim = factor_dim
        self.layer_num = layer_num
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        
        for idx, val in enumerate(self.embedding_input):
            setattr(self, 'embedding_{}'.format(idx), nn.Embedding(val, self.factor_dim))
        
        self.dense_layers = self.dense()
        
    def dense(self):
        dense = []
        self.factor_dim *= len(self.embedding_input)
        dense.append(nn.Linear(self.factor_dim, self.hidden_dim[0], bias= True))
        dense.append(nn.ReLU())
        for idx in range(self.layer_num-1):
            dense.append(nn.Linear(self.hidden_dim[idx], self.hidden_dim[idx+1], bias= True))
            dense.append(nn.ReLU())
        dense.append(nn.Linear(self.hidden_dim[-1], self.output_dim))
        
        return nn.Sequential(*dense)
    
    def forward(self, x):
        output = [getattr(self, 'embedding_{}'.format(idx))(x[:,idx]) for idx, val in enumerate(self.embedding_input)]
        output = torch.cat(output, 1)
        
        output = self.dense_layers(output)
        
        return output

In [6]:
class WideAndDeep(nn.Module):
    def __init__(self, wide_dim, embedding_input, factor_dim, layer_num, hidden_dim, output_dim):
        super(WideAndDeep, self).__init__()
        
        self.wide = Wide(wide_dim, output_dim)
        self.deep = Deep(embedding_input, factor_dim, layer_num, hidden_dim, output_dim)
        
    def forward(self, wide, deep):
        wide_component = self.wide(wide)
        deep_component = self.deep(deep)
        return torch.sigmoid(torch.add(wide_component, deep_component))

In [7]:
need_dummies = []
for column in df.columns:
    if df[column].nunique() > 2:
        need_dummies.append(column)

In [8]:
need_dummies

['user_id', 'movie_id', 'age', 'occupation']

In [9]:
deep_columns = df.drop(columns=['rating'],axis=1).columns

In [10]:
wide_df = pd.get_dummies(df, columns=need_dummies)

for column in need_dummies:
    wide_df[column] = df[column]

In [11]:
wide_df

Unnamed: 0,rating,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,user_id,movie_id,age,occupation
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,3,20
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,2,6
2,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,1,2,2,1,20
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,3,3,1,19
4,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,4,4,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,875,173,0,18
99996,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,708,247,2,0
99997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,37,1004,1,18
99998,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,58,443,3,3


In [30]:
class MovieLensWD(Dataset):
    def __init__(self, df, deep_columns, need_dummies):
        self.df = df 
        self.X = df.drop(['rating'], axis=1)
        
        self.deep_df = self.df[deep_columns]
        self.deep = self.deep_df.values
        
        self.wide_df = self.df.drop(need_dummies, axis=1)
        self.wide = self.wide_df.to_numpy(dtype='float32')
        
        self.y = df['rating'].values
        
    # def get_wide(self):
    #     self.wide_df = pd.get_dummies(self.X, columns=self.X.columns.drop(self.already_dummies))
    #     self.wide = self.wide_df.to_numpy(dtype='float32')
        
    def __len__(self):
        return len(self.df)
        
    def __getitem__(self, index):
        return self.wide[index], self.deep[index], self.y[index]
    
    def wide_dim(self):
        return len(self.wide_df.columns)
    
    def deep_dims(self):
        embedding_input = [] 
        for column in self.deep_df.columns:
            embedding_input.append(self.deep_df[column].nunique())

        return embedding_input

In [31]:
def load_data_split(df, deep_columns, need_dummies):
    
    # already_dummies = [] 

    # for column in df.drop(columns=['rating'], axis=1).columns:
    #     if df[column].nunique()==2:
    #         already_dummies.append(column)
            
    train_X, test_X= train_test_split(df, test_size=0.2, random_state=42)
    
    df_dataset = MovieLensWD(df, deep_columns, need_dummies)

    train_dataset_wd = MovieLensWD(train_X, deep_columns, need_dummies)
    test_dataset_wd = MovieLensWD(test_X, deep_columns, need_dummies)
    
    wide_dim = df_dataset.wide_dim()
    deep_dims = df_dataset.deep_dims()
    

    train_dataloader_wd = DataLoader(train_dataset_wd, batch_size=32, shuffle=True)
    test_dataloader_wd = DataLoader(test_dataset_wd, batch_size=32, shuffle=True)

    return train_dataloader_wd, test_dataloader_wd, wide_dim, deep_dims

In [32]:
train_data_loader, test_data_loader, wide_dim, deep_dims = load_data_split(wide_df, deep_columns, need_dummies)

In [34]:
device = torch.device('cpu')
model = WideAndDeep(wide_dim, deep_dims, 16, 3, [8, 4, 2], 1)
model = model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.001)
optimizer.param_groups[0]['capturable'] = True

summary = pd.DataFrame(columns=['Epoch', 'Loss', 'Test_Loss'])

for epoch in range(100):
    
    print(f'Epoch {epoch}')

    model.train()
    train_loss = 0.0
    for wide, deep, y in train_data_loader:
        wide, deep, y = wide.to(device), deep.to(device), y.to(device)
        optimizer.zero_grad()
        
        pred = model(wide, deep)
        loss = criterion(pred.squeeze(), y.to(torch.float32))
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        
    train_loss /= len(train_data_loader)
    
    model.eval()
    
    test_loss = 0.0
    for wide, deep, y in test_data_loader:
        with torch.no_grad():
            wide, deep, y = wide.to(device), deep.to(device), y.to(device)
            pred = model(wide, deep)
            loss = criterion(pred.squeeze(), y.to(torch.float32))
            test_loss += loss.item()
        
    test_loss /= len(test_data_loader)
    
    print(f'Epoch {epoch} | Loss: {train_loss} | Test Loss: {test_loss}')
    
    summary = pd.concat([summary, pd.DataFrame([[epoch, train_loss, test_loss]], columns=['Epoch', 'Loss', 'Test_Loss'])])

summary.to_csv('summary.csv', index=False) 


Epoch 0
Epoch 0 | Loss: 0.6722539530277252 | Test Loss: 0.6602826360702515
Epoch 1
Epoch 1 | Loss: 0.652304321360588 | Test Loss: 0.6470566268920899
Epoch 2
Epoch 2 | Loss: 0.6408456965208054 | Test Loss: 0.6374604538917541
Epoch 3
Epoch 3 | Loss: 0.6320318527460098 | Test Loss: 0.6296242964744568
Epoch 4
Epoch 4 | Loss: 0.6246661900758743 | Test Loss: 0.622989042186737
Epoch 5
Epoch 5 | Loss: 0.6182923763275147 | Test Loss: 0.6170748338699341
Epoch 6
Epoch 6 | Loss: 0.6126140144586563 | Test Loss: 0.611783252620697
Epoch 7
Epoch 7 | Loss: 0.6074654084205627 | Test Loss: 0.6069281576156617
Epoch 8
Epoch 8 | Loss: 0.6027369277954101 | Test Loss: 0.6025013341903687
Epoch 9
Epoch 9 | Loss: 0.5983519744157791 | Test Loss: 0.5982674974441529
Epoch 10
Epoch 10 | Loss: 0.5942675374031067 | Test Loss: 0.5943491331100463
Epoch 11
Epoch 11 | Loss: 0.5904006599187851 | Test Loss: 0.5906014350891113
Epoch 12
Epoch 12 | Loss: 0.5867441610097885 | Test Loss: 0.587059663105011
Epoch 13
Epoch 13 | Los

KeyboardInterrupt: 