In [9]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn

In [2]:
df = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\vegas-restaurants\\user_cat_bool_df.pickle')

In [3]:
df = df.astype({'user_id':'category'})

In [4]:
df = df.sample(frac=0.1, random_state=1)

In [5]:
categorical_columns = list(df.columns)
categorical_columns.remove('stars')

categorical_data = np.stack([df[c].cat.codes.values for c in categorical_columns], 1)
categorical_data = torch.tensor(categorical_data, dtype=torch.int64)

outputs = torch.tensor(df['stars'].values).flatten()

In [6]:
train_data, test_data, train_output, test_output = train_test_split(categorical_data, outputs, test_size=0.2, random_state=42)

In [7]:
print(len(train_data))
print(len(train_output))
print(len(test_data))
print(len(test_output))

120307
120307
30077
30077


In [23]:
class Model(nn.Module):

    def __init__(self, embedding_size, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p)
        #self.batch_norm_num = nn.BatchNorm1d(num_numerical_cols)

        all_layers = []
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols #+ num_numerical_cols

        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.Dropout(p))
            input_size = i

        all_layers.append(nn.Linear(layers[-1], output_size))

        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_categorical):
        embeddings = []
        for i,e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)

        x = self.layers(x)
        return x

In [24]:
categorical_column_sizes = [len(df[column].cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_column_sizes]

In [30]:
model = Model(categorical_embedding_sizes, 5, [200,100,50], p=0.4)

In [28]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [31]:
epochs = 300
aggregated_losses = []

for i in range(epochs):
    i += 1
    y_pred = model(train_data)
    single_loss = loss_function(y_pred, train_output-1)
    aggregated_losses.append(single_loss)

    if i%25 == 1:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')

epoch:   1 loss: 1.59262788
epoch:  26 loss: 1.59236217


In [None]:
import matplotlib.pyplot as plt
plt.plot(range(epochs), aggregated_losses)
plt.ylabel('Loss')
plt.xlabel('epoch')