#  Librerías

### Librerías pesadas
Para ejecutar solo una vez

In [1]:
import math
import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np
import tqdm
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!wandb login 99217068fbd71985701543b0c0064e805ac87449

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/rafael/.netrc


### Librerías livianas
Para ejecutar múltiples veces

# Configuración General

Variables relacionadas al procesamiento de datos y del modelo en sí

### Variables de Preprocesamiento

In [3]:

# Porcentaje para usar solo una fracción del dataset de usuario.
# si al eliminar usuarios quedan viajes o POI sin visitas, estos también
# serán eliminados
USER_FRAC = 0.65
MIN_POI_VISITS = 5
MAX_SEQUENCES_PER_USER = 100
SEQUENCE_LENGTH = 14

In [4]:
BATCH_SIZE=64
EPOCHS=100

In [5]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 80

# Gowalla Dataset

In [6]:
! ./download-gowalla.sh

Already Downloaded


In [7]:
# !mkdir -p download
# !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=FILEID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=0BzpKyxX1dqTYRTFVYTd1UG81ZXc" -O download/gowalla.zip && rm -rf /tmp/cookies.txt
# !unzip download/gowalla.zip -d download

### Cargar Datos

In [8]:
users    = pd.read_csv('download/gowalla/gowalla_userinfo.csv')
friends  = pd.read_csv('download/gowalla/gowalla_friendship.csv')
checkins = pd.read_csv('download/gowalla/gowalla_checkins.csv')
pois_1   = pd.read_csv('download/gowalla/gowalla_spots_subset1.csv', encoding='iso-8859-1')
pois_2   = pd.read_csv('download/gowalla/gowalla_spots_subset2.csv', encoding='iso-8859-1')
pois     = pd.concat((pois_1, pois_2), ignore_index=True)

# Preprocesamiento

### Usuarios

Revisamos la distrubución de checkins de usuarios

In [9]:
users.sample(5)

Unnamed: 0,id,bookmarked_spots_count,challenge_pin_count,country_pin_count,highlights_count,items_count,photos_count,pins_count,province_pin_count,region_pin_count,state_pin_count,trips_count,friends_count,stamps_count,checkin_num,places_num
194855,1208544,0,1,0,0,0,0,1,0,0,0,0,9,0,0,0
1275,2640,1,11,1,0,10,5,17,0,6,5,0,13,150,246,150
249735,2135447,0,3,2,0,9,0,5,0,2,0,0,14,20,34,20
127811,286924,0,1,0,0,3,0,1,0,0,0,0,7,0,0,0
120212,266772,0,1,0,0,3,0,1,0,0,0,0,1,0,0,0


In [10]:
users.checkin_num.describe()

count    407533.000000
mean         88.341212
std         435.982581
min           0.000000
25%           1.000000
50%          10.000000
75%          52.000000
max       46981.000000
Name: checkin_num, dtype: float64

nos quedamos con un porcentaje de los usuarios y los filtramos los usuarios segun los checkins que tengan

In [11]:
users.checkin_num.describe()

count    407533.000000
mean         88.341212
std         435.982581
min           0.000000
25%           1.000000
50%          10.000000
75%          52.000000
max       46981.000000
Name: checkin_num, dtype: float64

In [12]:
print('Current users', len(users))
users = users.sample(frac=USER_FRAC)
users = users[(users.checkin_num >= users.checkin_num.quantile(0.1)) &
              (users.checkin_num <= users.checkin_num.quantile(0.9)) &
              (users.checkin_num >= SEQUENCE_LENGTH)]
users = users[['id']]
print('Reduced users', len(users))

Current users 407533
Reduced users 93826


### Amigos

In [13]:
friends.sample(5)

Unnamed: 0,userid1,userid2
3427848,1539037,2002876
1217151,94119,61035
916938,2162217,2173714
3154253,222541,29228
1907875,172501,2641236


### Checkins

In [14]:
checkins.sample(5)

Unnamed: 0,userid,placeid,datetime
31647761,748,409106,2010-04-11T16:33:40Z
29926511,2219444,6660929,2011-01-12T16:30:39Z
15537292,18675,5624004,2010-10-16T17:39:50Z
32920025,482205,3930423,2010-10-30T00:30:19Z
3635502,85982,155054,2010-03-26T17:43:05Z


Eliminamos los checkins de los usuarios no sampleados

In [15]:
print('Current checkins', len(checkins))
checkins = pd.merge(checkins, users, how='inner', left_on='userid', right_on='id', copy=False)[checkins.columns]
checkins = checkins.reset_index(drop=True)
print('Reduced checkins', len(checkins))

Current checkins 36001959
Reduced checkins 5522934


### POIS

In [16]:
pois.sample(5)

Unnamed: 0,id,created_at,lng,lat,photos_count,checkins_count,users_count,radius_meters,highlights_count,items_count,max_items_count,spot_categories,name,city_state,Unnamed: 5,Unnamed: 6
565321,633626,2010-03-02T06:36:44Z,100.496793,13.69476,0.0,20.0,11.0,75.0,0.0,0.0,10.0,"[{'url': '/categories/106', 'name': 'Grocery'}]",,,,
1722989,5683908,2010-10-17T14:24:38Z,10.473173,49.767256,0.0,11.0,11.0,75.0,0.0,1.0,10.0,"[{'url': '/categories/454', 'name': 'Subway'}]",,,,
896763,1016589,2010-04-26T06:41:37Z,112.748017,-7.320036,0.0,5.0,5.0,75.0,0.0,1.0,10.0,"[{'url': '/categories/18', 'name': 'Asian'}]",,,,
451783,504853,2010-02-04T17:30:43Z,-97.168416,35.487641,0.0,14.0,6.0,100.0,0.0,1.0,10.0,"[{'url': '/categories/152', 'name': 'High Scho...",,,,
144350,164294,2009-12-04T18:01:24Z,-2.229141,53.326767,0.0,5.0,4.0,75.0,0.0,1.0,10.0,"[{'url': '/categories/24', 'name': 'Pub'}]",,,,


Filtramos los pois si han sido visitadas pocas veces según los parámetros que definimos

In [17]:
visited_pois = pd.merge(pois, checkins, left_on='id', right_on='placeid', how='inner', copy=False)

In [18]:
visited_pois['visited_count'] = np.zeros(len(visited_pois))

visited_pois = visited_pois[['id', 'visited_count']].groupby(by='id').count()
visited_pois = visited_pois[visited_pois.visited_count >= MIN_POI_VISITS]

pois = pd.merge(pois, visited_pois, on='id', how='inner', copy=False)

Nos quedamos con sólo las columnas que nos importan

In [19]:
pois = pois[['id', 'lat', 'lng', 'visited_count']]
pois 

Unnamed: 0,id,lat,lng,visited_count
0,8932,32.927662,-97.254356,11
1,8938,39.052824,-94.590311,20
2,8947,37.331880,-122.029631,312
3,8954,32.939436,-97.106009,7
4,8956,32.942655,-97.131200,6
...,...,...,...,...
257458,7523748,27.517234,-82.727902,7
257459,7527534,13.844906,100.855976,5
257460,7529626,51.435737,-3.174222,5
257461,7533476,35.847316,-78.805891,6


Ahora eliminamos los checkins de pois que ya no existen

In [20]:
print('Current checkins', len(checkins))
checkins = pd.merge(pois, checkins, left_on='id', right_on='placeid', how='inner', copy=False)[checkins.columns]
checkins = checkins.reset_index(drop=True)
print('Reduced checkins', len(checkins))

Current checkins 5522934
Reduced checkins 3803814


Finalmente eliminamos nuevamente a los usuarios que se quedaron sin suficientes checkins

In [21]:
print('Current users', len(users))
users = pd.merge(checkins, users, how='inner', left_on='userid', right_on='id', copy=False)[users.columns].drop_duplicates()
print('Reduced users', len(users))

Current users 93826
Reduced users 93574


### Reasignación de IDs

In [22]:
users = users.reset_index(drop=True)
users['user_sid'] = users.index

pois = pois.reset_index(drop=True)
pois['place_sid'] = pois.index

### Agregar Datos

Crearemos un dataset unificado que usaremos para entrenar el modelo de los embeddings

In [23]:
pois.head()

Unnamed: 0,id,lat,lng,visited_count,place_sid
0,8932,32.927662,-97.254356,11,0
1,8938,39.052824,-94.590311,20,1
2,8947,37.33188,-122.029631,312,2
3,8954,32.939436,-97.106009,7,3
4,8956,32.942655,-97.1312,6,4


In [24]:
pois.sort_values(by='visited_count', ascending=False)

Unnamed: 0,id,lat,lng,visited_count,place_sid
5489,23519,13.689897,100.748320,6252,5489
16486,55033,59.330158,18.058079,5306,16486
44215,155746,13.746659,100.534912,5215,44215
19878,66171,60.193511,11.098251,4989,19878
17528,58725,59.650051,17.932262,4827,17528
...,...,...,...,...,...
214080,6496200,52.533502,6.055011,5,214080
63364,258613,56.058742,14.596621,5,63364
63365,258619,50.932742,4.497062,5,63365
27428,90508,33.483327,-112.078883,5,27428


In [25]:
users.head()

Unnamed: 0,id,user_sid
0,111220,0
1,2430747,1
2,154026,2
3,217738,3
4,344284,4


In [26]:
checkins.head()

Unnamed: 0,userid,placeid,datetime
0,111220,8932,2010-07-29T23:17:36Z
1,2430747,8932,2011-06-10T01:46:00Z
2,154026,8932,2010-04-18T00:21:51Z
3,217738,8932,2010-07-25T18:13:48Z
4,217738,8932,2010-04-20T17:56:37Z


In [27]:
users_checkins = pd.merge(users, checkins, left_on='id', right_on='userid', copy=False).drop('id', axis=1)
users_checkins = pd.merge(users_checkins, pois[['id', 'place_sid']], left_on='placeid', right_on='id', copy=False)
users_checkins['date'] = pd.to_datetime(users_checkins['datetime'])
users_checkins = users_checkins.drop('datetime', axis=1)
users_checkins.sort_values(by=['user_sid', 'date'], inplace=True)
users_checkins.tail()

Unnamed: 0,user_sid,userid,placeid,id,place_sid,date
3803810,93573,736841,7389396,7389396,257378,2011-05-23 19:59:36+00:00
3803809,93573,736841,7389396,7389396,257378,2011-05-24 11:16:00+00:00
3803808,93573,736841,7389396,7389396,257378,2011-05-30 12:19:54+00:00
3803807,93573,736841,7389396,7389396,257378,2011-06-02 11:29:26+00:00
3803806,93573,736841,7389396,7389396,257378,2011-06-09 12:19:36+00:00


Podemos ver que hay un problema: Muchas veces se repiten los POI consecutivos de un usuario, sin embargo eso no aporta mucha información al embedding, por lo que los eliminaremos en el siguiente paso

In [28]:
users_checkins['last_place_sid'] = users_checkins['place_sid'].shift(1)
users_checkins = users_checkins[users_checkins.place_sid != users_checkins.last_place_sid]
users_checkins = users_checkins.drop('last_place_sid', axis=1)

In [29]:
users_checkins.tail()

Unnamed: 0,user_sid,userid,placeid,id,place_sid,date
3803805,93572,2481029,7421054,7421054,257398,2011-05-25 03:43:07+00:00
3803797,93572,2481029,7289932,7289932,257312,2011-06-14 04:24:21+00:00
3803801,93572,2481029,7421054,7421054,257398,2011-06-15 04:07:45+00:00
3803796,93572,2481029,7289932,7289932,257312,2011-06-21 04:05:43+00:00
3803813,93573,736841,7389396,7389396,257378,2011-05-17 15:02:18+00:00


In [30]:
from collections import defaultdict

In [31]:
user_poi_seq = defaultdict(lambda: [])
user_date_seq = defaultdict(lambda: [])

In [32]:
for user_sid, place_sid, date in zip(users_checkins['user_sid'], users_checkins['place_sid'], users_checkins['date']):
    user_poi_seq[user_sid].append(place_sid)
    user_date_seq[user_sid].append(date)

Generamos secuencias de puntos de interes visitados por usuarios de un largo predefinido para entrenar el modelo de embeddings

In [33]:
from random import sample

poi_sequence_dataset = []

for user_sid, sequence in user_poi_seq.items():
    if len(sequence) < SEQUENCE_LENGTH: continue

    candidate_indexes = list(range(0, len(sequence) - SEQUENCE_LENGTH, SEQUENCE_LENGTH))

    n_sequences = min(len(candidate_indexes), MAX_SEQUENCES_PER_USER)
    start_indexes = sample(candidate_indexes, n_sequences)

    for idx in start_indexes:
        new_seq = sequence[idx:idx + SEQUENCE_LENGTH]
        poi_sequence_dataset.append(new_seq) 
    

In [34]:
unique_pois = { poi for sequence in  poi_sequence_dataset  for poi in sequence }

In [35]:
# 

# rands = np.random.rand(len(poi_sequence_dataset))

# total = 0
# for idx, seq in enumerate(poi_sequence_dataset):
#   if idx % 100000 == 0:
#     print(idx, '/', len(poi_sequence_dataset))

#   index = int(rands[idx] *  len(pois_list))
#   total += 33991 in seq[-5:]

# print(total / len(poi_sequence_dataset))
  

In [36]:
def split_list(input, frac=0.5):
    split_index = int(len(input) * frac)
    return input[:split_index], input[split_index:]

## Split de Datos

Realizamos separacion en train / test / split de 80 / 10 / 10

In [37]:
train_poi_sequence, rest = split_list(poi_sequence_dataset, 0.8)
test_poi_sequence, val_poi_sequence = split_list(rest)

In [38]:
def split_history_target(sequences):
    history = [ seq[:-1] for seq in sequences]
    targets = [ seq[-1] for seq in sequences]
    return history, targets

In [39]:
train_seq_history, train_seq_target = split_history_target(train_poi_sequence)
test_seq_history, test_seq_target = split_history_target(test_poi_sequence)
val_seq_history, val_seq_target = split_history_target(val_poi_sequence)

In [40]:
unique_pois = { poi for sequence in  poi_sequence_dataset  for poi in sequence }

print("Total pois incluidos en dataset:", len(unique_pois))
print("Porcentaje de POIs que se usaran en el modelo",  len(unique_pois) * 100 / len(pois))

Total pois incluidos en dataset: 249048
Porcentaje de POIs que se usaran en el modelo 96.73156919635055


In [41]:
print("Total Train", len(train_seq_history))
print("Total Test ", len(test_seq_history))
print("Total Val  ", len(val_seq_history))

Total Train 153673
Total Test  19209
Total Val   19210


Ya tenemos nuestros datos listos para entrenar!

# Modelo

In [42]:
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size=None, emb_dim=None, hidden_dim=None, sample_length=None):
        super(EmbeddingModel, self).__init__()

        print("AAA", vocab_size, emb_dim, hidden_dim, sample_length)
        
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.hidden = nn.Linear(sample_length * emb_dim, hidden_dim)
        self.hidden_activation = nn.ReLU()
        self.output = nn.Linear(hidden_dim, vocab_size)
        self.output_activation = nn.LogSoftmax(dim=-1)

    def forward(self, xs):
        batch_size = xs.size()[0]

        # embed and merge
        xs = self.emb(xs)
        xs = torch.reshape(xs, (batch_size, -1))

        # hidden layer
        hidden = self.hidden(xs)
        hidden = self.hidden_activation(hidden)

        # output log probabilities
        output_logits = self.output(hidden)
        output_log_probs = self.output_activation(output_logits)
        
        return output_log_probs

    def predict(self, xs):
        return torch.argmax(self.forward(xs))

# Entrenamiento

In [43]:
from torch.utils.data import DataLoader

def dataset_to_tensors(sequences, targets):
    return [[torch.tensor(x), torch.tensor(y)] for x, y in zip(sequences, targets)]


train_tensors = dataset_to_tensors(train_seq_history, train_seq_target)
train_dataloader = DataLoader(train_tensors, batch_size=BATCH_SIZE, shuffle=True)

test_tensors = dataset_to_tensors(test_seq_history, test_seq_target)
test_dataloader = DataLoader(test_tensors, batch_size=BATCH_SIZE, shuffle=True)

val_tensors = dataset_to_tensors(val_seq_history, val_seq_target)
val_dataloader = DataLoader(val_tensors, batch_size=BATCH_SIZE, shuffle=True)

In [44]:
class EarlyStopper:
    def __init__(self, skip_first_n=2, patience=5, min_delta=0.05):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf
        self.total_counter = 0
        self.skip_first_n = skip_first_n

    def early_stop(self, validation_loss):
        self.total_counter += 1
        if self.total_counter <= self.skip_first_n: return

        if validation_loss < self.min_validation_loss - self.min_delta:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        self.counter += 1
        return self.counter >= self.patience

In [45]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Trainer:
    BASE_LR = 1e-2
    EPOCHS = 100
    PRINT_EVERY = 100
    VAL_EVERY = 500

    def __init__(self, vocab_size, embedding_dim, hidden_dim, sequence_length, train_dataloader, val_dataloader):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.sequence_length = sequence_length
        self.model = EmbeddingModel(
            vocab_size=self.vocab_size, emb_dim=self.embedding_dim, 
            hidden_dim=self.hidden_dim, sample_length=self.sequence_length
        )

        self.loss = nn.NLLLoss()
        # self.optimizer = optim.SGD(self.model.parameters(), lr=self.BASE_LR)
        self.optimizer = optim.Adam(self.model.parameters(), amsgrad=True, lr=self.BASE_LR)
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.1, patience=2, verbose=True)
        self.stopper = EarlyStopper(patience=6, min_delta=0.1)
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader

    
    def train(self):
        run = wandb.init(project="proyecto-recsys", 
            name=f"emb-{self.embedding_dim}-hidden-{self.hidden_dim}",
            config={
                "vocab_size": self.vocab_size,
                "embedding_dim": self.embedding_dim,
                "hidden_dim": self.hidden_dim,
                "sequence_length": self.sequence_length,
                "epochs": self.EPOCHS
            })

        wandb.watch(self.model)

        self.model.to(device)

        for epoch in range(self.EPOCHS):    
            stopped = self._train_epoch(epoch)
            
            if stopped:
                print(f"Early stopping at epoch {epoch}")
                break

        run.finish()
        return self.model

    def _train_epoch(self, epoch):
        print(f"Training model on epoch {epoch}")
        i = 1
        losses = []
        for xs, ys in self.train_dataloader:
            self.optimizer.zero_grad()
            
            xs, ys = xs.to(device), ys.to(device)

            output = self.loss(self.model(xs), ys)
            output.backward()
            self.optimizer.step()

            losses.append(output.item())

            if i % self.PRINT_EVERY == 0:
                avg_loss = sum(losses) / len(losses)
                losses = []
                wandb.log({"train_loss": avg_loss, "epoch": epoch, "step": i})
            
            i += 1

        print("\nEvaluating model on val set ...")
        self.model.eval()

        with torch.no_grad():
            val_iter = iter(self.val_dataloader)

            val_losses = []            
            for xs, ys in tqdm.tqdm(val_iter, total=len(val_iter)):
                xs, ys = xs.to(device), ys.to(device)
                output = self.loss(self.model(xs), ys)
                val_losses.append(output.item())

        avg_val_loss = sum(val_losses) / len(val_losses)

        wandb.log({"val_loss": avg_val_loss, "epoch": epoch, "step": i, 
                    "lr" : self.optimizer.param_groups[0]['lr']})
        
        self.scheduler.step(avg_val_loss)
        stop = self.stopper.early_stop(avg_val_loss)
        if stop: return True
        
        self.model.train()


### Parameters

In [46]:
import itertools

emb_dims = reversed([16, 32])
hidden_dims = reversed([10, 20, 40, 80, 150])

alt = [[64, 80], [64, 40], [64, 20], [64, 10]]

for emb_dim, hidden_dim in alt + list(itertools.product(emb_dims, hidden_dims)):
    trainer = Trainer(vocab_size=len(pois), embedding_dim=emb_dim, hidden_dim=hidden_dim, sequence_length=SEQUENCE_LENGTH - 1, 
                    train_dataloader=train_dataloader, val_dataloader=val_dataloader)
    trainer.train()

AAA 257463 64 80 13


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrafafdz[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training model on epoch 0

Evaluating model on val set ...


100%|██████████| 301/301 [00:01<00:00, 224.27it/s]


Training model on epoch 1

Evaluating model on val set ...


100%|██████████| 301/301 [00:01<00:00, 234.68it/s]


Training model on epoch 2

Evaluating model on val set ...


100%|██████████| 301/301 [00:01<00:00, 226.55it/s]


Training model on epoch 3

Evaluating model on val set ...


100%|██████████| 301/301 [00:01<00:00, 226.70it/s]


Epoch 00004: reducing learning rate of group 0 to 1.0000e-03.
Training model on epoch 4

Evaluating model on val set ...


100%|██████████| 301/301 [00:01<00:00, 226.07it/s]


Training model on epoch 5

Evaluating model on val set ...


100%|██████████| 301/301 [00:01<00:00, 229.01it/s]


Training model on epoch 6

Evaluating model on val set ...


100%|██████████| 301/301 [00:01<00:00, 228.39it/s]


Epoch 00007: reducing learning rate of group 0 to 1.0000e-04.
Training model on epoch 7

Evaluating model on val set ...


100%|██████████| 301/301 [00:01<00:00, 244.60it/s]


Training model on epoch 8

Evaluating model on val set ...


100%|██████████| 301/301 [00:01<00:00, 230.22it/s]

Early stopping at epoch 8





0,1
epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
lr,████▂▂▂▁▁
step,▁▃▄▆█▂▄▅▇▁▃▅▇▁▂▄▆█▂▄▅▇▁▃▅▆█▂▄▆▇▂▃▅▆█▃▄▆█
train_loss,▇▇███▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,▁█▅▆▅▅▅▅▅

0,1
epoch,8.0
lr,0.0001
step,2403.0
train_loss,11.05889
val_loss,16.95464


AAA 257463 64 40 13


Training model on epoch 0


In [None]:
def accuracy(model, test_seqs, test_targets):
    model = model.to(device)
    batch_size = 64
    corrects = 0
    for index in range(0, len(test_seqs), batch_size):
        x = torch.tensor(test_seqs[idx:idx + batch_size]).to(device)
        y = torch.tensor(test_targets[idx:idx + batch_size]).to(device)
        pred = model.predict(x)
        corrects += torch.sum(pred == y).item()
    return  corrects / len(test_seqs)

In [None]:
accuracy(lm, test_seq_history, test_seq_target)

0.0