#  Librerías

### Librerías pesadas
Para ejecutar solo una vez

In [52]:
import math
import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np
import tqdm
import wandb
import random

In [53]:
!wandb login 99217068fbd71985701543b0c0064e805ac87449

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/rafael/.netrc


In [54]:
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)


### Librerías livianas
Para ejecutar múltiples veces

# Configuración General

Variables relacionadas al procesamiento de datos y del modelo en sí

### Variables de Preprocesamiento

In [55]:

# Porcentaje para usar solo una fracción del dataset de usuario.
# si al eliminar usuarios quedan viajes o POI sin visitas, estos también
# serán eliminados
USER_FRAC = 0.65
MIN_POI_VISITS = 5
MAX_SEQUENCES_PER_USER = 100
SEQUENCE_LENGTH = 14

In [56]:
BATCH_SIZE=64
EPOCHS=100

In [57]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 80

# Gowalla Dataset

In [58]:
! ./download-gowalla.sh

Already Downloaded


In [59]:
# !mkdir -p download
# !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=FILEID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=0BzpKyxX1dqTYRTFVYTd1UG81ZXc" -O download/gowalla.zip && rm -rf /tmp/cookies.txt
# !unzip download/gowalla.zip -d download

### Cargar Datos

In [60]:
users    = pd.read_csv('download/gowalla/gowalla_userinfo.csv')
friends  = pd.read_csv('download/gowalla/gowalla_friendship.csv')
checkins = pd.read_csv('download/gowalla/gowalla_checkins.csv')
pois_1   = pd.read_csv('download/gowalla/gowalla_spots_subset1.csv', encoding='iso-8859-1')
pois_2   = pd.read_csv('download/gowalla/gowalla_spots_subset2.csv', encoding='iso-8859-1')
pois     = pd.concat((pois_1, pois_2), ignore_index=True)

# Preprocesamiento

### Usuarios

Revisamos la distrubución de checkins de usuarios

In [61]:
users.sample(5)

Unnamed: 0,id,bookmarked_spots_count,challenge_pin_count,country_pin_count,highlights_count,items_count,photos_count,pins_count,province_pin_count,region_pin_count,state_pin_count,trips_count,friends_count,stamps_count,checkin_num,places_num
341610,2382731,0,1,1,0,0,0,2,0,1,0,0,2,2,2,2
116459,255417,1,14,3,0,6,0,17,0,3,0,0,7,661,1076,661
2173,4025,0,1,1,0,5,0,3,1,2,0,0,2,1,1,1
73987,152935,0,39,1,5,6,28,60,0,10,9,0,43,841,1215,844
191020,1084981,1,6,2,0,6,0,8,0,2,0,0,5,33,45,33


In [62]:
users.checkin_num.describe()

count    407533.000000
mean         88.341212
std         435.982581
min           0.000000
25%           1.000000
50%          10.000000
75%          52.000000
max       46981.000000
Name: checkin_num, dtype: float64

nos quedamos con un porcentaje de los usuarios y los filtramos los usuarios segun los checkins que tengan

In [63]:
users.checkin_num.describe()

count    407533.000000
mean         88.341212
std         435.982581
min           0.000000
25%           1.000000
50%          10.000000
75%          52.000000
max       46981.000000
Name: checkin_num, dtype: float64

In [64]:
print('Current users', len(users))
users = users.sample(frac=USER_FRAC)
users = users[(users.checkin_num >= users.checkin_num.quantile(0.1)) &
              (users.checkin_num <= users.checkin_num.quantile(0.9)) &
              (users.checkin_num >= SEQUENCE_LENGTH)]
users = users[['id']]
print('Reduced users', len(users))

Current users 407533
Reduced users 93889


### Amigos

In [65]:
friends.sample(5)

Unnamed: 0,userid1,userid2
4022741,2234455,398150
3922638,2398097,2147996
2872247,2430156,2417234
2386885,2345388,2285831
3278037,2531752,2224690


### Checkins

In [66]:
checkins.sample(5)

Unnamed: 0,userid,placeid,datetime
32287102,2178887,6468486,2010-12-17T05:09:22Z
4705834,98877,21192,2010-09-04T17:19:06Z
28854476,378467,5305167,2011-05-08T14:59:35Z
8786626,4267,84141,2010-01-02T10:44:26Z
11313982,80739,84407,2010-03-24T17:54:53Z


Eliminamos los checkins de los usuarios no sampleados

In [67]:
print('Current checkins', len(checkins))
checkins = pd.merge(checkins, users, how='inner', left_on='userid', right_on='id', copy=False)[checkins.columns]
checkins = checkins.reset_index(drop=True)
print('Reduced checkins', len(checkins))

Current checkins 36001959
Reduced checkins 5491669


### POIS

In [68]:
pois.sample(5)

Unnamed: 0,id,created_at,lng,lat,photos_count,checkins_count,users_count,radius_meters,highlights_count,items_count,max_items_count,spot_categories,name,city_state,Unnamed: 5,Unnamed: 6
153640,174460,2009-12-06T18:59:56Z,-75.137764,40.231966,3.0,33.0,22.0,75.0,0.0,3.0,10.0,"[{'url': '/categories/15', 'name': 'Mexican'}]",,,,
596687,668806,2010-03-09T02:07:15Z,-122.197139,37.463542,0.0,10.0,8.0,75.0,0.0,1.0,10.0,"[{'url': '/categories/166', 'name': 'Historic ...",,,,
2728830,554897,,-112.012533,33.304666,,,,,,,,,Music Makers,"Phoenix, AZ",,
2169462,6827452,2011-01-28T10:24:03Z,103.90482,1.329476,0.0,2.0,1.0,75.0,0.0,0.0,10.0,"[{'url': '/categories/18', 'name': 'Asian'}]",,,,
118096,135426,2009-11-26T20:08:21Z,-96.985616,32.84484,0.0,7.0,4.0,35.0,0.0,0.0,10.0,"[{'url': '/categories/89', 'name': 'Craftsman'}]",,,,


Filtramos los pois si han sido visitadas pocas veces según los parámetros que definimos

In [69]:
visited_pois = pd.merge(pois, checkins, left_on='id', right_on='placeid', how='inner', copy=False)

In [70]:
visited_pois['visited_count'] = np.zeros(len(visited_pois))

visited_pois = visited_pois[['id', 'visited_count']].groupby(by='id').count()
visited_pois = visited_pois[visited_pois.visited_count >= MIN_POI_VISITS]

pois = pd.merge(pois, visited_pois, on='id', how='inner', copy=False)

Nos quedamos con sólo las columnas que nos importan

In [71]:
pois = pois[['id', 'lat', 'lng', 'visited_count']]
pois 

Unnamed: 0,id,lat,lng,visited_count
0,8932,32.927662,-97.254356,6
1,8936,39.053318,-94.591995,5
2,8938,39.052824,-94.590311,25
3,8947,37.331880,-122.029631,344
4,8956,32.942655,-97.131200,9
...,...,...,...,...
256355,7519716,18.061014,-66.721559,5
256356,7527534,13.844906,100.855976,5
256357,7529626,51.435737,-3.174222,5
256358,7533476,35.847316,-78.805891,6


Ahora eliminamos los checkins de pois que ya no existen

In [72]:
print('Current checkins', len(checkins))
checkins = pd.merge(pois, checkins, left_on='id', right_on='placeid', how='inner', copy=False)[checkins.columns]
checkins = checkins.reset_index(drop=True)
print('Reduced checkins', len(checkins))

Current checkins 5491669
Reduced checkins 3775509


Finalmente eliminamos nuevamente a los usuarios que se quedaron sin suficientes checkins

In [73]:
print('Current users', len(users))
users = pd.merge(checkins, users, how='inner', left_on='userid', right_on='id', copy=False)[users.columns].drop_duplicates()
print('Reduced users', len(users))

Current users 93889
Reduced users 93653


### Reasignación de IDs

In [74]:
users = users.reset_index(drop=True)
users['user_sid'] = users.index

pois = pois.reset_index(drop=True)
pois['place_sid'] = pois.index

### Agregar Datos

Crearemos un dataset unificado que usaremos para entrenar el modelo de los embeddings

In [75]:
pois.head()

Unnamed: 0,id,lat,lng,visited_count,place_sid
0,8932,32.927662,-97.254356,6,0
1,8936,39.053318,-94.591995,5,1
2,8938,39.052824,-94.590311,25,2
3,8947,37.33188,-122.029631,344,3
4,8956,32.942655,-97.1312,9,4


In [76]:
pois.sort_values(by='visited_count', ascending=False)

Unnamed: 0,id,lat,lng,visited_count,place_sid
5424,23519,13.689897,100.748320,5876,5424
16339,55033,59.330158,18.058079,5283,16339
44029,155746,13.746659,100.534912,5071,44029
19684,66171,60.193511,11.098251,5040,19684
17359,58725,59.650051,17.932262,4804,17359
...,...,...,...,...,...
211598,6474356,65.497450,21.911010,5,211598
118205,732513,44.267990,-88.476317,5,118205
118201,732418,42.048607,-87.685467,5,118201
118193,732344,35.683227,139.615369,5,118193


In [77]:
users.head()

Unnamed: 0,id,user_sid
0,217738,0
1,344284,1
2,1808,2
3,312345,3
4,391806,4


In [78]:
checkins.head()

Unnamed: 0,userid,placeid,datetime
0,217738,8932,2010-07-25T18:13:48Z
1,217738,8932,2010-04-20T17:56:37Z
2,344284,8932,2010-11-13T20:07:16Z
3,1808,8932,2009-05-27T20:59:09Z
4,312345,8932,2010-08-02T01:17:30Z


In [79]:
users_checkins = pd.merge(users, checkins, left_on='id', right_on='userid', copy=False).drop('id', axis=1)
users_checkins = pd.merge(users_checkins, pois[['id', 'place_sid']], left_on='placeid', right_on='id', copy=False)
users_checkins['date'] = pd.to_datetime(users_checkins['datetime'])
users_checkins = users_checkins.drop('datetime', axis=1)
users_checkins.sort_values(by=['user_sid', 'date'], inplace=True)
users_checkins.tail()

Unnamed: 0,user_sid,userid,placeid,id,place_sid,date
3775505,93652,2679347,7511294,7511294,256353,2011-06-27 04:06:53+00:00
3775496,93652,2679347,7510411,7510411,256351,2011-06-28 15:33:48+00:00
3775504,93652,2679347,7511294,7511294,256353,2011-06-29 05:01:17+00:00
3775503,93652,2679347,7511294,7511294,256353,2011-06-30 04:10:25+00:00
3775495,93652,2679347,7510411,7510411,256351,2011-06-30 16:21:21+00:00


Podemos ver que hay un problema: Muchas veces se repiten los POI consecutivos de un usuario, sin embargo eso no aporta mucha información al embedding, por lo que los eliminaremos en el siguiente paso

In [80]:
users_checkins['last_place_sid'] = users_checkins['place_sid'].shift(1)
users_checkins = users_checkins[users_checkins.place_sid != users_checkins.last_place_sid]
users_checkins = users_checkins.drop('last_place_sid', axis=1)

In [81]:
users_checkins.tail()

Unnamed: 0,user_sid,userid,placeid,id,place_sid,date
3775497,93652,2679347,7510411,7510411,256351,2011-06-23 17:21:21+00:00
3775505,93652,2679347,7511294,7511294,256353,2011-06-27 04:06:53+00:00
3775496,93652,2679347,7510411,7510411,256351,2011-06-28 15:33:48+00:00
3775504,93652,2679347,7511294,7511294,256353,2011-06-29 05:01:17+00:00
3775495,93652,2679347,7510411,7510411,256351,2011-06-30 16:21:21+00:00


In [82]:
from collections import defaultdict

In [83]:
user_poi_seq = defaultdict(lambda: [])
user_date_seq = defaultdict(lambda: [])

In [84]:
for user_sid, place_sid, date in zip(users_checkins['user_sid'], users_checkins['place_sid'], users_checkins['date']):
    user_poi_seq[user_sid].append(place_sid)
    user_date_seq[user_sid].append(date)

Generamos secuencias de puntos de interes visitados por usuarios de un largo predefinido para entrenar el modelo de embeddings

In [85]:
from random import sample

poi_sequence_dataset = []

for user_sid, sequence in user_poi_seq.items():
    if len(sequence) < SEQUENCE_LENGTH: continue

    candidate_indexes = list(range(0, len(sequence) - SEQUENCE_LENGTH, SEQUENCE_LENGTH))

    n_sequences = min(len(candidate_indexes), MAX_SEQUENCES_PER_USER)
    start_indexes = sample(candidate_indexes, n_sequences)

    for idx in start_indexes:
        new_seq = sequence[idx:idx + SEQUENCE_LENGTH]
        poi_sequence_dataset.append(new_seq) 
    

In [86]:
unique_pois = { poi for sequence in  poi_sequence_dataset  for poi in sequence }

In [87]:
# 

# rands = np.random.rand(len(poi_sequence_dataset))

# total = 0
# for idx, seq in enumerate(poi_sequence_dataset):
#   if idx % 100000 == 0:
#     print(idx, '/', len(poi_sequence_dataset))

#   index = int(rands[idx] *  len(pois_list))
#   total += 33991 in seq[-5:]

# print(total / len(poi_sequence_dataset))
  

In [88]:
def split_list(input, frac=0.5):
    split_index = int(len(input) * frac)
    return input[:split_index], input[split_index:]

## Split de Datos

Realizamos separacion en train / test / split de 80 / 10 / 10

In [89]:
train_poi_sequence, rest = split_list(poi_sequence_dataset, 0.8)
test_poi_sequence, val_poi_sequence = split_list(rest)

In [90]:
def split_history_target(sequences):
    history = [ seq[:-1] for seq in sequences]
    targets = [ seq[-1] for seq in sequences]
    return history, targets

In [91]:
train_seq_history, train_seq_target = split_history_target(train_poi_sequence)
test_seq_history, test_seq_target = split_history_target(test_poi_sequence)
val_seq_history, val_seq_target = split_history_target(val_poi_sequence)

In [92]:
unique_pois = { poi for sequence in  poi_sequence_dataset  for poi in sequence }

print("Total pois incluidos en dataset:", len(unique_pois))
print("Porcentaje de POIs que se usaran en el modelo",  len(unique_pois) * 100 / len(pois))

Total pois incluidos en dataset: 247850
Porcentaje de POIs que se usaran en el modelo 96.68044936807614


In [93]:
print("Total Train", len(train_seq_history))
print("Total Test ", len(test_seq_history))
print("Total Val  ", len(val_seq_history))

Total Train 152232
Total Test  19029
Total Val   19029


Ya tenemos nuestros datos listos para entrenar!

# Modelo

In [94]:
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size=None, emb_dim=None, hidden_dim=None, sample_length=None):
        super(EmbeddingModel, self).__init__()

        print("AAA", vocab_size, emb_dim, hidden_dim, sample_length)
        
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.hidden = nn.Linear(sample_length * emb_dim, hidden_dim)
        self.hidden_activation = nn.ReLU()
        self.output = nn.Linear(hidden_dim, vocab_size)
        self.output_activation = nn.LogSoftmax(dim=-1)

    def forward(self, xs):
        batch_size = xs.size()[0]

        # embed and merge
        xs = self.emb(xs)
        xs = torch.reshape(xs, (batch_size, -1))

        # hidden layer
        hidden = self.hidden(xs)
        hidden = self.hidden_activation(hidden)

        # output log probabilities
        output_logits = self.output(hidden)
        output_log_probs = self.output_activation(output_logits)
        
        return output_log_probs

    def predict(self, xs):
        return torch.argmax(self.forward(xs))

# Entrenamiento

In [95]:
from torch.utils.data import DataLoader

def dataset_to_tensors(sequences, targets):
    return [[torch.tensor(x), torch.tensor(y)] for x, y in zip(sequences, targets)]


train_tensors = dataset_to_tensors(train_seq_history, train_seq_target)
train_dataloader = DataLoader(train_tensors, batch_size=BATCH_SIZE, shuffle=True)

test_tensors = dataset_to_tensors(test_seq_history, test_seq_target)
test_dataloader = DataLoader(test_tensors, batch_size=BATCH_SIZE, shuffle=True)

val_tensors = dataset_to_tensors(val_seq_history, val_seq_target)
val_dataloader = DataLoader(val_tensors, batch_size=BATCH_SIZE, shuffle=True)

In [96]:
class EarlyStopper:
    def __init__(self, skip_first_n=2, patience=5, min_delta=0.05):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf
        self.total_counter = 0
        self.skip_first_n = skip_first_n

    def early_stop(self, validation_loss):
        self.total_counter += 1
        if self.total_counter <= self.skip_first_n: return

        if validation_loss < self.min_validation_loss - self.min_delta:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        self.counter += 1
        return self.counter >= self.patience

In [97]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Trainer:
    BASE_LR = 1e-2
    EPOCHS = 100
    PRINT_EVERY = 100
    VAL_EVERY = 500

    def __init__(self, vocab_size, embedding_dim, hidden_dim, sequence_length, train_dataloader, val_dataloader):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.sequence_length = sequence_length
        self.model = EmbeddingModel(
            vocab_size=self.vocab_size, emb_dim=self.embedding_dim, 
            hidden_dim=self.hidden_dim, sample_length=self.sequence_length
        )

        self.loss = nn.NLLLoss()
        # self.optimizer = optim.SGD(self.model.parameters(), lr=self.BASE_LR)
        self.optimizer = optim.Adam(self.model.parameters(), amsgrad=True, lr=self.BASE_LR)
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.1, patience=2, verbose=True)
        self.stopper = EarlyStopper(patience=6, min_delta=0.1)
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader

    
    def train(self):
        run = wandb.init(project="proyecto-recsys", 
            name=f"emb-{self.embedding_dim}-hidden-{self.hidden_dim}",
            config={
                "vocab_size": self.vocab_size,
                "embedding_dim": self.embedding_dim,
                "hidden_dim": self.hidden_dim,
                "sequence_length": self.sequence_length,
                "epochs": self.EPOCHS
            })

        wandb.watch(self.model)

        self.model.to(device)

        for epoch in range(self.EPOCHS):    
            stopped = self._train_epoch(epoch)
            
            if stopped:
                print(f"Early stopping at epoch {epoch}")
                break

        run.finish()
        return self.model

    def _train_epoch(self, epoch):
        print(f"Training model on epoch {epoch}")
        i = 1
        losses = []
        for xs, ys in self.train_dataloader:
            self.optimizer.zero_grad()
            
            xs, ys = xs.to(device), ys.to(device)

            output = self.loss(self.model(xs), ys)
            output.backward()
            self.optimizer.step()

            losses.append(output.item())

            if i % self.PRINT_EVERY == 0:
                avg_loss = sum(losses) / len(losses)
                losses = []
                wandb.log({"train_loss": avg_loss, "epoch": epoch, "step": i})
            
            i += 1

        print("\nEvaluating model on val set ...")
        self.model.eval()

        with torch.no_grad():
            val_iter = iter(self.val_dataloader)

            val_losses = []            
            for xs, ys in tqdm.tqdm(val_iter, total=len(val_iter)):
                xs, ys = xs.to(device), ys.to(device)
                output = self.loss(self.model(xs), ys)
                val_losses.append(output.item())

        avg_val_loss = sum(val_losses) / len(val_losses)

        wandb.log({"val_loss": avg_val_loss, "epoch": epoch, "step": i, 
                    "lr" : self.optimizer.param_groups[0]['lr']})
        
        self.scheduler.step(avg_val_loss)
        stop = self.stopper.early_stop(avg_val_loss)
        if stop: return True
        
        self.model.train()


In [103]:
torch.save(trainer.model.emb.state_dict(), f"emb-{len(pois)}-16D.pt")

In [104]:
!ls

download	     emb-256360-16D.pt	readme.md	  venv
download-gowalla.sh  embeddings.ipynb	requirements.txt  wandb


### Parameters

In [98]:
import itertools

# emb_dims = [reversed([16, 32])]
# hidden_dims = reversed([10, 20, 40, 80, 150])

# alt = [[64, 80], [64, 40], [64, 20], [64, 10]]

emb_dims, hidden_dims = [16], [10]

for emb_dim, hidden_dim in itertools.product(emb_dims, hidden_dims):
    trainer = Trainer(vocab_size=len(pois), embedding_dim=emb_dim, hidden_dim=hidden_dim, sequence_length=SEQUENCE_LENGTH - 1, 
                    train_dataloader=train_dataloader, val_dataloader=val_dataloader)
    trainer.train()

AAA 256360 16 10 13


Training model on epoch 0

Evaluating model on val set ...


100%|████████████████████████████████████████████████████████████████████████████| 298/298 [00:00<00:00, 358.93it/s]


Training model on epoch 1

Evaluating model on val set ...


100%|████████████████████████████████████████████████████████████████████████████| 298/298 [00:00<00:00, 360.14it/s]


Training model on epoch 2

Evaluating model on val set ...


100%|████████████████████████████████████████████████████████████████████████████| 298/298 [00:00<00:00, 358.86it/s]


Training model on epoch 3

Evaluating model on val set ...


100%|████████████████████████████████████████████████████████████████████████████| 298/298 [00:00<00:00, 359.75it/s]


Epoch 00004: reducing learning rate of group 0 to 1.0000e-03.
Training model on epoch 4

Evaluating model on val set ...


100%|████████████████████████████████████████████████████████████████████████████| 298/298 [00:00<00:00, 359.37it/s]


Training model on epoch 5

Evaluating model on val set ...


100%|████████████████████████████████████████████████████████████████████████████| 298/298 [00:00<00:00, 360.01it/s]


Training model on epoch 6

Evaluating model on val set ...


100%|████████████████████████████████████████████████████████████████████████████| 298/298 [00:00<00:00, 358.98it/s]


Epoch 00007: reducing learning rate of group 0 to 1.0000e-04.
Training model on epoch 7

Evaluating model on val set ...


100%|████████████████████████████████████████████████████████████████████████████| 298/298 [00:00<00:00, 359.59it/s]


Training model on epoch 8

Evaluating model on val set ...


100%|████████████████████████████████████████████████████████████████████████████| 298/298 [00:00<00:00, 359.52it/s]

Early stopping at epoch 8





0,1
epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇█████
lr,████▂▂▂▁▁
step,▁▂▄▆▇▂▄▅▇▁▃▅▆█▂▄▅▇▂▄▅▇▁▃▄▆█▂▄▅▇▂▃▅▇▁▂▄▆█
train_loss,▆▆███▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,▁▄▆██████

0,1
epoch,8.0
lr,0.0001
step,2380.0
train_loss,11.07717
val_loss,16.0795


In [99]:
def accuracy(model, test_seqs, test_targets):
    model = model.to(device)
    batch_size = 64
    corrects = 0
    for index in range(0, len(test_seqs), batch_size):
        x = torch.tensor(test_seqs[idx:idx + batch_size]).to(device)
        y = torch.tensor(test_targets[idx:idx + batch_size]).to(device)
        pred = model.predict(x)
        corrects += torch.sum(pred == y).item()
    return  corrects / len(test_seqs)

In [100]:
accuracy(lm, test_seq_history, test_seq_target)

NameError: name 'lm' is not defined