#  Librerías

### Librerías pesadas
Para ejecutar solo una vez

In [101]:
import math
import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np
import tqdm

### Librerías livianas
Para ejecutar múltiples veces

# Configuración General

Variables relacionadas al procesamiento de datos y del modelo en sí

### Variables de Preprocesamiento

In [2]:

# Porcentaje para usar solo una fracción del dataset de usuario.
# si al eliminar usuarios quedan viajes o POI sin visitas, estos también
# serán eliminados
USER_FRAC = 0.3
MIN_POI_VISITS = 5
MAX_SEQUENCES_PER_USER = 200
SEQUENCE_LENGTH = 14

In [93]:
BATCH_SIZE=64

In [94]:
EMBEDDING_DIM = 60
HIDDEN_DIM = 50

# Gowalla Dataset

In [4]:
! ./download-gowalla.sh

Already Downloaded


### Cargar Datos

In [5]:
users    = pd.read_csv('download/gowalla/gowalla_userinfo.csv')
friends  = pd.read_csv('download/gowalla/gowalla_friendship.csv')
checkins = pd.read_csv('download/gowalla/gowalla_checkins.csv')
pois_1   = pd.read_csv('download/gowalla/gowalla_spots_subset1.csv', encoding='iso-8859-1')
pois_2   = pd.read_csv('download/gowalla/gowalla_spots_subset2.csv', encoding='iso-8859-1')
pois     = pd.concat((pois_1, pois_2), ignore_index=True)


# Preprocesamiento

### Usuarios

Revisamos la distrubución de checkins de usuarios

In [6]:
users.sample(5)

Unnamed: 0,id,bookmarked_spots_count,challenge_pin_count,country_pin_count,highlights_count,items_count,photos_count,pins_count,province_pin_count,region_pin_count,state_pin_count,trips_count,friends_count,stamps_count,checkin_num,places_num
362514,2492511,0,2,1,0,0,0,3,0,1,0,0,1,2,2,2
230751,2090506,0,21,1,3,29,11,23,0,2,1,1,18,83,200,86
365629,2512718,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0
6672,12116,0,22,1,0,2,81,29,0,7,6,0,14,167,332,168
12685,21897,0,11,1,0,10,7,15,0,4,3,0,5,195,431,195


In [7]:
users.checkin_num.describe()

count    407533.000000
mean         88.341212
std         435.982581
min           0.000000
25%           1.000000
50%          10.000000
75%          52.000000
max       46981.000000
Name: checkin_num, dtype: float64

nos quedamos con un porcentaje de los usuarios y los filtramos los usuarios segun los checkins que tengan

In [8]:
print('Current users', len(users))
users = users.sample(frac=USER_FRAC)
users = users[users.checkin_num >= SEQUENCE_LENGTH]
users = users[['id']]
print('Reduced users', len(users))

Current users 407533
Reduced users 55357


In [9]:
checkins[checkins.userid ==  69996]

Unnamed: 0,userid,placeid,datetime
4223526,69996,269432,2011-06-01T19:10:19Z
4223527,69996,61280,2011-05-31T20:35:45Z
4223528,69996,49812,2011-05-30T23:29:12Z
4223529,69996,205779,2011-05-30T23:28:53Z
4223530,69996,6511613,2011-05-30T23:28:25Z
...,...,...,...
4224301,69996,282847,2009-12-28T12:48:22Z
4224302,69996,117306,2009-12-28T12:45:56Z
4224303,69996,150947,2009-12-27T22:37:39Z
4224304,69996,261322,2009-12-27T22:27:05Z


### Amigos

In [10]:
friends.sample(5)

Unnamed: 0,userid1,userid2
2113874,206015,159634
3783422,1128761,2526875
3076186,375966,2187455
1701141,146453,218318
1232589,2193003,48099


### Checkins

In [11]:
checkins.sample(5)

Unnamed: 0,userid,placeid,datetime
8242903,721,21625,2010-05-19T12:30:02Z
7704276,712669,862120,2011-03-18T17:31:46Z
13703935,76757,853557,2010-07-16T19:02:01Z
34175734,1326365,58725,2010-10-14T18:06:24Z
112316,9298,3417558,2010-09-19T01:04:34Z


Eliminamos los checkins de los usuarios no sampleados

In [12]:
print('Current checkins', len(checkins))
checkins = pd.merge(checkins, users, how='inner', left_on='userid', right_on='id', copy=False)[checkins.columns]
checkins = checkins.reset_index(drop=True)
print('Reduced checkins', len(checkins))

Current checkins 36001959
Reduced checkins 10603590


### POIS

In [13]:
pois.sample(5)

Unnamed: 0,id,created_at,lng,lat,photos_count,checkins_count,users_count,radius_meters,highlights_count,items_count,max_items_count,spot_categories,name,city_state,Unnamed: 5,Unnamed: 6
1654161,4835948,2010-10-02T21:01:03Z,13.018865,55.718491,1.0,2.0,1.0,75.0,0.0,0.0,10.0,"[{'url': '/categories/56', 'name': 'Other - Ni...",,,,
1712873,5541758,2010-10-15T20:35:43Z,-97.327077,32.754597,1.0,14.0,13.0,75.0,0.0,1.0,10.0,"[{'url': '/categories/59', 'name': 'Other - Ar...",,,,
614457,691214,2010-03-12T12:19:58Z,103.780616,1.283401,3.0,12.0,10.0,75.0,1.0,0.0,10.0,"[{'url': '/categories/17', 'name': 'BBQ'}]",,,,
417472,466110,2010-01-28T15:40:09Z,-77.030687,38.893391,0.0,33.0,29.0,75.0,0.0,2.0,10.0,"[{'url': '/categories/49', 'name': 'Luxury Hot...",,,,
1746108,5970648,2010-10-23T00:52:57Z,-77.005845,40.212998,0.0,11.0,8.0,100.0,0.0,3.0,10.0,"[{'url': '/categories/153', 'name': 'Other - C...",,,,


Filtramos los pois si han sido visitadas pocas veces según los parámetros que definimos

In [14]:
visited_pois = pd.merge(pois, checkins, left_on='id', right_on='placeid', how='inner', copy=False)

In [15]:
visited_pois['visited_count'] = np.zeros(len(visited_pois))

visited_pois = visited_pois[['id', 'visited_count']].groupby(by='id').count()
visited_pois = visited_pois[visited_pois.visited_count >= MIN_POI_VISITS]

pois = pd.merge(pois, visited_pois, on='id', how='inner', copy=False)

Nos quedamos con sólo las columnas que nos importan

In [16]:
pois = pois[['id', 'lat', 'lng', 'visited_count']]
pois 

Unnamed: 0,id,lat,lng,visited_count
0,8932,32.927662,-97.254356,20
1,8936,39.053318,-94.591995,14
2,8938,39.052824,-94.590311,53
3,8947,37.331880,-122.029631,951
4,8954,32.939436,-97.106009,34
...,...,...,...,...
480724,7517751,41.161991,-104.761128,8
480725,7523299,51.484481,-0.312275,5
480726,7523748,27.517234,-82.727902,7
480727,7526534,37.458915,-90.933160,9


Ahora eliminamos los checkins de pois que ya no existen

In [17]:
print('Current checkins', len(checkins))
checkins = pd.merge(pois, checkins, left_on='id', right_on='placeid', how='inner', copy=False)[checkins.columns]
checkins = checkins.reset_index(drop=True)
print('Reduced checkins', len(checkins))

Current checkins 10603590
Reduced checkins 8282199


Finalmente eliminamos nuevamente a los usuarios que se quedaron sin suficientes checkins

In [18]:
print('Current users', len(users))
users = pd.merge(checkins, users, how='inner', left_on='userid', right_on='id', copy=False)[users.columns].drop_duplicates()
print('Reduced users', len(users))

Current users 55357
Reduced users 55264


### Reasignación de IDs

In [19]:
users = users.reset_index(drop=True)
users['user_sid'] = users.index

pois = pois.reset_index(drop=True)
pois['place_sid'] = pois.index

### Agregar Datos

Crearemos un dataset unificado que usaremos para entrenar el modelo de los embeddings

In [20]:
pois.head()

Unnamed: 0,id,lat,lng,visited_count,place_sid
0,8932,32.927662,-97.254356,20,0
1,8936,39.053318,-94.591995,14,1
2,8938,39.052824,-94.590311,53,2
3,8947,37.33188,-122.029631,951,3
4,8954,32.939436,-97.106009,34,4


In [21]:
users.head()

Unnamed: 0,id,user_sid
0,3092,0
1,220,1
2,1496,2
3,325666,3
4,100440,4


In [22]:
checkins.head()

Unnamed: 0,userid,placeid,datetime
0,3092,8932,2010-02-20T01:08:20Z
1,220,8932,2009-12-30T05:56:37Z
2,1496,8932,2009-11-24T21:57:04Z
3,325666,8932,2011-03-21T00:35:28Z
4,325666,8932,2011-02-23T23:28:44Z


In [23]:
users_checkins = pd.merge(users, checkins, left_on='id', right_on='userid', copy=False).drop('id', axis=1)
users_checkins = pd.merge(users_checkins, pois[['id', 'place_sid']], left_on='placeid', right_on='id', copy=False)
users_checkins['date'] = pd.to_datetime(users_checkins['datetime'])
users_checkins = users_checkins.drop('datetime', axis=1)
users_checkins.sort_values(by=['user_sid', 'date'], inplace=True)

In [24]:
users_checkins[['user_sid', 'place_sid']].head(20)

Unnamed: 0,user_sid,place_sid
18737,0,8643
3212,0,93
12922,0,1395
18736,0,8643
19544,0,11438
5639,0,557
19691,0,11685
19185,0,10082
18735,0,8643
4521,0,324


In [25]:
users_checkins[['user_sid', 'place_sid']].tail(20)

Unnamed: 0,user_sid,place_sid
8282144,55263,480604
8282202,55263,480605
8282143,55263,480604
8282201,55263,480605
8282142,55263,480604
8282200,55263,480605
8282141,55263,480604
8282199,55263,480605
8282140,55263,480604
8282139,55263,480604


Podemos ver que hay un problema: Muchas veces se repiten los POI consecutivos de un usuario, sin embargo eso no aporta mucha información al embedding, por lo que los eliminaremos en el siguiente paso

In [26]:
users_checkins['last_place_sid'] = users_checkins['place_sid'].shift(1)
user_checkins = users_checkins[users_checkins.place_sid != users_checkins.last_place_sid]
user_checkins = user_checkins.drop('last_place_sid', axis=1)

In [27]:
user_checkins.tail()

Unnamed: 0,user_sid,userid,placeid,id,place_sid,date
8282136,55263,2615815,7361926,7361926,480604,2011-07-02 02:33:01+00:00
8282196,55263,2615815,7361973,7361973,480605,2011-07-02 11:38:43+00:00
8282135,55263,2615815,7361926,7361926,480604,2011-07-03 02:46:12+00:00
8282195,55263,2615815,7361973,7361973,480605,2011-07-03 10:53:14+00:00
8282134,55263,2615815,7361926,7361926,480604,2011-07-03 23:47:06+00:00


In [28]:
from collections import defaultdict

In [29]:
user_poi_seq = defaultdict(lambda: [])
user_date_seq = defaultdict(lambda: [])

In [30]:
for user_sid, place_sid, date in zip(users_checkins['user_sid'], users_checkins['place_sid'], users_checkins['date']):
    user_poi_seq[user_sid].append(place_sid)
    user_date_seq[user_sid].append(date)

Generamos secuencias de puntos de interes visitados por usuarios de un largo predefinido para entrenar el modelo de embeddings

In [31]:
from random import sample

poi_sequence_dataset = []

for user_sid, sequence in user_poi_seq.items():
    if len(sequence) < SEQUENCE_LENGTH: continue

    candidate_indexes = list(range(0, len(sequence) - SEQUENCE_LENGTH, SEQUENCE_LENGTH))

    n_sequences = min(len(candidate_indexes), MAX_SEQUENCES_PER_USER)
    start_indexes = sample(candidate_indexes, n_sequences)

    for idx in start_indexes:
        new_seq = sequence[idx:idx + SEQUENCE_LENGTH]
        poi_sequence_dataset.append(new_seq) 
    

In [32]:
def split_list(input, frac=0.5):
    split_index = int(len(input) * frac)
    return input[:split_index], input[split_index:]

## Split de Datos

Realizamos separacion en train / test / split de 80 / 10 / 10

In [33]:
train_poi_sequence, rest = split_list(poi_sequence_dataset, 0.8)
test_poi_sequence, val_poi_sequence = split_list(rest)

In [34]:
def split_history_target(sequences):
    history = [ seq[:-1] for seq in sequences]
    targets = [ seq[-1] for seq in sequences]
    return history, targets

In [35]:
train_seq_history, train_seq_target = split_history_target(train_poi_sequence)
test_seq_history, test_seq_target = split_history_target(test_poi_sequence)
val_seq_history, val_seq_target = split_history_target(val_poi_sequence)

In [36]:
unique_pois = { poi for sequence in  poi_sequence_dataset  for poi in sequence }

print("Total pois incluidos en dataset:", len(unique_pois))
print("Porcentaje de POIs que se usaran en el modelo",  len(unique_pois) * 100 / len(pois))

Total pois incluidos en dataset: 478639
Porcentaje de POIs que se usaran en el modelo 99.56524361958608


In [37]:
print("Total Train", len(train_seq_history))
print("Total Test ", len(test_seq_history))
print("Total Val  ", len(val_seq_history))

Total Train 414903
Total Test  51863
Total Val   51863


Ya tenemos nuestros datos listos para entrenar!

# Modelo

In [92]:
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size=None, emb_dim=None, hidden_dim=None, sample_length=None):
        super(EmbeddingModel, self).__init__()
        
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.hidden = nn.Linear(sample_length * emb_dim, hidden_dim)
        self.hidden_activation = nn.ReLU()
        self.output = nn.Linear(hidden_dim, vocab_size)
        self.output_activation = nn.LogSoftmax(dim=-1)

    def forward(self, xs):
        batch_size = xs.size()[0]

        # embed and merge
        xs = self.emb(xs)
        xs = torch.reshape(xs, (batch_size, -1))

        # hidden layer
        hidden = self.hidden(xs)
        hidden = self.hidden_activation(hidden)

        # output log probabilities
        output_logits = self.output(hidden)
        output_log_probs = self.output_activation(output_logits)
        
        return output_log_probs

    def predict(self, xs):
        return torch.argmax(self.forward(xs))

In [None]:
class Trainer:
    def __init__(self, model):
        

In [61]:
from torch.utils.data import DataLoader

def dataset_to_tensors(sequences, targets):
    return [[torch.tensor(x), torch.tensor(y)] for x, y in zip(sequences, targets)]


train_tensors = dataset_to_tensors(train_seq_history, train_seq_target)
train_dataloader = DataLoader(train_tensors, batch_size=BATCH_SIZE, shuffle=True)

test_tensors = dataset_to_tensors(test_seq_history, test_seq_target)
test_dataloader = DataLoader(test_tensors, batch_size=BATCH_SIZE, shuffle=True)

val_tensors = dataset_to_tensors(val_seq_history, val_seq_target)
val_dataloader = DataLoader(val_tensors, batch_size=BATCH_SIZE, shuffle=True)

In [108]:

# Globals
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
loss = nn.NLLLoss()

# Training Parameters
print_every = 200
test_every = 10000 # 2000
adjust_every = 15000 # 500
num_iterations = int(1e7)
epsilon_0 = 1e-3
r = 1e-8

# Tensorboard-related
# experiment_name = Path('act-{}_emb-{}_hidden-{}_sample-{}_batch-{}_optim-{}_lr-{}_adjust-every-{}'.format(
#     'relu',
#     emb_dim,
#     hidden_dim,
#     sample_length,
#     batch_size,
#     'sgd',
#     epsilon_0,
#     adjust_every
# ))
# experiment_path = Path('run') / experiment_name
# writer = SummaryWriter(experiment_path)

# Get the dataset

# Create the model
print("Initializing model ...")
lm = EmbeddingModel(
    vocab_size=len(pois),
    emb_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    sample_length=SEQUENCE_LENGTH - 1
)


lm.to(device)
optimizer = optim.SGD(lm.parameters(), lr=epsilon_0) # optim.Adam(lm.parameters(), amsgrad=True)

losses = []

train_iter = iter(train_dataloader)

print("Beginning training ...")
for i in range(num_iterations):    
    # Zero the parameter gradients
    optimizer.zero_grad()
    
    # Get the next batch
    xs, ys = next(train_iter)
    xs, ys = xs.to(device), ys.to(device)

    # Get the averaged batch loss
    output = loss(lm(xs), ys)

    # Run the backward pass (calculate gradients)
    output.backward()

    # Update the model
    optimizer.step()

    # Save and print statistics
    losses.append(output.item())

    if i % print_every == (print_every - 1):
        # Get average loss
        avg_loss = sum(losses) / len(losses)
        losses = []

        # Compute perplexity
        perp = math.exp(avg_loss)

        # Print and log the train loss
        print("({}) : {}".format((i + 1), perp))
        # writer.add_scalar('Training Loss', perp, i)

    if i % test_every == (test_every - 1):
        print("\nEvaluating model on test set ...")
        
        lm.eval()

        with torch.no_grad():
            test_iter = iter(test_dataloader)

            test_losses = []            
            for xs, ys in tqdm.tqdm(test_iter, total=len(test_iter)):
                xs, ys = xs.to(device), ys.to(device)
                output = loss(lm(xs), ys)
                test_losses.append(output.item())


        avg_test_loss = sum(test_losses) / len(test_losses)
        test_losses = []

        # Compute perplexity
        test_perp = math.exp(avg_test_loss)

        # Print and log the test loss
        print("Average Test Loss: {}\n".format(test_perp))
        # writer.add_scalar('Test Loss', test_perp, i)
                
        lm.train()

    # Update the learning rate every iteration, just as in the paper
    if i % adjust_every == (adjust_every - 1):
        for g in optimizer.param_groups:
            g['lr'] = epsilon_0 / (1 + r * (i / adjust_every))

Initializing model ...
Beginning training ...


KeyboardInterrupt: 