#  Librerías

### Librerías pesadas
Para ejecutar solo una vez

In [21]:
import torch
import pandas as pd
import numpy as np

### Librerías livianas
Para ejecutar múltiples veces

# Gowalla Dataset

In [22]:
# IPython shell escape: runs the download-gowalla.sh helper from the current directory.
# NOTE(review): presumably it places the CSVs under download/gowalla/ (the path the
# loading cell reads from) — confirm against the script.
! ./download-gowalla.sh

Already Downloaded


### Cargar Datos

In [23]:
# Raw Gowalla tables, one CSV each.
users = pd.read_csv('download/gowalla/gowalla_userinfo.csv')
friends = pd.read_csv('download/gowalla/gowalla_friendship.csv')
checkins = pd.read_csv('download/gowalla/gowalla_checkins.csv')

# The spots (POI) table comes as two subset files, both read as ISO-8859-1.
pois_1 = pd.read_csv(
    'download/gowalla/gowalla_spots_subset1.csv',
    encoding='iso-8859-1',
)
pois_2 = pd.read_csv(
    'download/gowalla/gowalla_spots_subset2.csv',
    encoding='iso-8859-1',
)

# Stack both subsets; ignore_index=True renumbers the rows of the combined frame.
pois = pd.concat([pois_1, pois_2], ignore_index=True)


## Limpiar Datos

In [24]:
# Peek at five random rows of the raw users table.
users.sample(n=5)

Unnamed: 0,id,bookmarked_spots_count,challenge_pin_count,country_pin_count,highlights_count,items_count,photos_count,pins_count,province_pin_count,region_pin_count,state_pin_count,trips_count,friends_count,stamps_count,checkin_num,places_num
2100,3920,0,1,3,0,6,0,4,0,3,0,0,19,3,4,3
202482,1440236,0,5,1,0,4,1,6,0,1,0,0,6,27,34,27
171850,496925,1,1,0,1,3,0,1,0,0,0,1,8,0,0,0
189327,1034316,0,3,1,0,2,0,5,0,2,1,0,14,12,12,12
43067,91361,0,1,1,0,6,0,2,0,1,0,0,45,1,1,1


In [25]:
# Only the user id is needed downstream; keep it as a one-column DataFrame.
users = users.loc[:, ['id']]

In [26]:
# Peek at five random rows of the friendship table.
friends.sample(n=5)

Unnamed: 0,userid1,userid2
3298653,1491154,2512993
2639042,290177,244335
1516833,2223743,1045591
386752,17385,54290
3791490,674283,2447831


In [27]:
# Peek at five random rows of the check-ins table.
checkins.sample(n=5)

Unnamed: 0,userid,placeid,datetime
12864224,7710,39590,2009-12-27T02:36:09Z
10301539,4696,44850,2010-01-04T01:41:48Z
1064212,1173,32752,2011-06-17T20:37:16Z
2186108,79509,667397,2010-09-22T15:20:57Z
6795026,1315453,4523219,2010-10-26T10:32:26Z


In [28]:
# Peek at five random rows of the combined POI table.
pois.sample(n=5)

Unnamed: 0,id,created_at,lng,lat,photos_count,checkins_count,users_count,radius_meters,highlights_count,items_count,max_items_count,spot_categories,name,city_state,Unnamed: 5,Unnamed: 6
1576636,3840367,2010-09-16T19:03:44Z,-118.397191,34.067259,0.0,1.0,1.0,75.0,0.0,0.0,10.0,"[{'url': '/categories/366', 'name': 'Wedding'}]",,,,
229706,258113,2009-12-23T16:50:36Z,-122.347295,37.578458,1.0,21.0,8.0,75.0,0.0,1.0,10.0,"[{'url': '/categories/117', 'name': 'Salon & B...",,,,
1827170,6406215,2010-11-10T09:52:58Z,39.172296,21.559358,4.0,48.0,43.0,75.0,0.0,2.0,10.0,"[{'url': '/categories/36', 'name': 'Technology'}]",,,,
2496099,7249934,2011-04-18T14:19:04Z,114.959955,4.96058,0.0,2.0,1.0,35.0,0.0,0.0,10.0,"[{'url': '/categories/89', 'name': 'Craftsman'}]",,,,
2263441,6948667,2011-02-19T12:59:31Z,139.013055,34.753333,1.0,2.0,1.0,100.0,0.0,0.0,10.0,"[{'url': '/categories/48', 'name': 'Resort'}]",,,,


In [29]:
# Keep only the POI identifier and its coordinates.
pois = pois.loc[:, ['id', 'lat', 'lng']]

### Samplear Datos para Ahorrar Memoria

In [30]:
# NOTE: this cell overwrites `users`, `checkins` and `pois`. Re-running it without
# reloading the data shrinks the already-reduced frames again.
USER_FRAC = 0.2
SAMPLE_SEED = 42  # fixed seed so Restart & Run All selects the same user subset

print('Current users', len(users))
users = users.sample(frac=USER_FRAC, random_state=SAMPLE_SEED).reset_index(drop=True)
print('Reduced users', len(users))

# Keep only the check-ins made by the sampled users. A membership filter never
# duplicates check-in rows and avoids building a joined frame.
print('Current checkins', len(checkins))
checkins = checkins[checkins['userid'].isin(users['id'])].reset_index(drop=True)
print('Reduced checkins', len(checkins))

# Keep only the POIs that appear in at least one remaining check-in. Identical POI
# rows are collapsed, as the previous merge + drop_duplicates did.
print('Current POIs', len(pois))
pois = pois[pois['id'].isin(checkins['placeid'])].drop_duplicates().reset_index(drop=True)
print('Reduced POIs', len(pois))

Current users 407533
Reduced users 81507
Current checkins 36001959
Reduced checkins 7116275
Current POIs 2845888
Reduced POIs 1448301


### Agregar Datos

Agregamos ids secuenciales a los puntos de interés. Serán usados como sus ids únicos.

In [31]:
# Use the current row index of `pois` as the sequential POI id.
pois['place_sid'] = pois.index

In [32]:
# Show the first five POIs together with their new sequential ids.
pois.head(n=5)

Unnamed: 0,id,lat,lng,place_sid
0,8904,39.052318,-94.607499,0
1,8932,32.927662,-97.254356,1
2,8936,39.053318,-94.591995,2
3,8938,39.052824,-94.590311,3
4,8947,37.33188,-122.029631,4


In [33]:
# Show the first five sampled users.
users.head(n=5)

Unnamed: 0,id
0,2481254
1,2666671
2,203535
3,2290244
4,319470


In [34]:
# Use the current row index of `users` as the sequential user id.
users['user_sid'] = users.index 

In [35]:
# Attach each POI's columns (coordinates and place_sid) to its check-ins by matching
# pois.id with checkins.placeid, then drop the POI 'id' key column.
# NOTE: this overwrites `checkins`, so the cell is not safe to re-run on its own.
checkins = pois.merge(checkins, left_on='id', right_on='placeid').drop(columns='id')

In [36]:
# Attach each user's columns (including user_sid) to their check-ins by matching
# users.id with checkins.userid, then drop the user 'id' key column.
users_checkins = users.merge(checkins, left_on='id', right_on='userid').drop(columns='id')

In [37]:
# Parse the 'datetime' text into a 'date' column, drop the raw text, and order the
# rows by user and then by date.
users_checkins = (
    users_checkins
    .assign(date=pd.to_datetime(users_checkins['datetime']))
    .drop(columns='datetime')
    .sort_values(by=['user_sid', 'date'])
)

In [40]:
from collections import defaultdict

In [47]:
# Per-user accumulators keyed by user_sid; a missing key starts as an empty list.
user_poi_seq = defaultdict(list)   # user_sid -> visited place_sid values
user_date_seq = defaultdict(list)  # user_sid -> matching check-in dates

In [48]:
# Walk the check-ins in their current row order and append each visit to its user's
# POI list and date list.
for user_sid, place_sid, date in zip(
    *(users_checkins[column] for column in ('user_sid', 'place_sid', 'date'))
):
    user_poi_seq[user_sid].append(place_sid)
    user_date_seq[user_sid].append(date)

In [54]:
# One row per user: the number of visits in that user's POI sequence.
sizes = pd.DataFrame(list(map(len, user_poi_seq.values())))

In [64]:
from random import sample

Generamos secuencias de largo predefinido con los puntos de interés visitados por cada usuario, para entrenar el modelo de embeddings.

In [104]:
MAX_SEQUENCES_PER_USER = 200  # cap on windows taken from a single user
SEQUENCE_LENGTH = 12          # number of POIs in every generated window

poi_sequence_dataset = []

for user_sid, sequence in user_poi_seq.items():
    # A user with fewer visits than one full window contributes nothing.
    if len(sequence) < SEQUENCE_LENGTH:
        continue

    # Start offsets of the non-overlapping windows that fit entirely in the sequence.
    # The "+ 1" makes the stop bound inclusive of the last aligned start: without it a
    # sequence of exactly SEQUENCE_LENGTH visits produced no window, and any sequence
    # whose length is a multiple of SEQUENCE_LENGTH lost its final window.
    candidate_indexes = list(range(0, len(sequence) - SEQUENCE_LENGTH + 1, SEQUENCE_LENGTH))

    # NOTE: sample() is not seeded in this cell, so which windows are chosen and
    # their order can change between runs.
    n_sequences = min(len(candidate_indexes), MAX_SEQUENCES_PER_USER)
    start_indexes = sample(candidate_indexes, n_sequences)

    for idx in start_indexes:
        new_seq = sequence[idx:idx + SEQUENCE_LENGTH]
        poi_sequence_dataset.append(new_seq)
    

In [105]:
def split_list(input, frac=0.5):
    """Cut a sliceable sequence into two consecutive parts.

    Args:
        input: Sequence to cut; it is only measured with len() and sliced.
        frac: Fraction of the elements that goes into the first part. The cut
            position is int(len(input) * frac), i.e. truncated toward zero.

    Returns:
        A (first, second) tuple of slices of `input`.
    """
    cut = int(frac * len(input))
    first = input[:cut]
    second = input[cut:]
    return first, second

Realizamos la separación en train / test / validation de 80 / 10 / 10

In [106]:
# The first 80% of the sequences (in their current order) are the training set.
train_poi_sequence, rest = split_list(poi_sequence_dataset, frac=0.8)
# The remaining 20% is halved: first half test, second half validation.
test_poi_sequence, val_poi_sequence = split_list(rest, frac=0.5)

In [108]:
def split_history_target(sequences):
    """Separate every sequence into its prefix and its final element.

    Args:
        sequences: Iterable of sliceable, indexable sequences.

    Returns:
        A (history, targets) tuple of lists: history[i] is sequences[i] without
        its last element, and targets[i] is that last element.
    """
    history = []
    for seq in sequences:
        history.append(seq[:-1])

    targets = []
    for seq in sequences:
        targets.append(seq[-1])

    return history, targets

In [109]:
# For each split, separate every sequence into its history (all but the last POI)
# and its target (the last POI).
train_seq_history, train_seq_target = split_history_target(sequences=train_poi_sequence)
test_seq_history, test_seq_target = split_history_target(sequences=test_poi_sequence)
val_seq_history, val_seq_target = split_history_target(sequences=val_poi_sequence)

In [111]:
# Set of the distinct POI ids that occur anywhere in the generated sequences.
q = {
    poi_id
    for visited in poi_sequence_dataset
    for poi_id in visited
}