#  Librerías

### Librerías pesadas
Para ejecutar solo una vez

In [1]:
import torch
import pandas as pd
import numpy as np

### Librerías livianas
Para ejecutar múltiples veces

# Configuración General

Variables relacionadas al procesamiento de datos y del modelo en sí

### Variables de Preprocesamiento

In [2]:

# Porcentaje para usar solo una fracción del dataset de usuario.
# si al eliminar usuarios quedan viajes o POI sin visitas, estos también
# serán eliminados
USER_FRAC = 0.2

In [3]:
MIN_POI_VISITS = 5

MAX_SEQUENCES_PER_USER = 200
SEQUENCE_LENGTH = 10


# Gowalla Dataset

In [4]:
! ./download-gowalla.sh

Already Downloaded


### Cargar Datos

In [5]:
users    = pd.read_csv('download/gowalla/gowalla_userinfo.csv')
friends  = pd.read_csv('download/gowalla/gowalla_friendship.csv')
checkins = pd.read_csv('download/gowalla/gowalla_checkins.csv')
pois_1   = pd.read_csv('download/gowalla/gowalla_spots_subset1.csv', encoding='iso-8859-1')
pois_2   = pd.read_csv('download/gowalla/gowalla_spots_subset2.csv', encoding='iso-8859-1')
pois     = pd.concat((pois_1, pois_2), ignore_index=True)


# Preprocesamiento

### Usuarios

Revisamos la distrubución de checkins de usuarios

In [6]:
users.sample(5)

Unnamed: 0,id,bookmarked_spots_count,challenge_pin_count,country_pin_count,highlights_count,items_count,photos_count,pins_count,province_pin_count,region_pin_count,state_pin_count,trips_count,friends_count,stamps_count,checkin_num,places_num
186414,948570,0,7,2,0,8,10,9,0,2,0,0,24,30,52,30
34975,76531,0,1,0,0,5,0,1,0,0,0,0,1,0,0,0
111471,242313,0,7,1,0,10,0,14,0,7,6,0,14,41,50,41
82903,172995,0,3,1,0,2,0,4,0,1,0,0,5,10,17,10
380039,2567257,0,8,1,0,4,5,12,0,4,3,0,3,22,30,23


In [7]:
users.checkin_num.describe()

count    407533.000000
mean         88.341212
std         435.982581
min           0.000000
25%           1.000000
50%          10.000000
75%          52.000000
max       46981.000000
Name: checkin_num, dtype: float64

nos quedamos con un porcentaje de los usuarios y los filtramos los usuarios segun los checkins que tengan

In [8]:
print('Current users', len(users))
users = users.sample(frac=USER_FRAC)
users = users[users.checkin_num >= SEQUENCE_LENGTH]
users = users[['id']]
print('Reduced users', len(users))

Current users 407533
Reduced users 40874


In [9]:
checkins[checkins.userid ==  69996]

Unnamed: 0,userid,placeid,datetime
4223526,69996,269432,2011-06-01T19:10:19Z
4223527,69996,61280,2011-05-31T20:35:45Z
4223528,69996,49812,2011-05-30T23:29:12Z
4223529,69996,205779,2011-05-30T23:28:53Z
4223530,69996,6511613,2011-05-30T23:28:25Z
...,...,...,...
4224301,69996,282847,2009-12-28T12:48:22Z
4224302,69996,117306,2009-12-28T12:45:56Z
4224303,69996,150947,2009-12-27T22:37:39Z
4224304,69996,261322,2009-12-27T22:27:05Z


### Amigos

In [10]:
friends.sample(5)

Unnamed: 0,userid1,userid2
262841,11610,250986
549920,27125,291754
2054561,195756,113228
938616,67105,67138
739466,49428,1934


### Checkins

In [11]:
checkins.sample(5)

Unnamed: 0,userid,placeid,datetime
13717398,194444,792340,2010-08-30T13:39:17Z
3222342,104900,267123,2010-02-03T23:39:40Z
12164383,56034,64821,2011-02-06T14:25:54Z
18120944,2082171,454291,2011-03-27T16:09:40Z
31477178,371762,384990,2011-02-07T02:05:19Z


Eliminamos los checkins de los usuarios no sampleados

In [12]:
print('Current checkins', len(checkins))
checkins = pd.merge(checkins, users, how='inner', left_on='userid', right_on='id', copy=False)[checkins.columns]
checkins = checkins.reset_index(drop=True)
print('Reduced checkins', len(checkins))

Current checkins 36001959
Reduced checkins 7146627


### POIS

In [13]:
pois.sample(5)

Unnamed: 0,id,created_at,lng,lat,photos_count,checkins_count,users_count,radius_meters,highlights_count,items_count,max_items_count,spot_categories,name,city_state,Unnamed: 5,Unnamed: 6
2552903,7324828,2011-05-03T08:04:46Z,17.874774,59.577125,0.0,1.0,1.0,100.0,0.0,0.0,10.0,"[{'url': '/categories/170', 'name': 'Warehouse...",,,,
931988,1056931,2010-05-03T04:02:41Z,114.151622,22.286545,0.0,8.0,7.0,75.0,0.0,0.0,10.0,"[{'url': '/categories/106', 'name': 'Grocery'}]",,,,
2322948,7024989,2011-03-05T19:56:39Z,-111.918709,33.372716,1.0,2.0,2.0,150.0,0.0,0.0,10.0,"[{'url': '/categories/38', 'name': 'City Park'}]",,,,
2338818,7045468,2011-03-09T20:18:03Z,-84.674167,45.027508,0.0,2.0,1.0,75.0,0.0,0.0,10.0,"[{'url': '/categories/239', 'name': 'Accessori...",,,,
872715,988962,2010-04-22T05:07:08Z,-93.431999,37.28646,0.0,2.0,2.0,75.0,0.0,0.0,10.0,"[{'url': '/categories/57', 'name': 'Other - Tr...",,,,


Filtramos los pois si han sido visitadas pocas veces según los parámetros que definimos

In [14]:
visited_pois = pd.merge(pois, checkins, left_on='id', right_on='placeid', how='inner', copy=False)

In [15]:
visited_pois['visited_count'] = np.zeros(len(visited_pois))

visited_pois = visited_pois[['id', 'visited_count']].groupby(by='id').count()
visited_pois = visited_pois[visited_pois.visited_count >= MIN_POI_VISITS]

pois = pd.merge(pois, visited_pois, on='id', how='inner', copy=False)

Nos quedamos con sólo las columnas que nos importan

In [16]:
pois = pois[['id', 'lat', 'lng', 'visited_count']]
pois 

Unnamed: 0,id,lat,lng,visited_count
0,8904,39.052318,-94.607499,8
1,8932,32.927662,-97.254356,7
2,8936,39.053318,-94.591995,16
3,8938,39.052824,-94.590311,84
4,8947,37.331880,-122.029631,820
...,...,...,...,...
325647,7520940,40.099642,-8.505145,5
325648,7523299,51.484481,-0.312275,5
325649,7527671,27.390431,-82.511700,5
325650,7605188,37.366290,-122.080366,9


Ahora eliminamos los checkins de pois que ya no existen

In [17]:
print('Current checkins', len(checkins))
checkins = pd.merge(pois, checkins, left_on='id', right_on='placeid', how='inner', copy=False)[checkins.columns]
checkins = checkins.reset_index(drop=True)
print('Reduced checkins', len(checkins))

Current checkins 7146627
Reduced checkins 5154379


Finalmente eliminamos nuevamente a los usuarios que se quedaron sin suficientes checkins

In [18]:
print('Current users', len(users))
users = pd.merge(checkins, users, how='inner', left_on='userid', right_on='id', copy=False)[users.columns].drop_duplicates()
print('Reduced users', len(users))

Current users 40874
Reduced users 40648


### Asignamos ids unicos desde 0 a todas las tablas

In [19]:
users = users.reset_index(drop=True)
users['user_sid'] = users.index

pois = pois.reset_index(drop=True)
pois['place_sid'] = pois.index

### Agregar Datos

Crearemos un dataset unificado que usaremos para entrenar el modelo de los embeddings

In [20]:
pois.head()

Unnamed: 0,id,lat,lng,visited_count,place_sid
0,8904,39.052318,-94.607499,8,0
1,8932,32.927662,-97.254356,7,1
2,8936,39.053318,-94.591995,16,2
3,8938,39.052824,-94.590311,84,3
4,8947,37.33188,-122.029631,820,4


In [21]:
users.head()

Unnamed: 0,id,user_sid
0,26,0
1,665,1
2,246,2
3,366614,3
4,108,4


In [22]:
checkins.head()

Unnamed: 0,userid,placeid,datetime
0,26,8904,2009-05-07T15:31:44Z
1,665,8904,2009-07-30T01:53:10Z
2,665,8904,2009-07-11T22:24:49Z
3,246,8904,2009-11-01T18:46:07Z
4,246,8904,2009-10-20T02:38:25Z


In [27]:
users_checkins = pd.merge(users, checkins, left_on='id', right_on='userid', copy=False).drop('id', axis=1)
users_checkins = pd.merge(users_checkins, pois[['id', 'place_sid']], left_on='placeid', right_on='id', copy=False)
users_checkins['date'] = pd.to_datetime(users_checkins['datetime'])
users_checkins = users_checkins.drop('datetime', axis=1)
users_checkins.sort_values(by=['user_sid', 'date'], inplace=True)

In [28]:
users_checkins[['user_sid', 'place_sid']].head(20)

Unnamed: 0,user_sid,place_sid
3154,0,188
3153,0,188
3152,0,188
3197,0,189
3196,0,189
3205,0,190
3204,0,190
3203,0,190
3213,0,193
3212,0,193


In [25]:
users_checkins[['user_sid', 'place_sid']].tail(20)

Unnamed: 0,user_sid,place_sid
5154318,40646,325583
5154259,40646,325582
5154258,40646,325582
5154257,40646,325582
5154366,40646,325589
5154317,40646,325583
5154256,40646,325582
5154316,40646,325583
5154255,40646,325582
5154315,40646,325583


Podemos ver que hay un problema: Muchas veces se repiten los POI consecutivos de un usuario, sin embargo eso no aporta mucha información al embedding, por lo que los eliminaremos en el siguiente paso

In [30]:
users_checkins['last_place_sid'] = users_checkins['place_sid'].shift(1)
user_checkins = users_checkins[users_checkins.place_sid != users_checkins.last_place_sid]
user_checkins = user_checkins.drop('last_place_sid', axis=1)

In [31]:
user_checkins.tail()

Unnamed: 0,user_sid,userid,placeid,id,place_sid,date
5154315,40646,2615815,7361973,7361973,325583,2011-07-02 11:38:43+00:00
5154254,40646,2615815,7361926,7361926,325582,2011-07-03 02:46:12+00:00
5154314,40646,2615815,7361973,7361973,325583,2011-07-03 10:53:14+00:00
5154253,40646,2615815,7361926,7361926,325582,2011-07-03 23:47:06+00:00
5154378,40647,2599407,7502407,7502407,325642,2011-06-13 12:33:52+00:00


In [32]:
from collections import defaultdict

In [33]:
user_poi_seq = defaultdict(lambda: [])
user_date_seq = defaultdict(lambda: [])

In [34]:
for user_sid, place_sid, date in zip(users_checkins['user_sid'], users_checkins['place_sid'], users_checkins['date']):
    user_poi_seq[user_sid].append(place_sid)
    user_date_seq[user_sid].append(date)

Generamos secuencias de puntos de interes visitados por usuarios de un largo predefinido para entrenar el modelo de embeddings

In [36]:
from random import sample

poi_sequence_dataset = []

for user_sid, sequence in user_poi_seq.items():
    if len(sequence) < SEQUENCE_LENGTH: continue

    candidate_indexes = list(range(0, len(sequence) - SEQUENCE_LENGTH, SEQUENCE_LENGTH))

    n_sequences = min(len(candidate_indexes), MAX_SEQUENCES_PER_USER)
    start_indexes = sample(candidate_indexes, n_sequences)

    for idx in start_indexes:
        new_seq = sequence[idx:idx + SEQUENCE_LENGTH]
        poi_sequence_dataset.append(new_seq) 
    

In [44]:
def split_list(input, frac=0.5):
    split_index = int(len(input) * frac)
    return input[:split_index], input[split_index:]

## Split de Datos

Realizamos separacion en train / test / split de 80 / 10 / 10

In [45]:
train_poi_sequence, rest = split_list(poi_sequence_dataset, 0.8)
test_poi_sequence, val_poi_sequence = split_list(rest)

In [46]:
def split_history_target(sequences):
    history = [ seq[:-1] for seq in sequences]
    targets = [ seq[-1] for seq in sequences]
    return history, targets

In [47]:
train_seq_history, train_seq_target = split_history_target(train_poi_sequence)
test_seq_history, test_seq_target = split_history_target(test_poi_sequence)
val_seq_history, val_seq_target = split_history_target(val_poi_sequence)

In [53]:
unique_pois = { poi for sequence in  poi_sequence_dataset  for poi in sequence }

print("Total pois incluidos en dataset:", len(unique_pois))
print("Porcentaje de POIs que se usaran en el modelo",  len(unique_pois) * 100 / len(pois))

Total pois incluidos en dataset: 324406
Porcentaje de POIs que se usaran en el modelo 99.61738297323524


In [55]:
print("Total Train", len(train_seq_history))
print("Total Test ", len(test_seq_history))
print("Total Val  ", len(val_seq_history))

Total Train 364602
Total Test  45575
Total Val   45576


Ya tenemos nuestros datos listos para entrenar!