# utils.py
import random

import numpy as np
from geopy.distance import geodesic  # vincenty() was removed in geopy 2.0; geodesic replaces it

def load_data(train_data_context_path, train_data_content_path):
    """
    Data format:
        train_context
            <poi_1> \t <poi_2> \t ...
        train_content
            <poi_1> \t <word_1> <word_2> ... <word_20>
    """
    with open(train_data_context_path) as f:
        train_context = [line.strip() for line in f]
    with open(train_data_content_path) as f:
        train_content = [line.strip() for line in f]
    return train_context, train_content
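
# A minimal usage sketch for load_data(); the paths below are hypothetical
# placeholders matching the formats described in the docstring:
#
#   train_context, train_content = load_data(
#       "data/train_context.txt", "data/train_content.txt")
#   # train_context[0] -> "poi_17\tpoi_3\tpoi_42"
#   # train_content[0] -> "poi_17\tcoffee cozy quiet ..."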


def indexing(_train_context, _train_content):
    """
    Index every POI and word, then convert the raw training strings
    into lists of integer ids.
    """
    train_context = []
    train_content = []

    # POI dictionary; id 0 is reserved for <PAD>
    poi2id = {"<PAD>": 0}
    id2poi = ["<PAD>"]
    for line in _train_context:
        tokens = line.split("\t")
        for poi in tokens:
            if poi not in poi2id:
                poi2id[poi] = len(poi2id)
                id2poi.append(poi)
        train_context.append([poi2id[poi] for poi in tokens])

    # Word dictionary; id 0 is reserved for <PAD>
    word2id = {"<PAD>": 0}
    id2word = ["<PAD>"]
    for line in _train_content:
        poi, content = line.split("\t")
        words = content.split()
        for word in words:
            if word not in word2id:
                word2id[word] = len(word2id)
                id2word.append(word)
        # A POI may appear in the content file without appearing in any
        # check-in sequence, so register it here as well.
        if poi not in poi2id:
            poi2id[poi] = len(poi2id)
            id2poi.append(poi)
        train_content.append((poi2id[poi], [word2id[word] for word in words]))

    return poi2id, id2poi, word2id, id2word, train_context, train_content
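
# A worked sketch of indexing() on made-up input (all POI and word strings
# here are hypothetical):
#
#   ctx, cnt = ["a\tb\ta"], ["a\tred red blue"]
#   poi2id, id2poi, word2id, id2word, tc, tn = indexing(ctx, cnt)
#   # poi2id  -> {"<PAD>": 0, "a": 1, "b": 2}
#   # word2id -> {"<PAD>": 0, "red": 1, "blue": 2}
#   # tc      -> [[1, 2, 1]]
#   # tn      -> [(1, [1, 1, 2])]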


def process_data(_train_context, _train_content, context_size, content_context_size):
    # Check-in sequences: pad both ends with <PAD> (id 0) so every POI has a
    # full window, then emit (target, context) skip-gram pairs, skipping pads.
    train_context = []
    for line in _train_context:
        pois = [0] * context_size + line + [0] * context_size
        for i in range(context_size, len(pois) - context_size):
            context = pois[i - context_size:i] + pois[i + 1:i + context_size + 1]
            for poi in context:
                if poi != 0:
                    train_context.append((pois[i], poi))

    # POI content: pair every word with every other word in the same
    # description. Note that content_context_size is unused as written;
    # the effective window is the whole description.
    train_content = []
    for line in _train_content:
        poi, words = line
        for i in range(len(words)):
            for j in range(len(words)):
                if i != j:
                    train_content.append((poi, words[i], words[j]))

    return train_context, train_content
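
# A worked sketch of process_data() with context_size=1 on the toy ids above
# (content_context_size is accepted but unused, so its value is arbitrary):
#
#   pairs_ctx, pairs_cnt = process_data([[1, 2, 1]], [(1, [1, 1, 2])], 1, 1)
#   # pairs_ctx -> [(1, 2), (2, 1), (2, 1), (1, 2)]
#   # pairs_cnt -> [(1, 1, 1), (1, 1, 2), (1, 1, 1),
#   #               (1, 1, 2), (1, 2, 1), (1, 2, 1)]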


def train_context_batches(train_context, batch_size):
    # Ceiling division: keeps a partial final batch without yielding an
    # empty one when the data size is an exact multiple of batch_size.
    batch_num = (len(train_context) + batch_size - 1) // batch_size
    random.shuffle(train_context)
    for i in range(batch_num):
        left = i * batch_size
        right = min(len(train_context), (i + 1) * batch_size)
        target, context = [], []
        for line in train_context[left:right]:
            target.append(line[0])
            context.append(line[1])
        yield target, context
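
# A minimal consumption sketch; batch_size=2 is an arbitrary illustrative
# choice:
#
#   for targets, contexts in train_context_batches(pairs_ctx, 2):
#       ...  # e.g. one skip-gram training step per batch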


def train_content_batches(train_content, batch_size):
    # Same ceiling-division batching as train_context_batches, for the
    # (poi, target_word, context_word) triples.
    batch_num = (len(train_content) + batch_size - 1) // batch_size
    random.shuffle(train_content)
    for i in range(batch_num):
        left = i * batch_size
        right = min(len(train_content), (i + 1) * batch_size)
        target_poi, target_word, context_word = [], [], []
        for line in train_content[left:right]:
            target_poi.append(line[0])
            target_word.append(line[1])
            context_word.append(line[2])
        yield target_poi, target_word, context_word
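

if __name__ == "__main__":
    # End-to-end smoke test on in-memory toy data (no files needed); every
    # string below is a hypothetical placeholder, not real dataset content.
    raw_context = ["home\twork\tcafe", "cafe\thome"]
    raw_content = ["cafe\tcoffee quiet wifi"]
    poi2id, id2poi, word2id, id2word, ctx, cnt = indexing(raw_context, raw_content)
    pairs_ctx, pairs_cnt = process_data(ctx, cnt, context_size=2,
                                        content_context_size=2)
    for targets, contexts in train_context_batches(pairs_ctx, batch_size=4):
        print("context batch:", targets, contexts)
    for pois, words, ctx_words in train_content_batches(pairs_cnt, batch_size=4):
        print("content batch:", pois, words, ctx_words)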