# Dependencies

In [None]:
!pip install binary gensim==3.6.0

# Libraries

In [1]:
import pandas as pd
import numpy as np
import csv
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from time import time  # To time our operations

#import logging  # Setting up the loggings to monitor gensim
#logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

## Pipeline Training

In [None]:
# Train one skip-gram Word2Vec model for every (OSM table, n, weight) combination.
# For each combination the tuple CSV is read, rows containing missing values are
# dropped, and every remaining (center_poi, context_osm) pair is used as a
# two-token "sentence". Each trained model is saved under the same file stem as
# its input CSV, with a `.model` extension.

cores = multiprocessing.cpu_count()   # Number of CPU cores reported by the machine
# Keep one core free, but never request fewer than one worker: on a single-core
# host `cores - 1` is 0 and gensim would be asked to train with zero threads.
workers = max(1, cores - 1)
t = time()  # Start time of the whole process

osm_tables = ['bins_polygons_information', 'bins_roads_information', 'bins_lines_information', 'bins_points_information']

# Hyperparameters are identical for every model, so they are defined once here
# instead of being repeated inside the innermost loop.
w2v_params = dict(min_count=5,
                  window=1,
                  sg=1,  # Skip-gram
                  size=70,
                  sample=6e-5,
                  alpha=0.03,
                  min_alpha=0.0007,
                  negative=20,
                  workers=workers)

# Weights 0.0, 0.1, ..., 1.0. They are derived from integers because a
# floating-point step makes the number of generated values depend on rounding.
weights = [round(tenth / 10, 1) for tenth in range(11)]

for osm_table in osm_tables:
    for n in range(9):
        for w in weights:
            print('Training model (tuple of weight):', w)

            # You must adjust the file name for binary relations generated with points
            # The CSV input and the saved model share the same file stem.
            base_name = './austin-sl-tuple-geoc2vec-' + str(n) + osm_table + '-wgt' + str(w) + 'pfp-c'
            file_name = base_name + '.csv'
            print("Loading file:", file_name)
            tuples = pd.read_csv(file_name)

            # Removing damaged rows (any row with a missing value in any column)
            tuples = tuples.dropna()

            # Using only the types of PoIs to create sentences in Word2Vec
            tuples = tuples[['center_poi', 'context_osm']]

            # Adapting to Word2Vec sentences: one [center_poi, context_osm] list per row
            sentencesTuples = tuples.values.tolist()

            # Creating skip-gram model
            p2v_modeltp = Word2Vec(**w2v_params)

            # Building vocabulary
            p2v_modeltp.build_vocab(sentencesTuples, progress_per=10000)

            # Training the model
            p2v_modeltp.train(sentencesTuples, total_examples=p2v_modeltp.corpus_count, epochs=15, report_delay=1)

            # Saving to a file
            model_name = base_name + '.model'
            print('Saving file:', model_name)
            p2v_modeltp.save(model_name)

# After the loops `p2v_modeltp` holds only the model trained last.
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
print('Process finish.')

In [63]:
# Show the vocabulary entries most similar to the query term in the model
# currently bound to `p2v_modeltp`.
# NOTE(review): this cell's execution count suggests it was run out of order;
# confirm which trained model `p2v_modeltp` refers to when re-running.
query_term = 'Dentists'
p2v_modeltp.wv.most_similar(positive=[query_term])

[('General Dentistry', 0.9591137766838074),
 ('Cosmetic Dentists', 0.9166595339775085),
 ('Orthodontists', 0.8655418753623962),
 ('Oral Surgeons', 0.8523178696632385),
 ('Hair Removal', 0.8509488701820374),
 ('Health & Medical', 0.823373556137085),
 ('Beauty & Spas', 0.8233371376991272),
 ('Pediatric Dentists', 0.806435227394104),
 ('Skin Care', 0.7864800691604614),
 ('Doctors', 0.768817126750946)]