# Libraries

In [3]:
import pandas as pd
import numpy as np
import csv
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from time import time  # To time our operations

import logging  # Setting up the loggings to monitor gensim
# Send INFO-level log records (gensim reports its progress through `logging`)
# to the notebook output, stamped with level and wall-clock time.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format="%(levelname)s - %(asctime)s: %(message)s",
)

# Training Pipeline

In [None]:
# Train one skip-gram embedding per corpus file (k = 10, 20, ..., 90) and save
# each trained model under the same stem as the corpus it was trained on.

cores = multiprocessing.cpu_count()   # Number of CPU cores on this machine
# Leave one core free, but never ask gensim for zero worker threads
# (cpu_count() == 1 would otherwise yield workers=0).
n_workers = max(1, cores - 1)
t = time()  # Start time of the whole sweep

# Looping through the k values encoded in the corpus file names
for wnd in range(10, 100, 10):
    print('Training model k:', wnd)

    # Corpus and model share the same stem; only the extension differs.
    base_name = './austin-shortest_path_corpus-zoning-k' + str(wnd) + '-FT'
    file_name = base_name + '.csv'

    print("Loading file:", file_name)
    # The corpus is stored as CSV (see the extension), so it must be parsed
    # with the CSV reader; the parquet reader cannot read a CSV text file.
    tuples = pd.read_csv(file_name)
    # Removing damaged rows (any row with a missing value in any column)
    tuples = tuples.dropna()
    tuples = tuples[['center_poi', 'context_poi']]
    # Each "sentence" is a two-token list: [center_poi, context_poi]
    sentences = tuples.values.tolist()

    # Creating skip-gram model.
    # window=1 is sufficient because every sentence holds exactly two tokens.
    # NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4 renamed it
    # to `vector_size` -- confirm the installed version before upgrading.
    p2v_modeltp = Word2Vec(min_count=1,
                           window=1,
                           sg=1,  # CBOW = 0, SKIPGRAM = 1
                           size=70,
                           sample=6e-5,
                           alpha=0.03,
                           min_alpha=0.0007,
                           negative=20,
                           workers=n_workers)

    # Building vocabulary
    p2v_modeltp.build_vocab(sentences, progress_per=10000)

    # Training the model
    p2v_modeltp.train(sentences, total_examples=p2v_modeltp.corpus_count, epochs=20, report_delay=1)

    model_name = base_name + '.model'

    print('Saving file:', model_name)
    p2v_modeltp.save(model_name)

# Report how long the full sweep took (uses the start time captured above).
print('Total training time: {} mins'.format(round((time() - t) / 60, 2)))

In [8]:
# Nearest neighbours of the 'Zoos' token in the embedding space.
# `p2v_modeltp` here is whichever model the training loop bound last,
# i.e. the one trained in the final iteration of the sweep.
zoo_neighbours = p2v_modeltp.wv.most_similar(positive=['Zoos'])
zoo_neighbours

INFO - 21:36:36: precomputing L2-norms of word weight vectors


[('Gutter Services', 0.9558095932006836),
 ('Plumbing', 0.9509575366973877),
 ('Septic Services', 0.949760913848877),
 ('Churches', 0.9384828805923462),
 ('Flea Markets', 0.932166576385498),
 ('Marinas', 0.9298552870750427),
 ('Water Heater Installation/Repair', 0.9289876818656921),
 ('Bus Rental', 0.925334095954895),
 ('Amusement Parks', 0.9248497486114502),
 ('Hydro-jetting', 0.9221491813659668)]