# Libraries

In [2]:
import pandas as pd
import numpy as np
import psycopg2 as postgres
import psycopg2.extras
import csv
import random
from sshtunnel import SSHTunnelForwarder
from tqdm import tqdm
import ast

# Util Functions

In [3]:
def executeQuery(conn, sql):
    """
    Executes a SQL query on a PostgreSQL database and fetches every row.

    Args:
        conn (psycopg2.extensions.connection): A connection object to the database.
        sql (str): The SQL query to execute.

    Returns:
        list | None: The rows returned by ``cursor.fetchall()``. ``None`` is
        returned when psycopg2 reports an error; in that case the error is
        printed and the current transaction is rolled back.
    """
    record = None
    cur = None
    try:
        cur = conn.cursor()
        cur.execute(sql)
        record = cur.fetchall()
    except psycopg2.Error as e:
        print(e)
        # Roll back on the connection rather than through the cursor: the
        # cursor may not exist (conn.cursor() itself failed) or may be unusable.
        if not conn.closed:
            conn.rollback()
    finally:
        # Always release the cursor, including when a non-psycopg2 exception
        # propagates to the caller.
        if cur is not None:
            cur.close()
    return record

In [4]:
def getPOIInformation(conn, business_id):
    """
    Retrieves information about a Point of Interest (POI) based on its ID.

    Args:
        conn (psycopg2.extensions.connection): A connection object to the database.
        business_id (str): The unique ID of the POI.

    Returns:
        list | None: Whatever ``executeQuery`` returns for the lookup: the rows
        (``checkin_count``, ``name``) of ``pois_information`` whose ``id``
        equals ``business_id``, or ``None`` if the query failed.
    """
    # Local import so this cell does not depend on a change to the imports cell.
    from psycopg2 import sql as pgsql

    # Compose the statement with a quoted literal instead of concatenating the
    # id into the SQL text, so an id containing a quote cannot break the query.
    query = pgsql.SQL(
        "SELECT checkin_count, name FROM pois_information WHERE id = {};"
    ).format(pgsql.Literal(str(business_id)))

    return executeQuery(conn, query)

In [5]:
# Builds a (center POI, context POI) corpus from the precomputed shortest paths of one bin.

def generate_path_corpus_all(bin_number, size):
    """
    Builds the (center POI, context POI) corpus for one bin of shortest paths.

    Reads ``./geographic/SHORTEST PATH/austin-shortest_path-<bin_number>-FT-c_gpu.csv``
    (one path of POI ids per CSV row), looks every POI up with
    ``getPOIInformation`` and, for each POI of a path, pairs it with the POIs
    that follow it inside the window. One CSV row is written per pair of
    differing names to
    ``./geographic/SHORTEST PATH CORPUS/austin-shortest_path_corpus-<bin_number>-FT-c_gpu.csv``.

    Args:
        bin_number (int): The bin number; selects the input/output files and
            scales the window.
        size (int): Base window size; the window is ``(bin_number + 1) * size``.

    Returns:
        None: The corpus is written to disk, nothing is returned.
    """
    wnd = (bin_number + 1) * size

    # Read the input first so a missing file fails before any tunnel is opened.
    file_name = './geographic/SHORTEST PATH/austin-shortest_path-' + str(bin_number) + '-FT-c_gpu.csv'

    print("Loading file:", file_name)
    with open(file_name, newline='') as f:
        all_pois_path = list(csv.reader(f))

    file_name_corpus = './geographic/SHORTEST PATH CORPUS/austin-shortest_path_corpus-' + str(bin_number) + '-FT-c_gpu.csv'

    # FIXME(security): host and credentials are hard-coded; move them to
    # environment variables or a secrets store and rotate the password.
    # The context manager starts the tunnel, so no explicit start() is needed.
    with SSHTunnelForwarder(
        ('150.165.15.171', 23456),
        ssh_username="salatiel",
        ssh_password="Salatiel@Lacina123",
        remote_bind_address=('localhost', 5432)) as server:

        params = {'database': 'austin_test',
                  'user': 'salatiel',
                  'password': 'root',
                  'host': 'localhost',
                  'port': server.local_bind_port
                  }

        connection = psycopg2.connect(**params)
        try:
            with open(file_name_corpus, "w", newline='') as csv_file:
                writer = csv.writer(csv_file, delimiter=',')
                writer.writerow(["poi_id_center",
                                 "center_poi",
                                 "center_poi_level",
                                 "context_poi",
                                 "context_poi_level"])

                for poi_path in tqdm(all_pois_path):
                    # One entry per path position. A failed or empty lookup is
                    # kept as None so positions stay aligned with poi_path and
                    # the window never runs short.
                    # NOTE(review): rows are expected to be mapping-like and to
                    # expose 'id', 'name' and 'level'; the query visible in
                    # getPOIInformation selects only checkin_count and name —
                    # confirm the query/cursor match this usage.
                    poi_frames = []
                    for poi_id in poi_path:
                        poi_information = getPOIInformation(connection, poi_id)
                        if poi_information:
                            columns = list(dict(poi_information[0]).keys())
                            poi_frames.append(pd.DataFrame(poi_information, columns=columns))
                        else:
                            poi_frames.append(None)

                    # Binary associations: every POI is the target once and is
                    # paired with the (up to) wnd POIs that follow it.
                    for center_idx, target_types in enumerate(poi_frames):
                        if target_types is None:
                            continue
                        for neighbor_types in poi_frames[center_idx + 1:center_idx + wnd + 1]:
                            if neighbor_types is None:
                                continue
                            for id_t, tg in target_types.iterrows():
                                for id_n, ng in neighbor_types.iterrows():
                                    if tg['name'] != ng['name']:
                                        line = [str(tg['id']),
                                                str(tg['name']),
                                                str(tg['level']),
                                                str(ng['name']),
                                                str(ng['level'])]
                                        writer.writerow(line)
        finally:
            # Release the connection even if a lookup or write fails.
            connection.close()

    return None

In [11]:
def generate_path_corpus_zoning_all(all_pois_path, size):
    """
    Builds the (center POI, context POI) corpus for zoning-oriented shortest paths.

    Every POI id of every path is looked up with ``getPOIInformation``. Each
    POI is then paired with the POIs that follow it inside a window of
    ``size`` positions, and one CSV row is written per pair of differing
    names to ``./austin-shortest_path_corpus-zoning-k<size>-FT.csv``.

    Args:
        all_pois_path (list): A list of paths, each a sequence of POI ids.
        size (int): The size of the window after the center POI.

    Returns:
        None: The corpus is written to disk, nothing is returned.
    """
    wnd = size

    file_name_corpus = './austin-shortest_path_corpus-zoning-k' + str(wnd) + '-FT.csv'

    # FIXME(security): host and credentials are hard-coded; move them to
    # environment variables or a secrets store and rotate the password.
    # The context manager starts the tunnel, so no explicit start() is needed.
    with SSHTunnelForwarder(
        ('150.165.15.171', 23456),
        ssh_username="salatiel",
        ssh_password="Salatiel@Lacina123",
        remote_bind_address=('localhost', 5432)) as server:

        params = {'database': 'austin_test',
                  'user': 'salatiel',
                  'password': 'root',
                  'host': 'localhost',
                  'port': server.local_bind_port
                  }

        connection = psycopg2.connect(**params)
        try:
            with open(file_name_corpus, "w", newline='') as csv_file:
                writer = csv.writer(csv_file, delimiter=',')
                writer.writerow(["poi_id_center",
                                 "center_poi",
                                 "center_poi_level",
                                 "context_poi",
                                 "context_poi_level"])

                for poi_path in tqdm(all_pois_path):
                    # One entry per path position. A failed or empty lookup is
                    # kept as None so positions stay aligned with poi_path and
                    # the window never runs short.
                    # NOTE(review): rows are expected to be mapping-like and to
                    # expose 'id', 'name' and 'level'; the query visible in
                    # getPOIInformation selects only checkin_count and name —
                    # confirm the query/cursor match this usage.
                    poi_frames = []
                    for poi_id in poi_path:
                        poi_information = getPOIInformation(connection, poi_id)
                        if poi_information:
                            columns = list(dict(poi_information[0]).keys())
                            poi_frames.append(pd.DataFrame(poi_information, columns=columns))
                        else:
                            poi_frames.append(None)

                    # Binary associations: every POI is the target once and is
                    # paired with the (up to) wnd POIs that follow it.
                    for center_idx, target_types in enumerate(poi_frames):
                        if target_types is None:
                            continue
                        for neighbor_types in poi_frames[center_idx + 1:center_idx + wnd + 1]:
                            if neighbor_types is None:
                                continue
                            for id_t, tg in target_types.iterrows():
                                for id_n, ng in neighbor_types.iterrows():
                                    if tg['name'] != ng['name']:
                                        line = [str(tg['id']),
                                                str(tg['name']),
                                                str(tg['level']),
                                                str(ng['name']),
                                                str(ng['level'])]
                                        writer.writerow(line)
        finally:
            # Release the connection even if a lookup or write fails.
            connection.close()

    return None

# Data Reading for Shortest Path Corpus Generation

In [9]:
# Load the shortest paths produced by the
# "Zoning-Oriented Shortest Path Algorithm-postgres" notebook.
file_name = './austin-shortest_path-zoning-FT.csv'

print("loading file:", file_name)
all_pois_path = []
with open(file_name, newline='') as f:
    # Each CSV row is one path; keep it as the list of its fields.
    all_pois_path.extend(csv.reader(f))

# Preview the first path.
all_pois_path[0]

loading file: ./geographic/SHORTEST PATH/austin-shortest_path-zoning-FT_gpu.csv


['q5Px_3plGESL2Bkdh2C42g',
 'xRFcUSm2FyBNfF6hVoRB5A',
 '6-HNU_8lBVytuVDAY-yIrw',
 'tkyXRqZ4r0iE4xv7J2CBzA',
 'ReoZ7TjusWClqEJqQEKT-w',
 'SYfE-YgxaNp1Xg462NMjEw',
 'b7ilYKdaFrxqmdRhq0IofA',
 'PSYKGYT1y5jwvRYkjIC1Pw',
 '5PhOJlCPHGjBEIQ7WLbRJw',
 'aKBJN4P5mtmyycD04FAi0g',
 'WM3DlKcaLLe8LSE7Oc33Tg',
 '6WMNSsKE9iwENGGpRCBcMQ',
 '9CLW_-8crptx7NJ2EDFO3Q',
 '2Mf-GisSSU58WdEB7AW_hA',
 '45ibpsh1mIwF6gZBycHOUg',
 'JNjt0fgRmwVwyfdN0YvZfw',
 'EBB3vTWxFG7ElJBc9IQCFQ',
 'x7ogrkbPvQn2OdhFQC6DAQ',
 'qlfFcoysbHe20OWXmDB1JQ',
 '6xLrSOG-nfWZ4iwKspf4OA',
 'yV1NfpKxiJMMrB4iyeaweA',
 't-_Z9w96IPmTsP6XkYc3Rw',
 'dxcHl17pVjen-SiHDeBiYw',
 'c8yJlUCueOUXV7driq31pw',
 'cn5jVlPb85Yt7ie0JnfW9w',
 'nkxjj0JMBU3bCadjgpPoLQ',
 'uVJ_vquU1PHXEHoZ9wYICw',
 'MgPvLXqkgLomCClYwHy2Dw',
 'r9P2d6A5UBoiMe4fYtlgQg',
 'IYzIsCpu9zK3fs1nz3UgEA',
 '6ULOwdJbJPlmNuHsmRCURA',
 '2xjEoFTFjo-xQlcLdW-Jcg',
 'HylKM5pURvTgK51qSHlrKA',
 'htpL7K2x19BdhJsAE9lWYA',
 'fLQCXGls7SeS5UgGKALoDA',
 'D0gVLzoHCDwXw6BNRYOScw',
 '-0pSig1ffET5-fr3p4LkWg',
 

# Word2Vec Sentence Generation Process

In [11]:
# Produces sentences by randomly selecting one POI type for every POI of each
# shortest path. Twelve independent random corpora (rnd = 0..11) are written.
for rnd in range(0, 12):
    # NOTE(review): `bin_number` is not defined in any visible cell; it must
    # already exist in the kernel. The output name is resolved before the long
    # loop so a missing definition fails immediately instead of after it.
    file_name = './austin-shortest_path_corpus-' + str(bin_number) + '-rnd'+str(rnd)+'-FT-c.csv'

    all_sentences = []

    # FIXME(security): credentials are hard-coded; move them to environment
    # variables or a secrets store.
    # The context manager starts the tunnel, so no explicit start() is needed.
    with SSHTunnelForwarder(
        ('localhost', 22),
        # ssh_private_key="</path/to/private/ssh/key>",
        ### a password is used here instead of a private key
        ssh_username="root",
        ssh_password="root",
        remote_bind_address=('localhost', 5432)
    ) as server:

        params = {
            'database': 'austin_test',
            'user': 'postgres',
            'password': 'root',
            'host': 'localhost',
            'port': server.local_bind_port
        }

        connection = psycopg2.connect(**params)
        try:
            for poi_path in tqdm(all_pois_path):
                poi_path_sentence = []
                for poi_id in poi_path:
                    poi_information = getPOIInformation(connection, poi_id)

                    # Skip POIs whose lookup failed (None) or returned no rows.
                    if not poi_information:
                        continue

                    # NOTE(review): rows are expected to be mapping-like and to
                    # expose 'name' and 'level' — confirm against the query in
                    # getPOIInformation.
                    columns = list(dict(poi_information[0]).keys())
                    poi_information = pd.DataFrame(poi_information, columns=columns)

                    # Draw one type per level (0..4) that has any type.
                    type_by_level = []
                    for level in range(5):
                        level_poi_information = poi_information[poi_information['level'] == level]
                        poi_types = list(level_poi_information['name'])
                        if poi_types:
                            type_by_level.append(random.choice(poi_types))

                    # Then draw the POI's word among the per-level picks; a POI
                    # with no type on any of these levels contributes nothing.
                    if type_by_level:
                        poi_path_sentence.append(random.choice(type_by_level))

                all_sentences.append(poi_path_sentence)
        finally:
            # Release the connection even if a lookup fails.
            connection.close()

    # One sentence (path) per CSV row; the file is closed on leaving the block.
    with open(file_name, "w", newline='') as csv_file_02:
        writer_path = csv.writer(csv_file_02, delimiter=',')
        writer_path.writerows(all_sentences)

100%|██████████| 22399/22399 [20:30<00:00, 18.20it/s]
100%|██████████| 22399/22399 [20:25<00:00, 18.28it/s]
100%|██████████| 22399/22399 [20:33<00:00, 18.16it/s]
100%|██████████| 22399/22399 [20:29<00:00, 18.22it/s]
 65%|██████▍   | 14473/22399 [13:19<08:16, 15.97it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 72%|███████▏  | 16060/22399 [14:46<08:43, 12.10it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Parallel Execution

In [7]:
# Load the POI table and drop incomplete rows; the shape is printed before and
# after so any rows removed by dropna() are visible.
pois_file_name = './austin-ml-updated.csv'
poi_raw = pd.read_csv(pois_file_name)
print(poi_raw.shape)
poi_df = poi_raw.dropna()
print(poi_df.shape)
poi_df.head()

(22399, 7)
(22399, 7)


Unnamed: 0,business_id,city,state,latitude,longitude,categories,checkin_count
0,N3_Gs3DnX4k9SgpwJxdEfw,Austin,TX,30.346169,-97.711458,"Shopping, Jewelry Repair, Appraisal Services, ...",14
1,tXvdYGvlEceDljN8gt2_3Q,Austin,TX,30.172706,-97.79992,"Barbers, Beauty & Spas",1
2,nTIhpR7MhsALPwg_Hh14EA,Austin,TX,30.326377,-97.704543,"Hotels, Hotels & Travel, Event Planning & Serv...",475
3,8XyEpVdAO0o6iVkVxkWosQ,Austin,TX,30.246465,-97.778738,"Home Services, Real Estate, Property Management",0
4,NVfOn7TdnHbaGH97CVB_Qg,Austin,TX,30.244902,-97.857409,"Chiropractors, Health & Medical",33


In [12]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

# One corpus is generated per window size; every size is an independent task.
size = [10, 20, 30, 40, 50, 60, 70, 80, 90]

# Step 1: start the worker pool; more workers than tasks would only sit idle.
# Step 2: run generate_path_corpus_zoning_all(all_pois_path, sz) for every
#         size; starmap blocks until all tasks have finished.
# Step 3: leaving the `with` block shuts the pool down, also when a task raises.
with mp.Pool(min(int(mp.cpu_count()), len(size))) as pool:
    pool.starmap(generate_path_corpus_zoning_all, [(all_pois_path, sz) for sz in size])

Number of processors:  20


100%|██████████| 105/105 [05:17<00:00,  3.03s/it]]
100%|██████████| 105/105 [08:48<00:00,  5.03s/it]t]
100%|██████████| 105/105 [11:58<00:00,  6.85s/it] ]
100%|██████████| 105/105 [15:09<00:00,  8.67s/it]  
100%|██████████| 105/105 [18:14<00:00, 10.42s/it]  
100%|██████████| 105/105 [21:30<00:00, 12.29s/it]  
100%|██████████| 105/105 [23:58<00:00, 13.70s/it]
100%|██████████| 105/105 [26:38<00:00, 15.23s/it] 
100%|██████████| 105/105 [29:06<00:00, 16.63s/it]
