# Dependencies

In [2]:
!pip install pyarrow
!pip install psycopg2-binary



# Libraries

In [1]:
import math
import os

import numpy as np
import pandas as pd
import psycopg2 as postgres

# Utils

In [5]:
def connect(host=None, database=None, user=None, password=None):
    """
    Establishes a connection to a PostgreSQL database.

    Every setting is resolved in this order: the explicit argument, then the
    matching environment variable (PGHOST, PGDATABASE, PGUSER, PGPASSWORD),
    then the value this notebook originally hard-coded. Calling ``connect()``
    with no arguments and none of those variables set therefore behaves as
    before, while real credentials can be kept out of the notebook.

    Args:
        host (str, optional): Database host.
        database (str, optional): Database name.
        user (str, optional): Database username.
        password (str, optional): Database password.

    Returns:
        psycopg2.extensions.connection: A connection object to the database,
            or None if the connection could not be established (the error is
            printed).
    """
    # (keyword for psycopg2, explicit value, environment variable, legacy default)
    options = (
        ("host", host, "PGHOST", "localhost"),
        ("database", database, "PGDATABASE", "austin_test"),
        ("user", user, "PGUSER", "postgres"),
        ("password", password, "PGPASSWORD", "root"),
    )
    settings = {
        key: explicit if explicit is not None else os.environ.get(env_name, fallback)
        for key, explicit, env_name, fallback in options
    }

    try:
        # The driver is imported in this notebook as `postgres`
        # (import psycopg2 as postgres), so that is the name bound here.
        return postgres.connect(**settings)
    except postgres.Error as e:
        print(e)
        return None

In [6]:
def closeConnection(conn):
    """
    Closes the connection to a PostgreSQL database.

    Args:
        conn (psycopg2.extensions.connection): A connection object to the
            database. None is accepted and reported as a failure.

    Returns:
        bool: True if the connection was successfully closed, False otherwise.
    """
    # connect() returns None when it fails; there is nothing to close then.
    if conn is None:
        return False

    try:
        conn.close()
    except postgres.Error as e:  # driver is imported as `postgres` in this notebook
        print(e)
        return False

    return True

In [7]:
def executeQuery(conn, sql, params=None):
    """
    Executes a SQL query on a PostgreSQL database and fetches all rows.

    Args:
        conn (psycopg2.extensions.connection): A connection object to the database.
        sql (str): The SQL query to execute. May contain psycopg2 placeholders
            (``%s`` / ``%(name)s``) when ``params`` is given.
        params (tuple | dict, optional): Values bound to the placeholders in
            ``sql`` by the driver. Defaults to None (``sql`` is sent as is).

    Returns:
        list: A list of records (tuples) retrieved from the database, or None
            if ``conn`` is None or the query failed (the error is printed and
            the transaction is rolled back).
    """
    if conn is None:
        print("executeQuery: no database connection")
        return None

    record = None
    cur = None
    try:
        cur = conn.cursor()
        cur.execute(sql, params)
        record = cur.fetchall()
    except postgres.Error as e:  # driver is imported as `postgres` in this notebook
        print(e)
        # Roll back on the connection: the cursor may not exist (cursor()
        # itself failed) or may be unusable after the error.
        try:
            conn.rollback()
        except postgres.Error as rollback_error:
            print(rollback_error)
    finally:
        # Always release the cursor, including on unexpected exceptions.
        if cur is not None:
            cur.close()
    return record

In [8]:
def executeInsert(conn, sql, params=None):
    """
    Executes an SQL INSERT statement on a PostgreSQL database and commits it.

    Args:
        conn (psycopg2.extensions.connection): A connection object to the database.
        sql (str): The SQL INSERT statement to execute. May contain psycopg2
            placeholders (``%s`` / ``%(name)s``) when ``params`` is given.
        params (tuple | dict, optional): Values bound to the placeholders in
            ``sql`` by the driver. Defaults to None (``sql`` is sent as is).

    Returns:
        bool: True if the statement was executed and committed, False if
            ``conn`` is None or the statement failed (the error is printed and
            the transaction is rolled back).
    """
    if conn is None:
        print("executeInsert: no database connection")
        return False

    success = False
    cur = None
    try:
        cur = conn.cursor()
        cur.execute(sql, params)
        conn.commit()
        success = True
    except postgres.Error as e:  # driver is imported as `postgres` in this notebook
        print(e)
        # Roll back on the connection: the cursor may not exist (cursor()
        # itself failed) or may be unusable after the error.
        try:
            conn.rollback()
        except postgres.Error as rollback_error:
            print(rollback_error)
    finally:
        # Always release the cursor, including on unexpected exceptions.
        if cur is not None:
            cur.close()

    return success

In [9]:
def getPOIInformation(conn, business_id):
    """
    Retrieves information about a Point of Interest (POI) based on its ID.

    Args:
        conn (psycopg2.extensions.connection): A connection object to the database.
        business_id (str): The unique ID of the POI.

    Returns:
        list: A list of tuples containing checkin count and name for the
            specified POI (whatever executeQuery returns for the query).
    """
    # The ID is embedded in a quoted SQL literal, so any single quote inside it
    # must be doubled; otherwise it would terminate the literal and break (or
    # alter) the statement.
    # NOTE(review): quote doubling is sufficient with PostgreSQL's default
    # standard_conforming_strings=on; confirm the server uses the default.
    poi_id = str(business_id).replace("'", "''")

    sql = """
        SELECT checkin_count, name FROM pois_information WHERE id  = '{}'
    ;""".format(poi_id)

    return executeQuery(conn, sql)

In [10]:
def getBinInformation(conn, business_id, bin_number):
    """
    Retrieves information about bins centered around a specific Point of Interest (POI).

    Args:
        conn (psycopg2.extensions.connection): A connection object to the database.
        business_id (str): The unique ID of the central POI.
        bin_number (int): The bin number associated with the central POI.

    Returns:
        list: A list of tuples containing information about bins related to the specified POI.
            Each tuple includes the following:
            - fk_poi_id_context: The ID of the context POI.
            - name: The name of the context POI.
            - checkin_count: The check-in count for the context POI.
            - distance_m: The distance between the central POI and the context POI (in meters).

    Raises:
        ValueError / TypeError: If ``bin_number`` cannot be converted to an int.
    """
    # The ID is embedded in a quoted SQL literal, so any single quote inside it
    # must be doubled; otherwise it would terminate the literal.
    # NOTE(review): quote doubling is sufficient with PostgreSQL's default
    # standard_conforming_strings=on; confirm the server uses the default.
    center_id = str(business_id).replace("'", "''")

    # Forcing an integer guarantees that only digits reach the statement.
    bin_id = int(bin_number)

    sql = """
        SELECT fk_poi_id_context, name, checkin_count, distance_m 
        FROM bins_information 
        WHERE fk_poi_id_center = '{}' AND fk_bin_number = {};""".format(center_id, bin_id)

    return executeQuery(conn, sql)

In [13]:
def calculateBin(df, bin_number, w=0.5):
    """
    Builds the ITDL-augmented (center POI, context POI) tuples for one bin in
    memory and writes them to a CSV file once all central POIs are processed.

    NOTE(review): this notebook expands ITDL as "Information Theoretic Distance
    Learning"; the Place2Vec literature uses "Information Theoretic, Distance
    Lagged" - confirm which is intended.

    For every row of ``df`` the function queries the POI itself and the rows of
    the requested bin around it. Each bin row is then repeated ``aug`` times in
    the output, where

        a   = 1 - cc / sc          (cc: check-ins of the row's category in the bin,
                                    sc: check-ins of all distinct POIs in the bin)
        u   = cp / sp              (cp: rows of the row's category in the bin,
                                    sp: number of distinct POIs in the bin)
        aug = ceil(w * -log2(a) + (1 - w) * -log2(u))

    Rows with ``a <= 0`` or ``u <= 0`` are skipped (the logarithm is undefined).

    The result is saved as
    ``./austin-sl-tuple-n-itdl-<bin_number>bin-wgt<w>-p.csv`` with the columns
    poi_id_center, center_poi, poi_id_context, context_poi, distance-m.

    Args:
        df (pandas.DataFrame): A DataFrame containing information about points
            of interest (POIs); only its 'business_id' column is read.
        bin_number (int): The bin number for which to calculate ITDL.
        w (float, optional): Weight of the check-in term versus the
            category-frequency term (default is 0.5).

    Returns:
        None: The function prints a progress line and writes the CSV file.
            Nothing is written if the database connection cannot be opened.
    """
    print("executing bin:", bin_number, "\tweight:", w)

    # Establish a connection to the database
    connection = connect()
    if connection is None:
        return None

    output_columns = ['poi_id_center', 'center_poi', 'poi_id_context', 'context_poi', 'distance-m']
    records = []

    try:
        # Iterate through central POIs
        for _, poi in df.iterrows():
            # (checkin_count, name) tuples for the central POI; a failed query
            # yields None, which is treated as "no information".
            poi_information = getPOIInformation(connection, poi['business_id']) or []

            # Rows of the requested bin around the central POI. The query's
            # `name` field is loaded into the 'category' column.
            bin_information = getBinInformation(connection, poi['business_id'], bin_number)
            bin_information = pd.DataFrame(
                bin_information,
                columns=['business_id', 'category', 'checkin_count', 'distance_m']
            )

            # Calculate parameters
            sp = len(bin_information['business_id'].unique())  # Total number of POIs in the bin
            sc = bin_information.drop_duplicates(subset='business_id')['checkin_count'].sum()  # Total check-ins in the bin

            # Avoid division by zero
            if sc == 0 or sp == 0:
                continue

            # (cc, cp) per category, computed once instead of once per row.
            category_stats = {}

            for center_poi in poi_information:
                for _, row in bin_information.iterrows():
                    category = row['category']
                    if category not in category_stats:
                        same_category = bin_information[bin_information['category'] == category]
                        category_stats[category] = (
                            same_category['checkin_count'].sum(),
                            same_category['category'].count(),
                        )
                    cc, cp = category_stats[category]

                    a = 1 - (cc / sc)  # May result in 0
                    u = cp / sp        # May result in 0

                    if a > 0 and u > 0:
                        A = -np.log2(a)
                        U = -np.log2(u)

                        aug = int(math.ceil((w * A) + ((1 - w) * U)))

                        # Repeat the tuple `aug` times (the augmentation factor)
                        record = {
                            'poi_id_center': poi['business_id'],
                            'center_poi': center_poi[1],
                            'poi_id_context': row['business_id'],
                            'context_poi': row['category'],
                            'distance-m': row['distance_m']
                        }
                        records.extend([record] * aug)
    finally:
        # Release the connection even if a query or the computation fails.
        closeConnection(connection)

    # Write the file once, after every central POI has been processed.
    scITDL = pd.DataFrame(records, columns=output_columns)
    name = './austin-sl-tuple-n-itdl-' + str(bin_number) + 'bin-wgt' + str(w) + '-p.csv'
    scITDL.to_csv(name, index=False)

    return None

In [1]:
import csv

def calculateBin_Disco(df, bin_number, w=0.5):
    """
    Builds the ITDL-augmented (center POI, context POI) tuples for one bin and
    streams them row by row to a CSV file instead of holding them in memory.

    NOTE(review): this notebook expands ITDL as "Information Theoretic Distance
    Learning"; the Place2Vec literature uses "Information Theoretic, Distance
    Lagged" - confirm which is intended.

    For every row of ``df`` the function queries the POI itself and the rows of
    the requested bin around it. Each bin row is then written ``aug`` times,
    where

        a   = 1 - cc / sc          (cc: check-ins of the row's category in the bin,
                                    sc: check-ins of all distinct POIs in the bin)
        u   = cp / sp              (cp: rows of the row's category in the bin,
                                    sp: number of distinct POIs in the bin)
        aug = ceil(w * -log2(a) + (1 - w) * -log2(u))

    Rows with ``a <= 0`` or ``u <= 0`` are skipped (the logarithm is undefined).

    The output file is
    ``./austin-sl-tuple-n-itdl-<bin_number>bin-wgt<w>-p.csv``; it is created
    (with its header row) before the database connection is attempted.

    Args:
        df (pandas.DataFrame): A DataFrame containing information about points
            of interest (POIs); only its 'business_id' column is read.
        bin_number (int): The bin number for which to calculate ITDL.
        w (float, optional): Weight of the check-in term versus the
            category-frequency term (default is 0.5).

    Returns:
        None: The function prints a progress line and saves the results to a CSV file.
    """
    print("executing bin:", bin_number, "\tweight:", w)

    # File to save results directly to disk
    name = './austin-sl-tuple-n-itdl-' + str(bin_number) + 'bin-wgt' + str(w) + '-p.csv'

    # The `with` block closes the file on every exit path, including a failed
    # connection or an exception raised while processing.
    with open(name, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["poi_id_center", "center_poi", "poi_id_context", "context_poi", "distance-m"])

        # Establish a connection to the database
        connection = connect()
        if connection is None:
            return None

        try:
            # Iterate through central POIs
            for _, poi in df.iterrows():
                # (checkin_count, name) tuples for the central POI; a failed
                # query yields None, which is treated as "no information".
                poi_information = getPOIInformation(connection, poi['business_id']) or []

                # Rows of the requested bin around the central POI. The query's
                # `name` field is loaded into the 'category' column.
                bin_information = getBinInformation(connection, poi['business_id'], bin_number)
                bin_information = pd.DataFrame(
                    bin_information,
                    columns=['business_id', 'category', 'checkin_count', 'distance_m']
                )

                # Calculate parameters
                sp = len(bin_information['business_id'].unique())  # Total number of POIs in the bin
                sc = bin_information.drop_duplicates(subset='business_id')['checkin_count'].sum()  # Total check-ins in the bin

                # Avoid division by zero
                if sc == 0 or sp == 0:
                    continue

                # (cc, cp) per category, computed once instead of once per row.
                category_stats = {}

                for center_poi in poi_information:
                    for _, row in bin_information.iterrows():
                        category = row['category']
                        if category not in category_stats:
                            same_category = bin_information[bin_information['category'] == category]
                            category_stats[category] = (
                                same_category['checkin_count'].sum(),
                                same_category['category'].count(),
                            )
                        cc, cp = category_stats[category]

                        a = 1 - (cc / sc)  # May result in 0
                        u = cp / sp        # May result in 0

                        if a > 0 and u > 0:
                            A = -np.log2(a)
                            U = -np.log2(u)

                            aug = int(math.ceil((w * A) + ((1 - w) * U)))

                            # Write the tuple `aug` times (the augmentation factor)
                            line = [
                                str(poi['business_id']),
                                str(center_poi[1]),
                                str(row['business_id']),
                                str(row['category']),
                                str(row['distance_m']),
                            ]
                            writer.writerows([line] * aug)
        finally:
            # Release the connection even if a query or the computation fails.
            closeConnection(connection)

    return None

# Binary Relations Generation

In [14]:
# Load the POI table from CSV and discard incomplete rows.
pois_file_name = './austin-ml-updated.csv'
df = pd.read_csv(pois_file_name)
print(df.shape)  # shape as read from disk

# Drop every row that has at least one missing value (dropna defaults, spelled out).
df = df.dropna(axis=0, how='any')
print(df.shape)  # shape after removing incomplete rows

# Preview the first five rows.
df.head(5)

(22399, 7)
(22399, 7)


Unnamed: 0,business_id,city,state,latitude,longitude,categories,checkin_count
0,N3_Gs3DnX4k9SgpwJxdEfw,Austin,TX,30.346169,-97.711458,"Shopping, Jewelry Repair, Appraisal Services, ...",14
1,tXvdYGvlEceDljN8gt2_3Q,Austin,TX,30.172706,-97.79992,"Barbers, Beauty & Spas",1
2,nTIhpR7MhsALPwg_Hh14EA,Austin,TX,30.326377,-97.704543,"Hotels, Hotels & Travel, Event Planning & Serv...",475
3,8XyEpVdAO0o6iVkVxkWosQ,Austin,TX,30.246465,-97.778738,"Home Services, Real Estate, Property Management",0
4,NVfOn7TdnHbaGH97CVB_Qg,Austin,TX,30.244902,-97.857409,"Chiropractors, Health & Medical",33


## Iterative ITDL

In [16]:
# Process the bins one after another in the notebook process.
# The loop variable is not called `bin`: at notebook scope that name would
# shadow the builtin bin() for the rest of the kernel session.
for bin_number in range(0, 1):
    calculateBin_Disco(df, bin_number, 0.5)

executing bin: 0 	weight: 0.5


## Parallel ITDL

In [None]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

# Step 1: One task per bin; every task writes its own CSV file.
bins = range(0, 5)

# Step 2: Start no more workers than there are tasks (and at least one).
n_workers = max(1, min(int(mp.cpu_count()), len(bins)))

# Step 3: Run calculateBin_Disco for every bin. starmap blocks until all tasks
# have finished; leaving the `with` block then shuts the worker processes down,
# so none are left behind in the kernel.
with mp.Pool(n_workers) as pool:
    pool.starmap(calculateBin_Disco, [(df, bin_number, 0.5) for bin_number in bins])

print('Process finished.')

Number of processors:  16
executing bin:executing bin:  1211  	weight:	weight:  0.50.5

executing bin: 13 	weight: 0.5
executing bin: 14 	weight: 0.5
executing bin: 15 	weight: 0.5
executing bin: 16 	weight: 0.5
executing bin: 17 	weight: 0.5
executing bin: 18 	weight: 0.5
0.5executing bin: 19 	weight: 
executing bin: 20 	weight: 0.5
