## AI 322 Reinforcement Learning Mini-Project

### Notebook 01 - Data Collection

Submission by: Rossjyn Fallorina

**Note: As the work was all done in the Google Colab platform, all files (data, notebooks, models, etc.) were stored in Google Drive. Some paths in this notebook may not work, since they point to the Google Drive subdirectory in which they are stored.**

Run installations below when running on Google Colab

In [None]:
# !pip install geopy
# !pip install openai
# !pip install scikit-learn
# !pip install datasets

### Import libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
import os
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from datasets import Dataset

from openai import OpenAI

from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

from google.colab import userdata

pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Helper Functions

In [None]:
def generate_list_responses(prompt, num_queries):
    responses = []
    for _ in range(0, num_queries):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model="gpt-3.5-turbo",
        )

        response_string = chat_completion.choices[0].message.content

        # Split the string into lines
        lines = response_string.split('\n')

        # Use list comprehension to process each line, removing leading numbers and whitespace
        items = [line.split('. ', 1)[1].strip() for line in lines]

        responses.extend(items)
        time.sleep(3)  # Respect API rate limits

    return responses

In [None]:
def get_coordinates(address, user_agent="find-address-coordinates", delay=1):
    geolocator = Nominatim(user_agent=user_agent)

    try:
        location = geolocator.geocode(address, timeout=10)
        if location:
            return (location.latitude, location.longitude, location.address)
        else:
            print("Address not found")
            return -998, -998, "Address not found"
    except GeocoderTimedOut:
        print("Geocoding service timed out, retrying...")
        time.sleep(delay)
        return get_coordinates(address, user_agent, delay)
    except GeocoderServiceError as e:
        print(f"Geocoding service error: {e}")
        return -998, -998, "Error"

In [None]:
def process_and_geolocate_address(address_string):

    prompt_clean_address = f"Process this address string structure it cleanly. Remove specific information, such as block, house, and lot numbers. Try to spell out words and not shorten anything. You may occasionally remove information. If you are unsure of certain information, do not make guesses: {address_string}"

    chat_completion = client.chat.completions.create(
      messages=[
          {"role": "system", "content": "Your only goal is to process the address string as instructed. Provide the response only and nothing else. Be very concise but correct."},
          {"role": "user", "content": prompt_clean_address}
      ],
      model="gpt-3.5-turbo",
    )

    processed_string = chat_completion.choices[0].message.content

    time.sleep(3)  # Respect API rate limits

    geo_lat, geo_long, geo_rev_add = get_coordinates(processed_string)

    return processed_string, geo_lat, geo_long, geo_rev_add

### A. Generate address string data

In [None]:
# Set your API key (this can only be accessed as a Secrets key in Google Colab)
secret_key = userdata.get('openai_ai322_key')

# This is the default and can be omitted
client = OpenAI(api_key=secret_key)

#### A-1. Generate AI address strings

In [None]:
prompt_with_noise = "Generate a list of 10 unique residential or commercial address strings in the Philippines with slight formatting variations. Make sure that the addresses are as much as possible real. Inject noise into the address strings by randomly abbreviating common words, removing punctuations, and/or increasing or decreasing the level of granularity/detail in the address. Make sure that as much as possible, barangay and city level information is retained."
prompt_clean = "Generate a list of 10 unique residential or commercial address strings in the Philippines. Make sure that the addresses are as much as possible real. Make sure that as much as possible, barangay and city level information is retained."

def generate_addresses(prompt, num_addresses):
    addresses = []
    for _ in range(0, num_addresses):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model="gpt-3.5-turbo",
        )

        response_string = chat_completion.choices[0].message.content

        # Split the string into lines
        lines = response_string.split('\n')

        # Use list comprehension to process each line, removing leading numbers and whitespace
        items = [line.split('. ', 1)[1].strip() for line in lines]

        addresses.extend(items)
        time.sleep(3)  # Respect API rate limits
    return addresses

In [None]:
num_addresses = 300
clean_addresses = generate_addresses(prompt, num_addresses)

df_address_ai_generated = pd.DataFrame(clean_addresses, columns=["address_ai_generated"])
df_address_ai_generated.head()

In [None]:
df_address_ai_generated.to_csv(f'/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_ai_generated.csv', index=False)

#### A-2. Geolocate AI address strings

In [None]:
df_read_ai_address = pd.read_csv('/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_ai_generated.csv')

df_read_ai_address.head()

Unnamed: 0,address_ai_generated
0,"25 Sampaguita St., Greenheights Subdivision, B..."
1,"Block 2 Lot 3, Acacia Estates, Barangay Ususan..."
2,"Unit 301, Azure Urban Resort Residences, SLEX ..."
3,"17 Orchid St., Camella Homes, Barangay Lalakay..."
4,"Lot 5 Block 2, Villa de Primarosa Subdivision,..."


In [None]:
df_address_ai_generated = pd.DataFrame({
    "category": ["test"],
    "address_raw": ["test"],
    "address_processed": ["test"],
    "address_reversed": ["test"],
    "longitude": [np.nan],
    "latitude": [np.nan]
})

df_address_ai_generated

Unnamed: 0,category,address_raw,address_processed,address_reversed,longitude,latitude
0,test,test,test,test,,


In [None]:
# df_remaining = pd.read_csv('/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_ai_generated_remaining.csv')
# ai_address_list = list(df_remaining["Remaining Addresses"])
# progress_counter = 2995

for address in ai_address_list:

    list_category = []
    list_raw = []
    list_processed = []
    list_rev = []
    list_long = []
    list_lat = []

    for _ in range(5):
        try_processed, try_lat, try_long, try_rev_add = process_and_geolocate_address(address)

        list_category.append("AI-generated Address")
        list_raw.append(address)
        list_processed.append(try_processed)
        list_rev.append(try_rev_add)
        list_long.append(try_long)
        list_lat.append(try_lat)

    df_address_ai_generated_inc = pd.DataFrame({
      "category": list_category,
      "address_raw": list_raw,
      "address_processed": list_processed,
      "address_reversed": list_rev,
      "longitude": list_long,
      "latitude": list_lat
    })

    df_address_ai_generated = pd.concat([df_address_ai_generated, df_address_ai_generated_inc], ignore_index=True)
    df_address_ai_generated.to_csv('/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_ai_generated_processed.csv', index=False)

    del ai_address_list[0]

    df_remaining = pd.DataFrame({
      "Remaining Addresses":ai_address_list
    })

    df_remaining.to_csv('/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_ai_generated_remaining.csv', index=False)

    progress_counter += 1
    print(f"Processed AI-generated address string #{progress_counter}!")

Address not found
Address not found
Address not found
Address not found
Address not found
Processed AI-generated address string #2996!
Address not found
Address not found
Address not found
Address not found
Address not found
Processed AI-generated address string #2997!
Address not found
Address not found
Address not found
Address not found
Address not found
Processed AI-generated address string #2998!


#### A-3. Geolocate common establishments

Establishments include: malls, stores, restaurants, condominiums, schools, universities etc.

In [None]:
dict_establishments_search = {
    "SM malls": "the Philippines",
    "Robinson's malls": "the Philippines",
    "Ayala malls": "the Philippines",
    "major supermarkets": "Luzon in the Philippines",
    "major supermarkets": "Visayas in the Philippines",
    "major supermarkets": "Mindanao in the Philippines",
    "condominiums": "Metro Manila in the Philippines",
    "major elementary schools": "Luzon in the Philippines",
    "major elementary schools": "Visayas in the Philippines",
    "major elementary schools": "Mindanao in the Philippines",
    "major colleges and universities": "Luzon in the Philippines",
    "major colleges and universities": "Visayas in the Philippines",
    "major colleges and universities": "Mindanao in the Philippines"
}

dict_establishments_search_2 = {
    "local government offices": "in Metro Manila Philippines",
    "famous monuments and tourists spots": "in Metro Manila Philippines",
    "famous restaurants": "in Metro Manila Philippines",
    "stadiums and arenas": "in Metro Manila Philippines",
    "airports, train, and bus stations": "in Metro Manila Philippines",
    "major and famous buildings": "in Metro Manila Philippines",
    "major high schools": "in Metro Manila Philippines"
}

dict_establishments_search_3 = {
    "names of residential subdivisions": "in Metro Manila Philippines",
    "names of residential subdivisions": "in Cavite Philippines",
    "names of residential subdivisions": "in Alabang Philippines",
    "names of residential subdivisions": "in Laguna Philippines",
    "names of residential villages": "in Metro Manila Philippines",
    "names of residential villages": "in Cavite Philippines",
    "names of residential villages": "in Alabang Philippines",
    "names of residential villages": "in Laguna Philippines"
}

In [None]:
df_establishments = pd.DataFrame({
    "category": ["test"],
    "address_raw": ["test"],
    "address_processed": ["test"],
    "address_reversed": ["test"],
    "longitude": [1.234],
    "latitude": [1.234]
})

for key, value in dict_establishments_search_3.items():
    prompt_establishments = f"Generate a numbered list of all the {key} in {value}. Make sure to just list the responses and nothing else. Keep it concise but also informationally complete. Make sure to list everything and do NOT add any other comments in your response."

    if key in ("SM malls", "Robinson's malls", "Ayala malls"):
        iter_queries = 10
    else:
        iter_queries = 3

    for _ in range(iter_queries):
        for attempt in range(5):  # Attempt up to 5 times
            try:
                # Replace the following line with your code that needs to be retried
                print(f"Running iteration {key} {value}; attempt {attempt+1}")
                responses_establishments = generate_list_responses(prompt_establishments, 1)
                responses_establishments = list(set(responses_establishments))

                # Create the DataFrame from multiple lists
                df = pd.DataFrame({
                    "category": len(responses_establishments)*[f"{key} {value}"],
                    "address_raw": responses_establishments,
                    "address_processed": len(responses_establishments)*[np.nan],
                    "address_reversed": len(responses_establishments)*[np.nan],
                    "longitude": len(responses_establishments)*[np.nan],
                    "latitude": len(responses_establishments)*[np.nan]
                })

                df_establishments = pd.concat([df_establishments, df], ignore_index=True)

                # Simulate code that might fail
                if attempt < 4:
                    raise ValueError("Simulated failure")

                # If the code succeeds, break out of the retry loop
                break
            except Exception as e:
                print(f"Attempt {attempt+1} failed with error: {e}")
                if attempt == 4:
                    print(f"Skipping iteration {key} {value} after 5 failed attempts.")

                    # Optionally, log the error or take other actions here
                    continue  # Move to the next iteration of the outer loop

        time.sleep(1)  # Delay in seconds

Running iteration names of residential subdivisions in Laguna Philippines; attempt 1
Attempt 1 failed with error: Simulated failure
Running iteration names of residential subdivisions in Laguna Philippines; attempt 2
Attempt 2 failed with error: Simulated failure
Running iteration names of residential subdivisions in Laguna Philippines; attempt 3
Attempt 3 failed with error: Simulated failure
Running iteration names of residential subdivisions in Laguna Philippines; attempt 4
Attempt 4 failed with error: Simulated failure
Running iteration names of residential subdivisions in Laguna Philippines; attempt 5
Running iteration names of residential subdivisions in Laguna Philippines; attempt 1
Attempt 1 failed with error: Simulated failure
Running iteration names of residential subdivisions in Laguna Philippines; attempt 2
Attempt 2 failed with error: Simulated failure
Running iteration names of residential subdivisions in Laguna Philippines; attempt 3
Attempt 3 failed with error: Simulated

In [None]:
df_establishments

Unnamed: 0,category,address_raw,address_processed,address_reversed,longitude,latitude
0,test,test,test,test,1.234,1.234
1,names of residential subdivisions in Laguna Ph...,Les Jardins Villas,,,,
2,names of residential subdivisions in Laguna Ph...,Sta. Rosa Estates,,,,
3,names of residential subdivisions in Laguna Ph...,Ayala Westgrove Heights,,,,
4,names of residential subdivisions in Laguna Ph...,Brentville International Community,,,,
...,...,...,...,...,...,...
401,names of residential villages in Laguna Philip...,Montecito Nuvali,,,,
402,names of residential villages in Laguna Philip...,Portofino Alabang,,,,
403,names of residential villages in Laguna Philip...,Paseo de Magallanes Village,,,,
404,names of residential villages in Laguna Philip...,Laguna Bel-Air Village,,,,


In [None]:
# First set of establishments
df_establishments = df_establishments.drop_duplicates().reset_index(drop=True)

df_establishments.to_csv(f'/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments.csv', index=False)

In [None]:
# Second set of establishments
df_establishments = df_establishments.drop_duplicates().reset_index(drop=True)

df_establishments.to_csv(f'/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments_2.csv', index=False)

In [None]:
# Third set of establishments
df_establishments = df_establishments.drop_duplicates().reset_index(drop=True)

df_establishments.to_csv(f'/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments_3.csv', index=False)

### B. Geocoding

In [None]:
# Step 3: Initialize iteration counter
iteration = 0
max_iterations = 2000

# Step 4: Use a while loop to iterate through rows with NULL values in the specified column
while iteration < max_iterations:
    df = pd.read_csv("/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments_3.csv")

    # Find the first row with a NULL value in 'ColumnName'
    null_row = df[df['address_processed'].isnull()]

    if null_row.empty:
        break  # Exit loop if there are no more rows with NULL values

    index = null_row.index[0]
    address_string = df.at[index, 'address_raw']

    # Step 5: Apply the custom function
    geo_lat, geo_long, geo_rev_add = get_coordinates(address_string)

    # Update other cells in the row with the result
    df.at[index, 'address_processed'] = address_string
    df.at[index, 'latitude'] = geo_lat
    df.at[index, 'longitude'] = geo_long
    df.at[index, 'address_reversed'] = geo_rev_add

    # Overwrite the CSV file after each iteration
    df.to_csv("/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments_3.csv", index=False)

    # Increment iteration counter
    iteration += 1

Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found
Address not found


In [None]:
df_establishments_1 = pd.read_csv('/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments.csv')
df_establishments_2 = pd.read_csv('/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments_2.csv')
df_establishments_3 = pd.read_csv('/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments_3.csv')

df_establishments = pd.concat([df_establishments_1, df_establishments_2, df_establishments_3], ignore_index=True)

In [None]:
df_establishments.to_csv("/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments_processed.csv", index=False)

### C. Consolidate all addresses

In [None]:
df_1 = pd.read_csv('/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_ai_generated_processed.csv')
df_2 = pd.read_csv("/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_establishments_processed.csv")

df_all = pd.concat([df_1, df_2], ignore_index=True)
df_all = df_all.drop_duplicates().reset_index(drop=True)
df_all = df_all[df_all["category"] != "test"]

# Define the boundaries of the Philippines
min_latitude, max_latitude = 4.5, 21.5
min_longitude, max_longitude = 116.0, 126.0

# Filter the DataFrame
df_all = df_all[
    (df_all['latitude'] >= min_latitude) &
    (df_all['latitude'] <= max_latitude) &
    (df_all['longitude'] >= min_longitude) &
    (df_all['longitude'] <= max_longitude)
]

In [None]:
df_all.to_csv("/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_all_processed.csv", index=False)

In [None]:
df_all = pd.read_csv("/content/drive/MyDrive/AIE/AI 322/Mini Project/addresses_all_processed.csv")

In [None]:
df_all = df_all[(df_all["longitude"] != -998)]
df_all = df_all.drop_duplicates(subset='address_raw', keep='first')

In [None]:
dict_raw_rev = dict(zip(df_all["address_raw"], df_all["address_reversed"]))

def convert_to_dialogue_format(input_json):
    dialogue_list = []

    for question, answer in input_json.items():
        # dialogue = f"### Human: {question} ### Assistant: {answer}"
        dialogue = f"<s>[INST] {question} [/INST] {answer} </s>"
        dialogue_list.append(dialogue)

    return dialogue_list

data_dialogue = convert_to_dialogue_format(dict_raw_rev)
data_dialogue = [re.sub(r'^"|"$', '', s) for s in data_dialogue]

df = pd.DataFrame(data_dialogue)
df = df.rename(columns={0:"text"})

In [None]:
df_dataset = df_all[["address_raw", "address_reversed"]]
df_dataset = df_dataset.rename(columns={"address_raw":"prompt", "address_reversed":"response"})
df_dataset['prompt'] = df_dataset['prompt'].apply(lambda x: "Clean this address string to the correct format: " + x)

Unnamed: 0,prompt,response
0,Clean this address string to the correct forma...,"Taft Avenue, Barangay 678, Barangay 694, Malat..."
2,Clean this address string to the correct forma...,"Jasmine Street, Asian Leaf, San Francisco, Gen..."
3,Clean this address string to the correct forma...,"Ipil Street, Garden Heights, 19-B Garcia Heigh..."
6,Clean this address string to the correct forma...,"Jose P. Rizal Avenue, Bel-Air Village Phase I ..."
7,Clean this address string to the correct forma...,"Alabang-Zapote Road, Filinvest City, Muntinlup..."
...,...,...
1073,Clean this address string to the correct forma...,"Laguna Bel-Air, Pulong Santa Cruz, Santa Rosa,..."
1074,Clean this address string to the correct forma...,"San Lorenzo South Subdivision Phase 1C Annex, ..."
1075,Clean this address string to the correct forma...,"Dasmariñas Village, District I, Makati, Southe..."
1076,Clean this address string to the correct forma...,"Montecito Nuvali, Canlubang, Calamba, Laguna, ..."


In [None]:
# Split the data into train and test sets, with 90% in the train set
train_df = df_dataset.sample(frac=0.9, random_state=42)
test_df = df_dataset.drop(train_df.index)

In [None]:
# Save the dataframes to .jsonl files
train_df.to_json("/content/drive/MyDrive/AIE/AI 322/Mini Project/train.jsonl", orient='records', lines=True)
test_df.to_json("/content/drive/MyDrive/AIE/AI 322/Mini Project/test.jsonl", orient='records', lines=True)

### D. Read all data

In [1]:
import pandas as pd

C:\Users\Rossjyn\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\Rossjyn\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [6]:
df_all_addresses = pd.read_csv("../data/addresses_all_processed.csv")

print(f"The dimensions of the table is {df_all_addresses.shape}.")
df_all_addresses.head()

The dimensions of the table is (1079, 6).


Unnamed: 0,category,address_raw,address_processed,address_reversed,longitude,latitude
0,AI-generated Address,"789 Taft Ave., Barangay Malate, Manila","789 Taft Avenue, Barangay Malate, Manila","Taft Avenue, Barangay 678, Barangay 694, Malat...",120.989656,14.573774
1,AI-generated Address,"789 Taft Ave., Barangay Malate, Manila","789 Taft Avenue, Barangay Malate, Manila.","Taft Avenue, Barangay 678, Barangay 694, Malat...",120.989656,14.573774
2,AI-generated Address,"Blk 7, Lot 23, Jasmine St., Brgy. San Francisc...","Jasmine Street, San Francisco, General Trias, ...","Jasmine Street, Asian Leaf, San Francisco, Gen...",120.927077,14.297408
3,AI-generated Address,"55 Ipil St., Brgy. Poblacion, Davao City","55 Ipil Street, Barangay Poblacion, Davao City.","Ipil Street, Garden Heights, 19-B Garcia Heigh...",125.606685,7.091886
4,AI-generated Address,"55 Ipil St., Brgy. Poblacion, Davao City","55 Ipil Street, Barangay Poblacion, Davao City","Ipil Street, Garden Heights, 19-B Garcia Heigh...",125.606685,7.091886
