#### Step 1: Convert CSV to JSON Files

In [1]:
import csv
import json
import uuid
import os

# Location of the raw input CSV and of the directory that receives one JSON
# file per review.
raw_reviews_file = "../data/raw/hotel_reviews_1000.csv"
transformed_dir = "../data/transformed"

# Read the raw lines through a context manager so the file handle is closed
# instead of leaked. `raw_reviews` is not used by the cells visible here; the
# name is kept in case other cells rely on it. UTF-8 matches the encoding that
# process_reviews uses for the same file.
with open(raw_reviews_file, "r", encoding="utf-8") as raw_file:
    raw_reviews = raw_file.readlines()

# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs(transformed_dir, exist_ok=True)

def process_reviews(file_path, output_dir=None):
    """Convert each data row of a review CSV into its own JSON file.

    The first CSV row is treated as the header. Selected columns are copied
    into a dict under new key names (see ``column_mapping`` below), a random
    UUID4 string is added under ``id``, and the dict is written to
    ``<output_dir>/review_<n>.json`` where ``n`` is the 1-based row number.

    Args:
        file_path: Path of the CSV file to read (decoded as UTF-8; a leading
            byte-order mark is tolerated).
        output_dir: Directory that receives the JSON files. Defaults to the
            notebook-level ``transformed_dir``. Created if it does not exist.

    Returns:
        None. Progress and warnings are printed.

    Notes:
        * A mapped column that is absent from the header triggers a warning
          and its key is omitted from every output record.
        * A row that is too short for a mapped column gets ``""`` for that key.
        * An empty input file produces a warning and no output files.
    """
    if output_dir is None:
        # Fall back to the directory configured at notebook level.
        output_dir = transformed_dir
    os.makedirs(output_dir, exist_ok=True)

    # Output key -> column name as it appears in the CSV header.
    column_mapping = {
        'dateAdded': 'dateAdded',
        'city': 'city',
        'hotel_name': 'name',
        'hotel_state': 'province',
        'review_text': 'reviews.text',
        'review_title': 'reviews.title'
    }

    # 'utf-8-sig' reads plain UTF-8 unchanged but strips a leading BOM, which
    # would otherwise be glued to the first header name and hide that column.
    with open(file_path, 'r', newline='', encoding='utf-8-sig') as csvfile:
        # A single reader consumes the header and then the data rows. Rewinding
        # the file and skipping one physical line instead would mis-parse a
        # header that contains a quoted field spanning several lines.
        reader = csv.reader(csvfile)

        header = next(reader, None)
        if header is None:
            print(f"Warning: '{file_path}' is empty. No records were processed.")
            return

        # Resolve the position of every mapped column that is actually present.
        column_indices = {}
        for expected_name, actual_name in column_mapping.items():
            try:
                column_indices[expected_name] = header.index(actual_name)
            except ValueError:
                print(f"Warning: Column '{actual_name}' not found in the CSV. Some data may be missing.")

        for i, row in enumerate(reader, start=1):
            # Short rows yield "" for the columns they do not reach.
            review_json = {
                key: (row[index] if index < len(row) else "")
                for key, index in column_indices.items()
            }

            # Generate a unique identifier
            review_json['id'] = str(uuid.uuid4())

            print(f"processed record [{i}] with id [{review_json['id']}]")

            output_path = os.path.join(output_dir, f"review_{i}.json")
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(review_json, f, indent=2)
            
# Run Step 1: write one review_<n>.json file per data row of the raw CSV.
process_reviews(raw_reviews_file)

processed record [1] with id [65f1b35a-adda-4abe-b0ea-0c05e7c203d8]
processed record [2] with id [943f0053-b787-432c-833a-7337a5d5a773]
processed record [3] with id [e1549e85-8716-4f25-8bf2-8260a57ee80b]
processed record [4] with id [24c48819-7719-4fc2-a901-8715060a7e52]
processed record [5] with id [429471a4-9b33-49a9-a85d-ffaf955c74a7]
processed record [6] with id [98bfa8aa-622c-46a8-a4bc-4071d546fea3]
processed record [7] with id [6b3df75d-38df-48f6-a5de-204ad96f5e85]
processed record [8] with id [6a671351-e86d-4fac-ba47-7f42810e7e51]
processed record [9] with id [6ebd3c9f-6792-4b78-8dd0-9733db6ef92d]
processed record [10] with id [e8d36362-d044-416d-865b-cd0decf127e4]
processed record [11] with id [e5c4037d-4e24-4418-826b-60d5b81871f8]
processed record [12] with id [3cdc66dc-bcf8-4294-beed-4f7e94448626]
processed record [13] with id [5fe5efd1-0367-4dc2-b4d0-e07941c86fd0]
processed record [14] with id [74068c55-00cb-46e1-bdb0-9372db9b931d]
processed record [15] with id [2dee37bb-bd0

#### Step 2: Create Embeddings for each of the JSON Files

In [2]:
%pip install -q python-dotenv openai

Note: you may need to restart the kernel to use updated packages.


In [3]:
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file BEFORE building the client, so this
# cell also works on a freshly restarted kernel. Without it the first
# load_dotenv() call only happens in a later cell.
# NOTE(review): presumably the OpenAI API key lives in .env — confirm.
load_dotenv()

client = OpenAI()

# One-off chat request used as a connectivity check; the result is kept in
# `completion` but not displayed.
completion = client.chat.completions.create(
    model="gpt-4o",
    store=True,
    messages=[
        {"role": "user", "content": "write a haiku about ai"}
    ]
)



In [4]:
from dotenv import load_dotenv
from openai import OpenAI

# Pull settings from a local .env file into the process environment before the
# client is constructed (presumably this is where the OpenAI API key lives —
# confirm against your setup).
load_dotenv()

client = OpenAI()

# Embed a fixed sample string as a quick check of the embeddings endpoint.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="Hello world",
)
hello_embedding = response.data[0].embedding

# Show the vector's dimensionality, then the vector itself.
print(len(hello_embedding))
print(hello_embedding)

1536
[-0.0020785425, -0.049085874, 0.02094679, 0.031351026, -0.045305308, -0.026402483, -0.028999701, 0.060304623, -0.025710916, -0.014822582, 0.015444992, -0.029983262, -0.020393535, -0.03334889, 0.025833862, 0.014207856, -0.07007877, 0.012432834, 0.014791845, 0.048839983, 0.020731635, -0.008890475, -0.015114577, -0.016612971, 0.02592607, -0.0029026596, -0.024327783, 0.024281679, 0.0017433246, -0.055724915, 0.023082962, -0.04548973, -0.008652269, 0.003161997, 0.004583551, 0.0017942316, 0.026694478, 0.010158348, -0.012056314, -0.011472325, -0.01491479, -0.023129066, 0.02535745, 0.036822088, -0.03550043, 0.02126952, -0.06307089, 0.0403875, 0.053542636, 0.061534077, -0.03365625, -0.0066582514, 0.025495762, 0.10966712, -0.004683444, -0.039465412, 0.007119296, 0.05151404, -0.026325643, 0.027877826, 0.030428939, 0.020593323, 0.017243065, 0.0123559935, 0.0010844151, 0.007092402, -0.03706798, 0.02352864, -0.010665497, 0.040848546, -0.002132331, 0.031366397, -0.04272346, -0.0067581446, -0.0472

In [5]:
import os
import json

# Input directory (JSON files read by the embedding loop) and output directory
# (the same records with an added "embedding" field).
transformed_dir = "../data/transformed"
embedded_dir = "../data/embedded"

# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs(embedded_dir, exist_ok=True)
    
def prepare_embedding_str(review_json):
    """Build the single-line text that is sent to the embedding model.

    Each field is rendered as ``LABEL: value`` and the parts are joined with
    single spaces, in the order title, text, hotel name, city, state.

    Args:
        review_json: Mapping that must contain the keys ``review_title``,
            ``review_text``, ``hotel_name``, ``city`` and ``hotel_state``.

    Returns:
        The labelled, space-separated string.

    Raises:
        KeyError: If any of the required keys is missing.
    """
    labelled_fields = (
        ("REVIEW_TITLE", "review_title"),
        ("REVIEW_TEXT", "review_text"),
        ("HOTEL_NAME", "hotel_name"),
        ("HOTEL_CITY", "city"),
        ("HOTEL_STATE", "hotel_state"),
    )
    return " ".join(
        f"{label}: {review_json[key]}" for label, key in labelled_fields
    )
    
# Embed every transformed review and write it, with the vector stored under
# "embedding", to a file of the same name in embedded_dir.
client = OpenAI()
for file in sorted(os.listdir(transformed_dir)):
    # Skip anything that is not a review JSON file (hidden files, checkpoint
    # directories, ...); such entries would otherwise crash open()/json.load().
    if not file.endswith(".json"):
        continue

    # Close the input file before the network call rather than holding it open.
    with open(f"{transformed_dir}/{file}", "r") as source_file:
        review = json.load(source_file)

    embedding_str = prepare_embedding_str(review)
    response = client.embeddings.create(
        input=embedding_str,
        model="text-embedding-3-small"
    )

    review['embedding'] = response.data[0].embedding

    with open(f"{embedded_dir}/{file}", "w") as target_file:
        json.dump(review, target_file, indent=2)