In [None]:
import pandas as pd

# Load the CSV files
df1 = pd.read_csv('Resale Flat Prices (Based on Registration Date), From Jan 2015 to Dec 2016.csv')
df2 = pd.read_csv('Resale Flat Prices (Based on Registration Date), From Mar 2012 to Dec 2014.csv')
df3 = pd.read_csv('Resale flat prices based on registration date from Jan-2017 onwards.csv')
df4 = pd.read_csv('Resale Flat Prices (Based on Approval Date), 1990 - 1999.csv')
df5 = pd.read_csv('Resale Flat Prices (Based on Approval Date), 2000 - Feb 2012.csv')

# Merge the dataframes into one (assuming they have the same structure)
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

# Ensure the 'month' or equivalent date column is in datetime format
df['month'] = pd.to_datetime(df['month'], errors='coerce')

# Save the merged dataframe to a new file
df.to_csv('merged_hdb_resale_prices.csv', index=False)

# Display the first few rows to confirm
df.head()


In [None]:
def create_summaries(df):
    # Aggregate by town and flat_type
    agg_df = df.groupby(['town', 'flat_type']).agg({
        'resale_price': ['mean', 'min', 'max', 'count'],
        'floor_area_sqm': ['mean', 'min', 'max'],
        'month': 'max'  # Most recent transaction
    }).reset_index()

    # Flatten column names
    agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]

    # Create summary texts
    summaries = []
    for _, row in agg_df.iterrows():
        summary = f"In {row['town_']} for {row['flat_type_']} flats: "
        summary += f"Average price: ${row['resale_price_mean']:,.2f}, "
        summary += f"Price range: ${row['resale_price_min']:,.2f} to ${row['resale_price_max']:,.2f}, "
        summary += f"Average size: {row['floor_area_sqm_mean']:.1f} sqm, "
        summary += f"Size range: {row['floor_area_sqm_min']:.1f} to {row['floor_area_sqm_max']:.1f} sqm, "
        summary += f"Based on {row['resale_price_count']} transactions up to {row['month_max'].strftime('%B %Y')}."
        summaries.append(summary)

    return summaries

summaries = create_summaries(df)


In [7]:
import pandas as pd
df = pd.read_csv("df_summary.csv")
df = df['summary']
df

0      In ANG MO KIO for 1 ROOM flats: Average price:...
1      In ANG MO KIO for 2 ROOM flats: Average price:...
2      In ANG MO KIO for 3 ROOM flats: Average price:...
3      In ANG MO KIO for 4 ROOM flats: Average price:...
4      In ANG MO KIO for 5 ROOM flats: Average price:...
                             ...                        
132    In YISHUN for 4 ROOM flats: Average price: $26...
133    In YISHUN for 5 ROOM flats: Average price: $37...
134    In YISHUN for EXECUTIVE flats: Average price: ...
135    In YISHUN for MULTI GENERATION flats: Average ...
136    In YISHUN for MULTI-GENERATION flats: Average ...
Name: summary, Length: 137, dtype: object

In [None]:
# pip install chromadb langchain langchain-openai langchain-chroma
import chromadb
from chromadb.utils.embedding_functions import create_langchain_embedding
from langchain_openai import OpenAIEmbeddings
import os
import pandas as pd

langchain_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.getenv("OPENAI_API_KEY"),
)
ef = create_langchain_embedding(langchain_embeddings)
client = chromadb.PersistentClient(path="/chroma-data")
collection = client.get_or_create_collection(name="my_collection", embedding_function=ef)

df = pd.read_csv("df_summary.csv")
df = df['summary']

for i in range(len(df)):
    collection.add(ids=[str(i)], documents=[df[i]])



In [28]:
# Retrieve the embeddings
import numpy as np
import pandas as pd
from chromadb.utils.embedding_functions import create_langchain_embedding
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import os

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.getenv("OPENAI_API_KEY"),
)

vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=embeddings,
    persist_directory="/chroma-data",  # Where to save data locally, remove if not necessary
)

results = vector_store.similarity_search_with_score(
    "ANG MO KIO with good food",
    k=2,
)
for res, score in results:
    print(f"* {res.page_content} (score: {score:3f})")



* In ANG MO KIO for 5 ROOM flats: Average price: $486,224.27, Price range: $78,300.00 to $1,300,000.00, Average size: 120.5 sqm, Size range: 110.0 to 150.0 sqm, Based on 5275 transactions up to October 2024. (score: 1.100592)
* In ANG MO KIO for 4 ROOM flats: Average price: $322,317.46, Price range: $47,000.00 to $1,080,000.00, Average size: 93.2 sqm, Size range: 81.0 to 114.0 sqm, Based on 12889 transactions up to October 2024. (score: 1.100861)
