In [None]:
import os
import sqlite3 as sql
from pathlib import Path

from typedstream.stream import TypedStreamReader

In [None]:
# Display names used to label each side of the conversation
YOUR_NAME, OTHER_NAME = 'YOUR_NAME', 'OTHER_NAME'

# Every spelling of the contact's number that is accepted when matching handle ids
TARGET_PHONE_NUMBER = {
    '(111) 111-1111',
    '+1 (111) 111-1111',
    '+11111111111',
    '11111111111',
}

# macOS account name; it determines which home directory the database is read from
USERNAME = 'macos-username'
databasePath = '/Users/' + USERNAME + '/Library/Messages/chat.db'

In [None]:
def decodeAttributedBody(data):
    """Extract the message text from a typedstream-encoded blob.

    Args:
        data: Raw bytes to parse with ``TypedStreamReader``; may be None or empty.

    Returns:
        The first ``bytes`` event of the stream decoded as UTF-8 (undecodable
        byte sequences are replaced with U+FFFD), or None when ``data`` is
        empty or the stream yields no ``bytes`` event.
    """
    if not data:
        return None
    for event in TypedStreamReader.from_data(data):
        # The first bytes object is the one we want
        if isinstance(event, bytes):
            # errors="replace" keeps a single malformed payload from raising
            # UnicodeDecodeError and aborting the caller's whole parsing loop.
            return event.decode("utf-8", errors="replace")
    # Stream ended without any bytes payload
    return None

In [None]:
# Connect to the iMessage database.
# The file is opened read-only through a SQLite URI (mode=ro) so this notebook
# cannot modify the database, and a wrong path fails immediately instead of
# silently creating a new empty database file.
# Path.as_uri() percent-encodes characters that are special inside a URI.
dbUri = f"{Path(databasePath).resolve().as_uri()}?mode=ro"
dbConnection = sql.connect(dbUri, uri=True)
cursor = dbConnection.cursor()

# SQL Query to extract relevant message data.
# LEFT JOIN keeps messages whose handle_id has no matching handle row; for
# those rows phoneNumber comes back as NULL.
query = """
SELECT
    message.ROWID,
    message.date,
    message.text,
    handle.id AS phoneNumber,
    message.is_from_me,
    message.attributedBody
FROM
    message
LEFT JOIN
    handle ON message.handle_id = handle.ROWID

"""

In [None]:
# Fetch messages
results = cursor.execute(query).fetchall()

# Sort by date (chronological order).
# A NULL date is returned as None, which cannot be ordered against the other
# values and would make sort() raise TypeError; it is ranked as 0 instead.
# Rows without a date are discarded by the filtering step that follows.
results.sort(key=lambda result: result[1] or 0)

In [None]:
# Parse and filter messages
data = []

for _rowid, timestamp, body, handle, fromMe, richBody in results:
    # Keep only dated messages whose handle is one of the target numbers
    if not (timestamp and handle in TARGET_PHONE_NUMBER):
        continue

    # Fall back to decoding attributedBody only when the text column is empty
    if not body and richBody:
        body = decodeAttributedBody(richBody)

    # Messages with neither text nor a decodable attributedBody are skipped
    if body:
        data.append({
            "sender": YOUR_NAME if fromMe else OTHER_NAME,
            "text": body,
            "timestamp": timestamp,
        })

print(f"Processed {len(data)} messages from the database.")

In [None]:
# Only use last 100,000 messages from data (the list is already in chronological order)
MESSAGE_LIMIT = 100_000
reducedData = data[-MESSAGE_LIMIT:]

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from config import API_KEY  # Replace with your OpenAI API Key configuration

In [None]:
# Render every message as "sender: text" and stack them into one newline-separated transcript
data_text = "\n".join(f"{entry['sender']}: {entry['text']}" for entry in reducedData)

# Text splitting with overlap for embedding
# NOTE(review): the transcript built above is joined with "\n" only and never
# inserts BORDER_TEXT, so the first separator only applies if a message itself
# contains that exact border line -- confirm whether a border was intended.
BORDER_TEXT = '=============================='
splitSeparators = [f"\n{BORDER_TEXT}\n", "\n"]
textSplitter = RecursiveCharacterTextSplitter(
    separators=splitSeparators,
    chunk_size=3000,
    chunk_overlap=250,
)
documents = textSplitter.create_documents([data_text])

In [None]:
# Create the OpenAI embedding client and build a FAISS store from the chunks
embeddings = OpenAIEmbeddings(openai_api_key=API_KEY)
vectors = FAISS.from_documents(documents, embeddings)

# Persist the FAISS index to a local folder so it can be loaded again later
indexFolder = "faiss_index"
vectors.save_local(indexFolder)
print("FAISS vector store saved locally.")

In [None]:
# Load the FAISS index back from the local "faiss_index" folder,
# using the same embeddings object that was used to build it
vector_store = FAISS.load_local("faiss_index", embeddings)

# Initialize the LLM for querying and summarization
llmSettings = {
    "model": "gpt-4",
    "temperature": 0,
    "openai_api_key": API_KEY,
}
llm = ChatOpenAI(**llmSettings)

# Question-answering chain that receives the retrieved documents plus the question
qa_chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
# Querying function
def query_conversations(query, k=5):
    """Answer a question using the most similar conversation chunks.

    Prints the query, retrieves the ``k`` most similar documents from the
    module-level ``vector_store``, and passes them together with the question
    to the module-level ``qa_chain``.

    Args:
        query: The question to ask.
        k: Number of documents to retrieve. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        Whatever ``qa_chain.run`` returns for the retrieved documents and question.
    """
    print(f"Query: {query}")
    docs = vector_store.similarity_search(query, k=k)
    return qa_chain.run({"input_documents": docs, "question": query})

In [None]:
# Example usage
# The question is stored in `question` rather than `query`: `query` already
# holds the SQL statement used by the fetch cell, and overwriting it would make
# that cell fail if it were re-run after this one.
question = "Summarize the most common topics"

response = query_conversations(question)
print("\nQuery Response:\n", response)

# Save summarized responses for record-keeping.
# Append mode keeps earlier results; the explicit UTF-8 encoding makes writing
# non-ASCII text (e.g. emoji) independent of the platform's default encoding.
with open("query_results.txt", "a", encoding="utf-8") as f:
    f.write(f"Query: {question}\nResponse: {response}\n\n")