# Collecting iMessage Data From Your MacBook

In [4]:
import sqlite3 as sql
import os

In [35]:
# Display names used to label the two sides of the conversation
YOUR_NAME = 'John'
OTHER_NAME = 'Bob'

# Every formatting variant of the other person's phone number that should match.
# To be safe, all phone number formats are accepted.
TARGET_PHONE_NUMBER = {
    '(000) 111-2222',
    '+1 (000) 111-2222',
    '+10001112222',
    '0001112222',
}

In [6]:
# macOS account name; replace the placeholder with your own
USERNAME = 'YOUR-MACOS-USERNAME'

# Location of the Messages database (chat.db) inside that account's Library folder
databasePath = '/Users/{}/Library/Messages/chat.db'.format(USERNAME)

In [58]:
# SQL Query to merge message data and sender data (handle) for all messages.
# LEFT JOIN keeps messages that have no matching handle row (handle.id is then NULL).
# Rows are ordered by SQLite itself: ascending date, with ROWID as a deterministic
# tie-breaker. Rows whose date is NULL sort first instead of raising a TypeError,
# which a Python-side sort comparing None with integers would do.
query = """
    SELECT message.ROWID, message.date, message.text, handle.id, message.is_from_me
    FROM message
    LEFT JOIN handle ON message.handle_id = handle.ROWID
    ORDER BY message.date ASC, message.ROWID ASC
    """

# Connects to database with iMsg data. The connection is closed as soon as the rows
# have been fetched, even if the query fails, so the database file is not kept open
# for the rest of the session.
dbConnection = sql.connect(databasePath)
try:
    cursor = dbConnection.cursor()

    # List of tuples containing all information of each message sent/received
    # [ (rowid, date, text, phoneNumber, isFromMe) ]
    results = cursor.execute(query).fetchall()
finally:
    dbConnection.close()

totalMessages = len(results)
print(F"You have {totalMessages} messages stored on your local database")

You have 360096 messages stored on your local database


In [270]:
# Builds the text dataset for the target conversation. Every kept message is written as
# "<sender>:\n<text>\n\n", and a border line is inserted wherever a new conversation starts.
BORDER_TEXT = '=============================='

# Gap between two consecutive messages above which the later one starts a new
# conversation. Same value as the previously inlined 3.6e+12; it corresponds to one
# hour if message.date is stored in nanoseconds -- TODO confirm for your macOS version.
CONVO_GAP = 3.6e+12

numMessages = numConvos = prevMsgTimestamp = 0

# Pieces of the dataset are collected in a list and joined once at the end, rather than
# repeatedly extending one ever-growing string.
datasetParts = []

# Iterates through all results (Loop will take a long time depending on how many messages you have)
for result in results:
    # Retrieves information from results
    rowid, date, text, phoneNumber, isFromMe = result

    # Skips all messages not in target conversation
    if phoneNumber not in TARGET_PHONE_NUMBER: continue

    # Skips all messages with no text body or no timestamp
    if not text or not date: continue

    # Decodes result to figure out sender name
    senderName = YOUR_NAME if isFromMe else OTHER_NAME

    # Adds border between conversations (a new conversation starts when the gap to the
    # previous kept message exceeds CONVO_GAP)
    if date - prevMsgTimestamp > CONVO_GAP:
        datasetParts.append(F"{BORDER_TEXT}\n\n")
        numConvos += 1

    prevMsgTimestamp = date

    datasetParts.append(F"{senderName}:\n{text}\n\n")
    numMessages += 1

dataset = ''.join(datasetParts)

print(F"Recovered {numMessages} total text messages in your {numConvos} conversations with {OTHER_NAME}")

Recovered 178856 total text messages in your 1766 conversations with Angela


In [119]:
# Store dataset in dataset.txt
# UTF-8 is requested explicitly: message text may contain emoji or other non-ASCII
# characters, and the platform's default encoding is not guaranteed to handle them.
with open('dataset.txt', 'w', encoding='utf-8') as f:
    f.write(dataset)

In [254]:
# Breaks the dataset apart at every border marker. The piece in front of the first
# marker (empty when the dataset starts with a border) is dropped.
conversations = dataset.split(BORDER_TEXT + "\n\n")
conversations = conversations[1:]
len(conversations)

1225

In [255]:
# Builds a smaller dataset from the conversations at positions [startConvo, endConvo),
# putting the border marker back between neighbouring conversations
startConvo = 0
endConvo = 200
miniDataset = (BORDER_TEXT + "\n\n").join(conversations[startConvo:endConvo])

# Processing Data with LangChain

In [134]:
!pip install langchain



In [256]:
from langchain import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.summarize import load_summarize_chain
from sklearn.cluster import KMeans
import numpy as np
import numpy.linalg as npla

# config.py has API_KEY
from config import API_KEY

In [257]:
# Creates the OpenAI embeddings client (authenticated with API_KEY); it is used below
# to turn documents into vector representations (embeddings)
embeddings = OpenAIEmbeddings(openai_api_key=API_KEY)

In [258]:
# Cuts miniDataset into documents using a 3000-character chunk size and a 250-character
# overlap. The conversation border is the preferred place to cut; a plain newline is the fallback.
textSplitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=250,
    separators=[F"\n{BORDER_TEXT}\n", "\n"],
)
documents = textSplitter.create_documents([miniDataset])

In [259]:
# Embeds the text of every document: one number vector per document
pageContents = [document.page_content for document in documents]
vectors = embeddings.embed_documents(pageContents)

In [260]:
# Number of conversations actually present in miniDataset. Measuring the slice (rather
# than computing endConvo - startConvo) keeps the count correct when fewer than
# endConvo conversations exist.
numConvos = len(conversations[startConvo:endConvo])

# Each cluster will be a "summary" (PLAY AROUND WITH THIS NUMBER)
# About one cluster per 10 conversations, kept within what KMeans accepts:
# at least 1 cluster and no more clusters than there are documents to cluster.
numClusters = max(1, min(numConvos // 10, len(documents)))

In [271]:
# Uses the KMeans method to group the document vectors into numClusters clusters.
# random_state is fixed so that re-running the notebook yields the same clusters
# (and therefore the same representative documents further down).
clusters = KMeans(n_clusters = numClusters, n_init = 100, random_state = 42).fit(vectors)

# Centers of all "numClusters" clusters: one vector per cluster
clusterCenters = clusters.cluster_centers_
print(clusterCenters)

[[ 0.01359346  0.00032765  0.00971249 ... -0.00530306 -0.01771214
  -0.02150367]
 [-0.00635988 -0.01015473  0.01597559 ... -0.00918797 -0.01277389
  -0.02579569]
 [ 0.00251562 -0.00299122  0.00848585 ... -0.00149442 -0.00476005
  -0.032518  ]
 ...
 [-0.01446307 -0.00310797  0.00221868 ... -0.00747178 -0.00669437
  -0.02900437]
 [-0.00143609 -0.01169571  0.01272634 ... -0.01112262 -0.00924455
  -0.01925051]
 [-0.01194298 -0.00813089  0.00265234 ... -0.01238807 -0.00812238
  -0.0242796 ]]


In [272]:
# Pick, for every cluster center, the document vector that lies closest to it

# Converted once up front so that the subtraction inside the loop does not have to
# repeat any conversion (np.asarray does no copying if vectors already is an array)
vectorArray = np.asarray(vectors)

closestVectors = []
for center in clusterCenters:
    # Distances between each vector and the current center vector
    distances = npla.norm(vectorArray - center, axis = 1)

    # Index of the closest vector; int() keeps the printed list as plain integers
    closestVectors.append(int(np.argmin(distances)))

print(closestVectors)

# Collect the documents that correspond to these vector indices
# We will use these documents as a representative sample for the entire dataset
# Indices are de-duplicated (two centers may share the same nearest vector, and that
# document should only be summarized once) and sorted so the documents stay in their
# original order
closestVectors = sorted(set(closestVectors))
selectedDocs = [ documents[idx] for idx in closestVectors ]

[496, 287, 427, 604, 656, 566, 432, 679, 501, 151, 419, 533, 328, 392, 531, 343, 464, 180, 227, 614]


In [268]:
# Prompt text used for summarizing; {text} is the template's only input variable
summaryPrompt = """
Write a 80-100 word summary of the following:
"{text}"
CONCISE SUMMARY:
"""

summaryPromptTemplate = PromptTemplate(
    input_variables=["text"],
    template=summaryPrompt,
)

In [269]:
# GPT-3.5-turbo chat model used for summarizing conversations
llm = ChatOpenAI(
    model='gpt-3.5-turbo',
    temperature=0,
    max_tokens=2000,
    openai_api_key=API_KEY,
)

# Summarize chain of type "stuff" driven by summaryPromptTemplate
# Set verbose to True to see Langchain's thinking process
summarizeChain = load_summarize_chain(
    llm=llm,
    chain_type="stuff",
    prompt=summaryPromptTemplate,
    verbose=False,
)

In [274]:
# Summarizes each selected document on its own. Results are appended one at a time,
# so the summaries gathered so far remain available if the run is interrupted.
summaries = []
for selectedDoc in selectedDocs:
    summaries.append(summarizeChain.run([selectedDoc]))

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..


KeyboardInterrupt: 

In [None]:
# Joins the individual summaries into one text, separated by blank lines
summary = '\n\n'.join(summaries)

: 

In [None]:
# Prints the combined summary text
print(summary)

: 