In [None]:
from sentence_transformers import SentenceTransformer
import os
import json
import numpy as np
from google.colab import drive
import torch

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the project
# paths used by the following cells all live under this mount point.
drive.mount("/content/drive")

In [None]:
# Per-collaborator Google Drive locations of the project data folder.
# The first entry that exists on the mounted drive is used, so nobody has to
# comment/uncomment lines before running the notebook.
_PROJECT_PATH_CANDIDATES = [
    "/content/drive/MyDrive/Uni/DS 5983/Final Project/Data/",  # Nicholai
    "/content/drive/MyDrive/DS5983-FinalProjcet/",  # Tyree (spelling kept exactly as in the original path)
    "/content/drive/MyDrive/Final Project/Data/",  # Shreya (folder taken from the VectorDB.json path used later in this notebook)
]

project_path = next(
    (candidate for candidate in _PROJECT_PATH_CANDIDATES if os.path.isdir(candidate)),
    None,
)
if project_path is None:
    # Same exception type os.chdir would raise, but with an actionable message.
    raise FileNotFoundError(
        "None of the known project folders exist; is Google Drive mounted? Tried: "
        + ", ".join(_PROJECT_PATH_CANDIDATES)
    )

# Later cells open data files and write the vector database relative to this folder.
os.chdir(project_path)

In [None]:
# Sanity check: print the current working directory (changed by os.chdir above).
!pwd

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
# Name of the embedding model to load. Exactly one assignment should be active;
# the commented lines are the alternatives that were tried.
EMBEDDING_MODEL = "sujet-ai/Marsilia-Embeddings-EN-Large" # this was the first model
#EMBEDDING_MODEL = "ProsusAI/finbert" # trained on sentiment classification, so it needed to append mean-pooling layer for sentence transformer
#EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # General purpose, fast
#EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"  # Higher quality

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Load the selected model onto the chosen device.
model = SentenceTransformer(EMBEDDING_MODEL, device=device)

In [None]:
# switch function for chunking type
def choose_chunking(doc, type="sentence"):
  """Chunk the text file ``doc`` with the strategy named by ``type``.

  Args:
    doc: Path of the text file to chunk.
    type: Name of the strategy: "sentence" (sentence_chunks), "fixed"
      (read_file_in_chunks_fixed) or "overlap" (read_file_in_chunks_overlap).
      The parameter shadows the ``type`` builtin; the name is kept because
      callers pass it by keyword.

  Returns:
    The list of chunk strings produced by the selected chunker (called with
    its default settings).

  Raises:
    ValueError: If ``type`` is not one of the supported strategy names.
  """
  if type == "sentence":
    return sentence_chunks(doc)
  if type == "fixed":
    return read_file_in_chunks_fixed(doc)
  if type == "overlap":
    return read_file_in_chunks_overlap(doc)
  # Previously an unknown name fell through and returned None, which only
  # surfaced later as a confusing error in the embedding step.
  raise ValueError(
      f"Unknown chunking type {type!r}; expected 'sentence', 'fixed' or 'overlap'"
  )

# sentence chunking
def sentence_chunks(doc):
  """Return every non-blank line of ``doc`` as one chunk.

  The file is split on line breaks only, so this yields sentences only if the
  input holds one sentence per line (assumption about the data files - TODO
  confirm).

  Args:
    doc: Path of the text file to read (decoded as UTF-8).

  Returns:
    List of lines with surrounding whitespace (including the trailing
    newline) removed; lines that are empty or whitespace-only are skipped.
  """
  # Explicit encoding so the result does not depend on the platform default.
  with open(doc, 'r', encoding='utf-8') as file:
    # Strip each line so chunks match the other chunkers, which never return
    # text with leading/trailing whitespace.
    stripped_lines = (line.strip() for line in file)
    return [line for line in stripped_lines if line]


# Fixed word chunking
def read_file_in_chunks_fixed(doc, target_words=200):
    """Split ``doc`` into consecutive chunks of ``target_words`` words.

    Words are the whitespace-separated tokens of the file; each chunk is those
    words re-joined with single spaces. Chunks do not overlap, and the last
    chunk may hold fewer than ``target_words`` words.

    Args:
        doc: Path of the text file to read (decoded as UTF-8).
        target_words: Number of words per chunk; must be at least 1.

    Returns:
        List of chunk strings in document order (empty for a file with no words).

    Raises:
        ValueError: If ``target_words`` is smaller than 1.
    """
    # A negative step would silently produce no chunks at all; fail loudly instead.
    if target_words < 1:
        raise ValueError(f"target_words must be a positive integer, got {target_words!r}")

    # Explicit encoding so the result does not depend on the platform default.
    with open(doc, 'r', encoding='utf-8') as file:
        words = file.read().split()

    # Every slice starts inside `words`, so it holds at least one non-empty
    # token and no emptiness check is needed.
    return [
        ' '.join(words[start:start + target_words])
        for start in range(0, len(words), target_words)
    ]

def read_file_in_chunks_overlap(doc, target_words=100, overlap_words=10):
    """Split ``doc`` into overlapping word windows that prefer to end on a sentence.

    Each window holds up to ``target_words`` whitespace-separated words. For
    every window except the last, the final 20% of its words are searched
    (from the end) for a word that finishes a sentence, i.e. one ending in "."
    optionally followed by closing quotes/brackets. If one is found the chunk
    is cut right after that word, so chunks are often shorter than
    ``target_words``. The next window starts ``overlap_words`` words before
    the end of the chunk just emitted.

    Only whole words are ever cut, so a period inside a token such as "2.5"
    is not treated as a sentence end.

    Args:
        doc: Path of the text file to read (decoded as UTF-8).
        target_words: Maximum number of words per chunk; must be at least 1.
        overlap_words: Number of words shared between consecutive chunks;
            must satisfy ``0 <= overlap_words < target_words``.

    Returns:
        List of chunk strings in document order (empty for a file with no words).

    Raises:
        ValueError: If ``target_words`` or ``overlap_words`` is out of range.
    """
    if target_words < 1:
        raise ValueError(f"target_words must be a positive integer, got {target_words!r}")
    if not 0 <= overlap_words < target_words:
        raise ValueError(
            "overlap_words must satisfy 0 <= overlap_words < target_words, "
            f"got overlap_words={overlap_words!r}, target_words={target_words!r}"
        )

    # Explicit encoding so the result does not depend on the platform default.
    with open(doc, 'r', encoding='utf-8') as file:
        words = file.read().split()

    # How many trailing words of a window are searched for a sentence end.
    # This is 0 for target_words < 5, in which case no search is done.
    tail_len = int(target_words * 0.2)
    # Characters allowed after the period of a sentence-final word: " ' ) ] and curly closing quotes.
    closers = '"\')]\u201d\u2019'

    chunks = []
    i = 0
    while i < len(words):
        chunk_words = words[i:i + target_words]

        if i + target_words >= len(words):
            # Final window: take whatever is left and stop.
            chunks.append(' '.join(chunk_words))
            break

        # Default: keep the full window unless a sentence end is found in its tail.
        kept = len(chunk_words)
        for j in range(len(chunk_words) - 1, len(chunk_words) - 1 - tail_len, -1):
            if chunk_words[j].rstrip(closers).endswith('.'):
                kept = j + 1
                break

        chunks.append(' '.join(chunk_words[:kept]))

        # Step back by the overlap, but always advance by at least one word so
        # the loop terminates even when the overlap is close to the chunk size.
        i += max(1, kept - overlap_words)

    return chunks

In [None]:
# Build the vector database: every document is chunked, each chunk is embedded,
# and the result is stored in jsonDict as
#   {document file name: [{"chunk_id": int, "chunk_text": str, "embed": [float, ...]}, ...]}
CHUNK_OPTION = "overlap" # 'sentence', "fixed" or "overlap"

fomc_docs = ["FOMC_min_JAN25.txt", "FOMC_min_MARCH25.txt", "FOMC_min_MAY25.txt", "FOMC_min_JUNE25.txt"]
earn_docs = ["Alphabet_GOOGL_Q2_2025_Earnings_Call_Transcript.txt", "Amazon_Q1_MAY2025_Earnings_Call.txt",
             "APPL_EarningsCall_Q1_JAN25.txt", "META-Q1-2025-Earnings-Call-Transcript-1.txt",
             "MSFT_EarningsCall_Q3_APRIL25.txt"]
annual_docs = ["microsoft_2024_structured.txt", "meta_2024_structured.txt", "google_2024_structured.txt",
               "amazon_2024_structured.txt","apple_2024_structured.txt"] #"AAPL_10-k_24.txt" IT is a google doc still

# Data sub-folder (relative to project_path) -> documents to embed from it.
docs_by_dir = {
    "FOMC Data": fomc_docs,
    "Company Data/Earnings Calls": earn_docs,
    "Company Data/Annual Reports": annual_docs,
}
data_dirs = list(docs_by_dir)

jsonDict = {}
for data_dir, docs in docs_by_dir.items():
  for doc in docs:
    # Build the full path instead of os.chdir-ing into each folder: a failure
    # part-way through then cannot leave the kernel in the wrong directory.
    doc_path = os.path.join(project_path, data_dir, doc)
    dataset = choose_chunking(doc_path, type=CHUNK_OPTION)

    # embed each chunk (each chunk gets a separate embedding)
    embeddings = model.encode(dataset, batch_size=32, normalize_embeddings=True)

    # key is doc name, value is list of chunk records; file names therefore
    # have to be unique across the data folders.
    jsonDict[doc] = [
        {
            "chunk_id": i,
            "chunk_text": text,
            "embed": embedding.tolist()
        }
        for i, (text, embedding) in enumerate(zip(dataset, embeddings))
    ]

    print("Document:", doc)
    print(f'Loaded {len(jsonDict[doc])} entries')



In [None]:
# Save the Vector Database: write jsonDict as JSON into the current working directory.
# NOTE(review): the file name looks like it encodes the model / chunk settings and is
# not derived from CHUNK_OPTION or EMBEDDING_MODEL - update it by hand when those change.
output_name = "VectorDB_Mar_100_Window.json"
with open(output_name, "w") as out_file:
    json.dump(jsonDict, out_file)

In [None]:
# sample of how to access: jsonDict[<document name>][<chunk index>] is one chunk record,
# a dict with "chunk_id", "chunk_text" and "embed" (the complete embedding list is displayed).
jsonDict['FOMC_min_JAN25.txt'][32]

**Examining Average Length of Chunk**

In [None]:
# Load a previously saved vector database from the project folder.
# The path is built from project_path, so it follows whichever collaborator's
# folder is active instead of needing its own per-user edit.
# NOTE(review): this reads "VectorDB.json", while the save cell above writes
# "VectorDB_Mar_100_Window.json" - confirm which file is meant to be analysed.
path = os.path.join(project_path, 'VectorDB.json')

with open(path, "r") as file:
    Vector_db = json.load(file)

In [None]:
# Or don't load (analyze immediately after): use the in-memory database built above.
Vector_db = jsonDict

# Text of every chunk of every document, in document order, as one flat list.
flat_chunks = [
    record['chunk_text']
    for records in Vector_db.values()
    for record in records
]


In [None]:
# Number of whitespace-separated words in each chunk.
word_counts = list(map(lambda text: len(text.split()), flat_chunks))

# Calculate average (reported as 0 when there are no chunks, to avoid dividing by zero)
if word_counts:
    average_words = sum(word_counts) / len(word_counts)
else:
    average_words = 0

print(f"Average words per chunk: {average_words:.2f}")
print(f"Total chunks: {len(flat_chunks)}")
print(f"Min words: {min(word_counts, default=0)}")
print(f"Max words: {max(word_counts, default=0)}")

In [None]:
# Index of the first chunk whose word count is the largest.
max_index = max(range(len(word_counts)), key=word_counts.__getitem__)

# Get the chunk with max words and display it as the cell output
max_chunk = flat_chunks[max_index]
max_chunk

In [None]:
# notice any overlap (default is 10 words) (for overlap chunking style)
# Slicing prints at most the first five chunks and does not fail when fewer exist.
for chunk in flat_chunks[:5]:
  print(chunk, "\n")