# In [None]:
import json
import os
import pickle
import time
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from google.colab import drive
from tqdm import tqdm

###############################################################################
# CONFIGURE GOOGLE DRIVE MOUNTING AND SPACY SETUP
###############################################################################

# Mount Google Drive at /content/drive so the dataset paths used later in this
# script (all under /content/drive/MyDrive/...) resolve.
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Download and load spaCy model.
# `import spacy.cli` also binds the top-level `spacy` name used below.
import spacy.cli
spacy.cli.download("en_core_web_lg")
# NOTE(review): require_gpu() is expected to fail when no GPU is available;
# confirm the Colab runtime has a GPU attached before running this cell.
spacy.require_gpu()
# `nlp` is the shared pipeline object used by process_row() for sentence splitting.
nlp = spacy.load("en_core_web_lg")

###############################################################################
# CHECKPOINT RECONSTRUCTION FROM PREVIOUSLY PROCESSED DATA
###############################################################################

# Set paths for output and checkpoint files
output_file = "/content/drive/MyDrive/COPENHAGEN/MSC THESIS/datasets/essays/full_chunked_local_minima.json"
checkpoint_file = "/content/drive/MyDrive/COPENHAGEN/MSC THESIS/datasets/essays/processed_authids.txt"

# Reconstruct processed AUTHIDs from output file.
# The output file is JSON Lines: one chunk record per non-blank line, each
# carrying the '#AUTHID' of the essay it came from.
processed_authids = set()
if os.path.exists(output_file):
    with open(output_file, 'r') as f:
        for line in f:
            if line.strip():
                chunk = json.loads(line)
                processed_authids.add(chunk['#AUTHID'])
else:
    # First run: nothing has been written yet, so nothing counts as processed.
    # Without this guard a fresh run would crash with FileNotFoundError.
    print("No existing output file found; starting with an empty checkpoint.")

# Save reconstructed AUTHIDs to checkpoint file (overwrites any previous
# checkpoint so it always mirrors what is actually in the output file).
with open(checkpoint_file, 'w') as cf:
    for auth_id in processed_authids:
        cf.write(auth_id + '\n')

print("Checkpoint reconstructed. Processed AUTHIDs have been saved to processed_authids.txt.")

###############################################################################
# LOAD HUGGING FACE MODEL AND TOKENIZER FOR EMBEDDING EXTRACTION
###############################################################################

import torch
from transformers import AutoTokenizer, AutoModel

# Load Hugging Face model and tokenizer.
# `tokenizer` and `model` are module-level globals consumed by
# get_batch_embeddings().
model_name = "dunzhang/stella_en_1.5B_v5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the model is not moved to a device here, so it stays wherever
# from_pretrained() places it (presumably CPU) — confirm this is intended.
model = AutoModel.from_pretrained(model_name)

# Add sentence segmentation pipeline to spaCy.
# NOTE(review): the loaded spaCy pipeline may already assign sentence
# boundaries; verify the sentencizer is needed on top of it.
nlp.add_pipe('sentencizer')

###############################################################################
# EMBEDDING AND SIMILARITY FUNCTIONS
###############################################################################

def get_batch_embeddings(sentences: List[str], batch_size: int = 16) -> List[np.ndarray]:
    """
    Computes embeddings for a batch of sentences using the preloaded Hugging Face model.

    Sentences are tokenized in groups of ``batch_size`` (padded to the longest
    sentence in the group, truncated to 512 tokens), run through the global
    ``model`` without gradient tracking, and mean-pooled over the token axis.

    The pooling is weighted by the attention mask so padding tokens do not
    contribute to the average; otherwise a sentence's embedding would depend
    on how long the other sentences in its batch happen to be.

    Args:
        sentences: Sentences to embed. An empty list yields an empty result.
        batch_size: Number of sentences per forward pass.

    Returns:
        One NumPy vector per input sentence, in input order.
    """
    embeddings: List[np.ndarray] = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # Keep inputs on the same device as the model's weights.
        inputs = inputs.to(model.device)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # (batch, seq, 1) mask: 1 for real tokens, 0 for padding.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_count = mask.sum(dim=1).clamp(min=1)
        batch_embeddings = (token_sum / token_count).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return embeddings

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Computes cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a Python float.
        If either vector has zero norm the similarity is undefined; 0.0 is
        returned instead of NaN so downstream comparisons stay well-behaved.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)

def find_boundaries(similarities: List[float]) -> List[int]:
    """
    Locate chunk boundaries at strict local minima of the similarity series.

    An interior score counts as a minimum only when it is strictly lower than
    both of its neighbours; the first and last scores are never considered.
    For a minimum at position ``k`` the reported boundary is ``k + 1``.
    """
    # Slide a three-wide window over the scores; window number `offset`
    # is centred on position offset + 1, so its boundary is offset + 2.
    windows = zip(similarities, similarities[1:], similarities[2:])
    return [
        offset + 2
        for offset, (before, centre, after) in enumerate(windows)
        if centre < before and centre < after
    ]

###############################################################################
# PASS 1: PROCESSING FUNCTION TO SPLIT TEXT INTO CHUNKS BASED ON LOCAL MINIMA
###############################################################################

def _build_chunk_record(row: pd.Series, chunk_number: int, text: str, run_time: float) -> Dict[str, Any]:
    """Assemble one output record, copying the author ID and trait labels from the row."""
    return {
        'Chunk Number': chunk_number,
        '#AUTHID': row['#AUTHID'],
        'TEXT': text,
        'chunking_method': 'semantic_similarity_local_minima',
        'model': 'stella-1.5B',
        'run_time': run_time,
        'cEXT': row['cEXT'],
        'cNEU': row['cNEU'],
        'cAGR': row['cAGR'],
        'cCON': row['cCON'],
        'cOPN': row['cOPN'],
    }


def process_row(row: pd.Series) -> List[Dict[str, Any]]:
    """
    Processes a single row of the input DataFrame, splitting the text into chunks based on semantic similarity.

    The row's 'TEXT' is split into sentences with the global spaCy pipeline,
    each sentence is embedded, and a new chunk starts wherever the similarity
    between consecutive sentences hits a local minimum.

    Args:
        row: A row providing 'TEXT', '#AUTHID' and the five label columns
            'cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'.

    Returns:
        A non-empty list of chunk records (1-based 'Chunk Number'). Texts with
        at most one sentence produce a single record. 'run_time' is the number
        of seconds elapsed since this call started, measured when the record
        is created.
    """
    # Timer is local to the call: the file defines no global `start_time`,
    # so the previous reference raised NameError on every row.
    start_time = time.time()

    text = row['TEXT']
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    # Handle texts with one or no sentences
    if len(sentences) <= 1:
        return [_build_chunk_record(row, 1, " ".join(sentences), time.time() - start_time)]

    # Generate embeddings in batches
    embeddings = get_batch_embeddings(sentences)

    # Compute similarity between consecutive sentences
    similarities = [cosine_similarity(left, right) for left, right in zip(embeddings, embeddings[1:])]
    boundaries = find_boundaries(similarities)

    # Create chunks: each consecutive pair of start indices delimits one chunk.
    chunk_start_indices = [0] + boundaries + [len(sentences)]
    chunks = []
    for chunk_number, (start_idx, end_idx) in enumerate(
            zip(chunk_start_indices, chunk_start_indices[1:]), start=1):
        chunk_text = " ".join(sentences[start_idx:end_idx])
        chunks.append(_build_chunk_record(row, chunk_number, chunk_text, time.time() - start_time))
    return chunks

###############################################################################
# PASS 2: PROCESSING AND MERGING CHUNKS BASED ON COSINE SIMILARITY
###############################################################################

def process_and_merge_chunks(input_path: str, output_path: str, merge_threshold: float = 0.4):
    """
    Chunks every not-yet-processed essay in a CSV and appends the chunks to a JSON Lines file.

    Despite its name, this function performs no merging: ``merge_threshold``
    is accepted for interface compatibility but is currently unused.

    Rows whose '#AUTHID' already appears in the global ``checkpoint_file`` are
    skipped. Each remaining row is passed to ``process_row`` on a two-worker
    thread pool; results are consumed in input order, written to
    ``output_path`` one JSON object per line, and the row's '#AUTHID' is then
    appended to the checkpoint so an interrupted run can resume.

    Args:
        input_path: CSV file (read as latin1) with a '#AUTHID' column plus the
            columns ``process_row`` needs.
        output_path: JSON Lines file to append chunk records to.
        merge_threshold: Unused.
    """
    df = pd.read_csv(input_path, encoding="latin1")
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'r') as cf:
            completed_authids = {line.strip() for line in cf if line.strip()}
    else:
        completed_authids = set()

    remaining_rows = df[~df['#AUTHID'].isin(completed_authids)]

    # NOTE(review): process_row shares the global spaCy pipeline and
    # transformer model across worker threads — confirm both are safe to call
    # concurrently in this environment.
    with ThreadPoolExecutor(max_workers=2) as executor:
        pending_rows = (row for _, row in remaining_rows.iterrows())
        results = executor.map(process_row, pending_rows)
        for row_chunks in tqdm(results, total=len(remaining_rows), desc="Processing rows"):
            # Files are reopened per row so each completed row is closed out
            # on disk before its AUTHID is checkpointed.
            with open(output_path, 'a') as f:
                f.writelines(json.dumps(chunk) + '\n' for chunk in row_chunks)
            current_authid = row_chunks[0]['#AUTHID']
            with open(checkpoint_file, 'a') as cf:
                cf.write(current_authid + '\n')

    print("Chunks based on semantic content saved successfully!")