# SQuAD

## Data Preprocessing

In [None]:
import pandas as pd
import json

squad_path = 'datasets/squad.json'

# Load the JSON file. The encoding is given explicitly so the read does not
# depend on the platform's default text encoding.
with open(squad_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the nested structure into one row per paragraph: each entry under
# "data" has a "title" and a list of "paragraphs", each with a "context" text.
rows = [
    {"title": entry["title"], "data": paragraph["context"]}
    for entry in data["data"]
    for paragraph in entry["paragraphs"]
]

df = pd.DataFrame(rows)

print(df.head())

## Segment Sentences

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import numpy as np

In [None]:
# Download NLTK's 'punkt' resource.
# NOTE(review): presumably required by the word_tokenize / sent_tokenize imports;
# no code visible in this notebook calls them, so confirm this download is still needed.
nltk.download('punkt')

In [None]:
# read a CSV dataset from disk
def load_data(file_path):
    """Read the CSV file at `file_path` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

# clean and normalize text
def clean_text(text):
    """Clean and normalize a piece of text.

    Keeps word characters, whitespace, periods, commas and apostrophes; every
    other character is removed. Runs of whitespace are then collapsed to a
    single space and leading/trailing whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Remove special characters first: deleting them can leave doubled or
    # dangling spaces behind (e.g. "a - b" -> "a  b"), so whitespace must be
    # normalized afterwards, not before.
    text = re.sub(r'[^\w\s\.\,\']', '', text)
    text = re.sub(r'\s+', ' ', text)  # Collapse excessive whitespace
    return text.strip()  # Remove leading/trailing whitespace

# tokenization and processing
def process(df):
    """Split each row's text into cleaned sentences.

    Args:
        df: DataFrame with a 'title' column and a 'data' column holding the
            text to split.

    Returns:
        A list with one dict per row of the form
        {'Title': <title>, 'Sentences': [<cleaned sentence>, ...]}.
        Fragments that are empty after cleaning are dropped.
    """
    processed = []
    for title, text in zip(df['title'], df['data']):
        # NOTE: this is a naive split on '.', so abbreviations and decimal
        # numbers are split as well.
        cleaned = (clean_text(fragment) for fragment in text.split('.'))
        # Text ending in '.' always yields a trailing empty fragment; skip
        # empty results so they are not embedded and indexed as sentences.
        sentences = [sentence for sentence in cleaned if sentence]
        processed.append({'Title': title, 'Sentences': sentences})
    return processed

# generate embeddings
def generate_embeddings(processed, model_name='all-MiniLM-L6-v2'):
    """Generate sentence embeddings using Sentence Transformers.

    Args:
        processed: List of dicts with a 'Title' key and a 'Sentences' key
            (list of strings), as produced by `process`.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        A tuple `(embeddings, metadata)`. `embeddings` is a 2-D float32 numpy
        array with one row per sentence; `metadata` is a list of dicts
        ({'Title', 'SentenceIndex', 'Sentence'}) in the same row order.
        With no sentences, `embeddings` has shape (0, dim) so that callers
        reading `embeddings.shape[1]` still work.
    """
    model = SentenceTransformer(model_name)

    # Flatten all sentences up front so the model can encode them in batches
    # instead of being invoked once per sentence.
    metadata = []
    for entry in processed:
        title = entry['Title']
        for i, sentence in enumerate(entry['Sentences']):
            metadata.append({
                'Title': title,
                'SentenceIndex': i,
                'Sentence': sentence
            })

    if not metadata:
        dimension = model.get_sentence_embedding_dimension()
        return np.empty((0, dimension), dtype=np.float32), metadata

    sentences = [item['Sentence'] for item in metadata]
    # NOTE(review): batched encoding may differ from per-sentence encoding in
    # the last floating-point digits — confirm this is acceptable.
    embeddings = model.encode(sentences, convert_to_numpy=True)
    return np.asarray(embeddings, dtype=np.float32), metadata

# put embeddings into a FAISS index
def build_faiss_index(embeddings):
    """Create a `faiss.IndexFlatL2` sized to the embedding width and add all rows.

    `embeddings` must expose `.shape[1]` (the vector dimension). The populated
    index is returned; nothing is written to disk here.
    """
    flat_l2 = faiss.IndexFlatL2(embeddings.shape[1])
    flat_l2.add(embeddings)
    return flat_l2

# save preprocessed data
def save_preprocessed_data(metadata, index, index_file='faiss_index',
                           metadata_file='metadata.csv'):
    """Save metadata and FAISS index to disk.

    Args:
        metadata: List of per-sentence dicts; written as a CSV without the
            DataFrame index column.
        index: FAISS index to serialize.
        index_file: Destination path for the FAISS index.
        metadata_file: Destination path for the metadata CSV.
    """
    # save metadata
    pd.DataFrame(metadata).to_csv(metadata_file, index=False)
    # save FAISS index
    faiss.write_index(index, index_file)

# main preprocessing pipeline
def preprocess_pipeline(df):
    """Run the complete preprocessing pipeline on an already-loaded DataFrame.

    Steps: split and clean sentences, embed them, build a FAISS index, then
    write the metadata CSV and the index to disk.

    Args:
        df: DataFrame with 'title' and 'data' columns (supplied by the caller;
            this function does not read the dataset itself).

    Returns:
        A tuple `(embeddings, metadata, index)` so callers can reuse the
        in-memory results instead of reloading them from disk.
    """
    print("Loading data...")

    print("Processing and cleaning stories...")
    processed = process(df)

    print("Generating embeddings...")
    embeddings, metadata = generate_embeddings(processed)

    print("Building FAISS index...")
    index = build_faiss_index(embeddings)

    print("Saving preprocessed data...")
    save_preprocessed_data(metadata, index, index_file='faiss_index')  # pass FAISS index
    print("Preprocessing complete!")

    return embeddings, metadata, index

if __name__ == '__main__':
    # `df` is the DataFrame built from the SQuAD JSON in the data-preprocessing cell.
    preprocess_pipeline(df)

In [None]:
# Read back the metadata CSV written by save_preprocessed_data and preview it.
processed_data = pd.read_csv('metadata.csv')
# Trailing bare expression: shown as the cell output when run in a notebook.
processed_data.head()

In [None]:
import numpy as np

# Persist the raw embedding matrix. The pipeline keeps its embeddings local, so
# the vectors are recovered from the flat FAISS index it wrote to 'faiss_index'
# (saving the index object itself with np.save would not store the vectors).
saved_index = faiss.read_index('faiss_index')
np.save('embeddings.npy', saved_index.reconstruct_n(0, saved_index.ntotal))

In [None]:
# restore the per-sentence metadata table from its CSV file
metadata = pd.read_csv('metadata.csv')

# restore the FAISS index from its file on disk
index = faiss.read_index('faiss_index')

In [None]:
from sentence_transformers import SentenceTransformer

# instantiate the sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# text to search for
query = "It's so good to be alive"

# embed the query: encode to a tensor, then move it to the CPU and convert to numpy
query_tensor = model.encode(query, convert_to_tensor=True)
query_embedding = query_tensor.cpu().numpy()

In [None]:
# number of closest matches to retrieve
top_k = 10

# search: the query is reshaped to a single-row 2-D array before being passed in
distances, indices = index.search(query_embedding.reshape(1, -1), top_k)

for rank, (idx, distance) in enumerate(zip(indices[0], distances[0]), start=1):
    # FAISS reports -1 as the label when fewer than top_k vectors are available;
    # without this guard, iloc[-1] would silently print the last metadata row.
    if idx < 0:
        continue
    match = metadata.iloc[idx]
    print(f"Result {rank}:")
    print(f"Sentence: {match['Sentence']}")
    # The metadata CSV is written with a 'Title' column (not 'StoryTitle').
    print(f"Story Title: {match['Title']}")
    print(f"Distance: {distance}")
    print()