### 🚀 Task 1: Load Dataset

In [None]:
import pandas as pd
import json

# Load the character dataset
characters_path = "characters.csv"
df_characters = pd.read_csv(characters_path)

# Preview the character dataset.
# A bare `df_characters.head()` is only rendered by Jupyter when it is the
# last expression of a cell; in the middle of a cell it is silently
# discarded, so the preview is emitted explicitly.
print(df_characters.head())

print("✅ Characters dataset loaded successfully!")

# Load the episode transcripts
trek_path = "trek.json"
with open(trek_path, "r", encoding="utf-8") as file:
    trek_transcripts = json.load(file)

# Show only the first 2000 characters of the pretty-printed JSON so the
# cell output stays readable.
print("✅ Trek transcripts loaded successfully!")
print(json.dumps(trek_transcripts, indent=2)[:2000])  # Display first part of JSON


### 🚀 Task 2: Preprocessing the Text

In [None]:
import spacy
import re

# Load the spaCy pipeline "en_core_web_sm" once and keep it in the shared
# `nlp` variable; preprocess_text() calls it for tokenization and lemmas.
# NOTE(review): the model package must already be installed in the kernel's
# environment for this load to succeed — confirm before a fresh run.
nlp = spacy.load("en_core_web_sm")

# Preprocessing function
def preprocess_text(text):
    """Normalize a transcript string and return its lemmas.

    Steps: lowercase the text, delete apostrophes, replace every other
    punctuation/special character with a space, run the shared spaCy
    pipeline ``nlp``, and keep the lemma of each purely alphabetic token.

    Args:
        text: Raw transcript text (a ``str``).

    Returns:
        A list of lemma strings, in document order.
    """
    text = text.lower()  # Convert to lowercase
    # Delete apostrophes (straight and typographic) so that names such as
    # O'Brien and T'Pol collapse into the single tokens "obrien" / "tpol".
    text = re.sub(r"['\u2018\u2019]", "", text)
    # Replace the remaining punctuation with a space rather than deleting it:
    # deleting would glue neighbouring words together
    # ("yes...captain" -> "yescaptain", "well-known" -> "wellknown").
    text = re.sub(r"[^\w\s]", " ", text)
    doc = nlp(text)  # Tokenize & Lemmatize
    # is_alpha drops numbers, underscores and whitespace-only tokens.
    return [token.lemma_ for token in doc if token.is_alpha]

# Run every episode of every show through preprocess_text, keeping the
# show -> [episode, ...] grouping of trek_transcripts.
processed_transcripts = dict(
    (show_name, list(map(preprocess_text, show_episodes)))
    for show_name, show_episodes in trek_transcripts.items()
)

print("✅ Text preprocessing completed!")


### 🚀 Task 3: Train Two Word2Vec Models

In [None]:
from gensim.models import Word2Vec

# Build the training corpus for Word2Vec.
# preprocess_text returns ONE flat list of lemmas per episode, so each
# episode's token list is already a "sentence" in gensim's sense (a list of
# string tokens). Flattening one level further would hand Word2Vec bare
# strings, which it iterates character by character and thus learns a
# vocabulary of single letters instead of words.
# gensim truncates any single text longer than 10000 words, so long episodes
# are split into consecutive chunks of at most that size (this also skips
# episodes that produced no tokens).
MAX_TOKENS_PER_SENTENCE = 10000
all_sentences = [
    episode[start:start + MAX_TOKENS_PER_SENTENCE]
    for episodes in processed_transcripts.values()
    for episode in episodes
    for start in range(0, len(episode), MAX_TOKENS_PER_SENTENCE)
]

# Train Word2Vec with window size = 2
w2v_model_2 = Word2Vec(sentences=all_sentences, vector_size=300, window=2, min_count=1, workers=4)

# Train Word2Vec with window size = 10
w2v_model_10 = Word2Vec(sentences=all_sentences, vector_size=300, window=10, min_count=1, workers=4)

print("✅ Two Word2Vec models trained successfully!")


### 🚀 Task 4: Compute Cosine Similarities for Characters

In [None]:
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
import numpy as np

# Extract character names
# Lower-cased because they are looked up in the Word2Vec vocabulary, whose
# training text is lower-cased by preprocess_text.
# NOTE(review): assumes the "name" column holds single-word names without
# punctuation; multi-word or punctuated names would presumably never match a
# vocabulary token — confirm against characters.csv.
character_names = df_characters["name"].str.lower().tolist()

# Function to compute cosine similarity matrix
def compute_similarity_matrix(model, character_list):
    """Build a pairwise cosine-similarity matrix for the given words.

    Args:
        model: Object exposing ``wv``, which supports ``in`` and ``[]``
            lookups from a word to its vector.
        character_list: Sequence of words; entry ``i`` labels row and
            column ``i`` of the result.

    Returns:
        A ``len(character_list) x len(character_list)`` NumPy array whose
        cell ``[i, j]`` is ``1 - cosine_distance`` of the two word vectors,
        or 0 when either word is missing from ``model.wv``.
    """
    size = len(character_list)
    vectors = model.wv
    # Start from zeros: any pair involving a missing word keeps similarity 0.
    scores = np.zeros((size, size))

    for row, first in enumerate(character_list):
        if first not in vectors:
            continue
        for col, second in enumerate(character_list):
            if second not in vectors:
                continue
            # scipy's cosine() is a distance, so convert it to a similarity.
            scores[row, col] = 1 - cosine(vectors[first], vectors[second])

    return scores

# Compute similarity matrices for both models
similarity_matrix_2 = compute_similarity_matrix(w2v_model_2, character_names)
similarity_matrix_10 = compute_similarity_matrix(w2v_model_10, character_names)

# Convert to DataFrames labelled by character name on both axes
df_similarity_2 = pd.DataFrame(similarity_matrix_2, index=character_names, columns=character_names)
df_similarity_10 = pd.DataFrame(similarity_matrix_10, index=character_names, columns=character_names)

# Preview both matrices. Bare `.head()` expressions in the middle of a cell
# are discarded by Jupyter, so the previews are emitted explicitly.
print(df_similarity_2.head())
print(df_similarity_10.head())

print("✅ Cosine similarity matrices computed!")


### 🚀 Task 5: Compute Similarities for Roles

In [None]:
# Extract character roles (lower-cased to match the lower-cased training text)
character_roles = df_characters["role"].str.lower().tolist()

# Compute similarity matrices for roles
similarity_matrix_roles_2 = compute_similarity_matrix(w2v_model_2, character_roles)
similarity_matrix_roles_10 = compute_similarity_matrix(w2v_model_10, character_roles)

# Convert to DataFrames labelled by role on both axes
df_similarity_roles_2 = pd.DataFrame(similarity_matrix_roles_2, index=character_roles, columns=character_roles)
df_similarity_roles_10 = pd.DataFrame(similarity_matrix_roles_10, index=character_roles, columns=character_roles)

# Preview both matrices. Bare `.head()` expressions in the middle of a cell
# are discarded by Jupyter, so the previews are emitted explicitly.
print(df_similarity_roles_2.head())
print(df_similarity_roles_10.head())

print("✅ Role-based similarity matrices computed!")
