In [1]:
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pathlib import Path

In [2]:
INPUT = Path(r"C:\Users\sanja\Documents\4th year\7sem\Natural Language Processing\Assignments\Individual\Implementation\character-network-dialogue-sentiment\data\friends.csv")
OUTPUT = Path(r"C:\Users\sanja\Documents\4th year\7sem\Natural Language Processing\Assignments\Individual\Implementation\character-network-dialogue-sentiment\data\friends_preprocessed.csv")

In [3]:
nlp = spacy.load("en_core_web_sm", disable=["parser"])



In [4]:
# NLTK downloads
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [6]:
emotion_words = {
    "oh", "hey", "what", "wow", "ugh", "oops",
    "nah", "hmm", "huh", "ah", "no", "yes"
}

In [7]:
def clean_text(text):
    if not isinstance(text, str) or text.strip() == "":
        return ""

    text = text.lower().strip()
    text = text.replace("’", "'")

    # Expand contractions
    contractions = {
        "n't": " not", "'re": " are", "'s": " is",
        "'ll": " will", "'ve": " have", "'d": " would"
    }
    for c, r in contractions.items():
        text = text.replace(c, r)

    # Remove emojis / symbols, keep emotional punctuation
    text = re.sub(r"[^a-z!?'\s]", " ", text)

    text = re.sub(r"\s+", " ", text).strip()

    tokens = nltk.word_tokenize(text)

    processed = []
    for tok in tokens:
        if tok in emotion_words:
            processed.append(tok)
            continue
        if tok in stop_words or len(tok) <= 1:
            continue
        processed.append(lemmatizer.lemmatize(tok))

    return " ".join(processed)

In [8]:
def extract_spacy_features(text):
    if not isinstance(text, str) or not text.strip():
        return [], []
    doc = nlp(text)
    pos = [(t.text, t.pos_) for t in doc]
    ents = [(e.text, e.label_) for e in doc.ents]
    return pos, ents

In [9]:
df = pd.read_csv(INPUT)
df.head()

Unnamed: 0,speaker,utterance,emotion,annotation
0,Phoebe,"Oh my God, he's lost it. He's totally lost it.",non-neutral,2120
1,Monica,What?,surprise,1000130
2,Ross,"Or! Or, we could go to the bank, close our acc...",neutral,3000200
3,Chandler,You're a genius!,joy,500000
4,Joey,"Aww, man, now we won't be bank buddies!",sadness,40100


In [10]:
if "dialogue_id" not in df.columns:
    print("dialogue_id missing → generating dialogue groups...")
    df["dialogue_id"] = 0  # everything is 1 long dialogue
    df["dialogue_id"] = df.index // 1000  # split every 1000 lines as one dialogue

if "turn_id" not in df.columns:
    print("turn_id missing → generating turn IDs...")
    df["turn_id"] = df.groupby("dialogue_id").cumcount()

dialogue_id missing → generating dialogue groups...
turn_id missing → generating turn IDs...


In [11]:
df["utterance_clean"] = df["utterance"].astype(str).apply(clean_text)

In [12]:
pos_list = []
ent_list = []

In [13]:
for text in df["utterance"].astype(str):
    pos, ents = extract_spacy_features(text)
    pos_list.append(pos)
    ent_list.append(ents)

In [14]:
df["pos_tags"] = pos_list
df["entities"] = ent_list

In [15]:
df.to_csv(OUTPUT, index=False, encoding="utf-8-sig")
print(f"Saved → {OUTPUT}")

Saved → C:\Users\sanja\Documents\4th year\7sem\Natural Language Processing\Assignments\Individual\Implementation\character-network-dialogue-sentiment\data\friends_preprocessed.csv


In [17]:
df.head()

Unnamed: 0,speaker,utterance,emotion,annotation,dialogue_id,turn_id,utterance_clean,pos_tags,entities
0,Phoebe,"Oh my God, he's lost it. He's totally lost it.",non-neutral,2120,0,0,oh god lost totally lost,"[(Oh, INTJ), (my, PRON), (God, PROPN), (,, PUN...",[]
1,Monica,What?,surprise,1000130,0,1,what,"[(What, PRON), (?, PUNCT)]",[]
2,Ross,"Or! Or, we could go to the bank, close our acc...",neutral,3000200,0,2,could go bank close account cut source,"[(Or, CCONJ), (!, PUNCT), (Or, CCONJ), (,, PUN...",[]
3,Chandler,You're a genius!,joy,500000,0,3,genius,"[(You, PRON), ('re, AUX), (a, DET), (genius, N...",[]
4,Joey,"Aww, man, now we won't be bank buddies!",sadness,40100,0,4,aww man wo bank buddy,"[(Aww, PROPN), (,, PUNCT), (man, NOUN), (,, PU...",[]
