# Word2Vec on Friends TV Show

In [8]:
# !pip install convokit gensim pandas matplotlib scikit-learn

In [9]:
from convokit import Corpus, download
# Fetch the Friends corpus (the recorded output shows ConvoKit reporting an
# already-present local copy) and load it.
corpus_path = download("friends-corpus")
corpus = Corpus(filename=corpus_path)

# Collect the text of every utterance, in corpus iteration order.
dialogues = [utterance.text for utterance in corpus.iter_utterances()]

Dataset already exists at /root/.convokit/saved-corpora/friends-corpus


In [10]:
import re

def clean_text_simple(text):
    """Turn one line of dialogue into a list of lowercase word tokens.

    Steps:
      1. Drop parenthesised stage directions such as ``(laughs)``.
      2. Remove apostrophes so contractions stay a single token
         (``don't`` -> ``dont``).
      3. Treat every other non-letter character as a word separator, so words
         joined only by punctuation (``wait...what``, ``oh--my``) are split
         instead of being fused into one bogus token.
      4. Lowercase and split on whitespace.

    Args:
        text: A single utterance as a string.

    Returns:
        A list of tokens containing only the ASCII letters a-z; empty if the
        line has no letters left after cleaning.
    """
    # Replace with a space (not '') so text on either side of a stage
    # direction is not glued together.
    text = re.sub(r'\([^)]*\)', ' ', text)
    # Strip straight and curly apostrophes before the generic punctuation
    # pass, otherwise "don't" would be split into "don" and "t".
    text = re.sub("['\u2019]", '', text)
    # Any remaining non-letter, non-whitespace character acts as a separator.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    return text.lower().split()

# Some utterances may carry non-string text; skip those, then tokenize the rest.
text_lines = (line for line in dialogues if isinstance(line, str))
sentences = [clean_text_simple(line) for line in text_lines]

In [11]:
# !pip install scipy==1.10.1 --upgrade --force-reinstall


In [12]:
# !pip uninstall -y scipy gensim
# !pip install scipy==1.10.1 gensim==4.3.0
from gensim.models import Word2Vec
# Training hyperparameters, gathered in one place so they are easy to tune.
w2v_params = dict(
    vector_size=100,  # embedding dimensionality
    window=5,         # context window size
    min_count=2,      # minimum word frequency to enter the vocabulary
    workers=4,        # training threads
)
model = Word2Vec(sentences, **w2v_params)

# Persist the trained model to disk.
model.save("friends_word2vec.model")

In [13]:
# Quick sanity probes on the trained embeddings.
embeddings = model.wv

# Ten nearest neighbours of "ross" as (word, similarity) pairs.
ross_neighbours = embeddings.most_similar("ross", topn=10)
print(ross_neighbours)

# Similarity score for a single pair of characters.
ross_rachel_score = embeddings.similarity("ross", "rachel")
print(ross_rachel_score)

# The word that fits least with the rest of the list.
odd_one_out = embeddings.doesnt_match(["joey", "phoebe", "chandler", "dragon"])
print(odd_one_out)

[('joey', 0.9022005200386047), ('rachel', 0.8875781893730164), ('monica', 0.871387243270874), ('chandler', 0.8654232025146484), ('phoebe', 0.8528593182563782), ('emily', 0.7565320134162903), ('mike', 0.7006371021270752), ('janice', 0.6963511109352112), ('julie', 0.6807359457015991), ('mark', 0.6688662767410278)]
0.88757825
dragon


In [23]:
import plotly.graph_objs as go
from sklearn.decomposition import PCA
import numpy as np

# Candidate words to visualise.
words = ["ross", "rachel", "joey", "monica", "phoebe", "chandler", "love", "marriage", "baby", "coffee"]

# Filter by vocabulary membership ONCE and use the same list for both the
# vectors and the point labels. If the labels came from the unfiltered list,
# a single out-of-vocabulary word would shift every later label onto the
# wrong point.
plotted_words = [word for word in words if word in model.wv]
if len(plotted_words) < 3:
    # PCA cannot extract 3 components from fewer than 3 samples; fail with a
    # message that names the actual cause.
    raise ValueError(
        f"Need at least 3 in-vocabulary words for a 3D PCA, "
        f"got {len(plotted_words)}: {plotted_words}"
    )
vectors = np.array([model.wv[word] for word in plotted_words])

# Project the embeddings onto their first three principal components.
pca = PCA(n_components=3)
xyz = pca.fit_transform(vectors)

fig = go.Figure(data=[go.Scatter3d(
    x=xyz[:, 0],
    y=xyz[:, 1],
    z=xyz[:, 2],
    mode='markers+text',
    text=plotted_words,  # same order and length as the rows of xyz
    textposition='top center',
    marker=dict(
        size=6,
        color='skyblue',
        opacity=0.8,
        line=dict(color='black', width=1)
    )
)])

fig.update_layout(
    title="Word2Vec - Friends Embeddings (3D PCA)",
    scene=dict(
        xaxis_title='PC1',
        yaxis_title='PC2',
        zaxis_title='PC3'
    ),
    margin=dict(l=0, r=0, b=0, t=30)
)

fig.show()
