# Sentence Embeddings

Run this code to get the sentence embeddings for the script data. The file is too large for GitHub, so you can generate it here.

In [12]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import re
import numpy as np

In [3]:
# read the complete script dataset from the hosted CSV
complete = pd.read_csv('https://scmcqueen.github.io/StarTrekScriptData/complete_data.csv')

# pull the quote column out as a plain Python list
sentences = complete['quote'].tolist()

# show the first five quotes
sentences[:5]

["   You know, Morn -- there's nothing    quite as invigorating as breakfast    in a bar. Where else can you get    raw slug livers first thing in the    morning?",
 "   What's this?",
 '   What do you mean, "what\'s this?"    It\'s puree of beetle.',
 "   I didn't order it.",
 '   Of course you "didn\'t order it" --    you don\'t need to order it. You    have it after work every morning.']

In [4]:
# load the sentence-transformers/all-distilroberta-v1 model used to embed the quotes
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

In [5]:
# embed every quote in `sentences` with the model
embeddings = model.encode(sentences)
# takes about 4 minutes to run

In [None]:
# wrap the embeddings in a DataFrame
embedding_df = pd.DataFrame(embeddings)
# NOTE: the save is commented out, so this cell does not write any file as-is;
# uncomment the next line to write the embeddings to st_embeddings.csv
# embedding_df.to_csv('st_embeddings.csv')
# preview the first rows
embedding_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.0149,-0.071977,-0.009478,-0.029031,0.014188,0.016394,0.027681,-0.037291,-0.006358,-0.004826,...,0.00111,0.012414,0.030341,-0.003418,-0.008822,0.033325,-0.001755,0.06869,0.02025,0.039753
1,-0.007164,-0.013978,-0.013652,0.003309,0.034851,0.060882,0.048588,0.017505,0.007778,-0.010418,...,0.022243,0.049582,-0.00842,-0.030691,-0.000645,-0.05014,0.083585,-0.046941,-0.02003,0.012742
2,-0.049198,-0.035306,0.008143,0.088346,0.093932,0.025349,-0.01051,-0.015539,-0.002362,-0.013975,...,0.003785,-0.010067,0.007732,-0.049809,0.05156,0.024296,0.035107,-0.066092,-0.020333,-0.009799
3,-0.04359,-0.021896,-0.019405,0.027361,-0.020507,0.03543,-0.040713,0.043625,0.065613,0.002796,...,0.009547,0.010854,-0.027765,0.049275,0.023973,0.004445,0.07236,-0.029266,-0.031327,0.004318
4,-0.053526,0.001219,0.001101,0.03138,-0.008443,0.052395,-0.043175,-0.038514,-0.012206,0.025206,...,-0.040711,0.010254,0.007623,0.042096,-0.003285,0.00207,0.003425,-0.044013,0.021388,0.001928


# Sentiment Analysis

Run this code to get the sentiment for the script data. This takes a long time to run, so I saved a copy as complete_sentiment.csv.

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
import pandas as pd
from spacy.tokens import Doc
import csv

In [None]:
# Shared VADER analyzer, created once and reused by every sentiment call.
sent_analyzer = SentimentIntensityAnalyzer()

# spaCy pipeline, loaded lazily on the first sentiment_analysis() call and then reused.
_nlp = None


def sentiment_scores(docx):
    """Return the VADER polarity scores for a document.

    Intended to be used as the getter of the ``sentimenter`` Doc extension.

    Args:
        docx: An object exposing a ``text`` attribute (e.g. a spaCy ``Doc``).

    Returns:
        Whatever ``sent_analyzer.polarity_scores`` returns for ``docx.text``.
    """
    return sent_analyzer.polarity_scores(docx.text)


def sentiment_analysis(input: str):
    """Return the compound sentiment score for a piece of text.

    The text is run through the ``en_core_web_sm`` spaCy pipeline and the
    ``'compound'`` entry of the resulting Doc's ``sentimenter`` extension is
    returned. The ``sentimenter`` extension must already be registered on
    ``Doc`` when this function is called.

    Args:
        input: The text to score.

    Returns:
        The ``'compound'`` value from the polarity scores.
    """
    global _nlp
    if _nlp is None:
        # Loading the model is expensive; do it once instead of once per quote.
        _nlp = spacy.load("en_core_web_sm")
    return _nlp(input)._.sentimenter['compound']

In [5]:
# Register the `sentimenter` extension on Doc; its value is computed by sentiment_scores.
# force=True overwrites an existing registration, so re-running this cell
# does not fail because the extension already exists.
Doc.set_extension("sentimenter", getter=sentiment_scores, force=True)

In [7]:
# demonstrate the function on a sample sentence; the cell output is its compound score
sentiment_analysis('Data science is so cool')

0.4572

In [13]:
# load the data & neaten it up
# reset_index() turns the row index into a leading column, which is named 'index' below
complete = pd.read_csv(
    'https://scmcqueen.github.io/StarTrekScriptData/complete_data.csv'
).reset_index()
complete.columns = [
    'index', 'line', 'character', 'quote', 'scene', 'location',
    'view', 'episode', 'date', 'series', 'file',
]

# collapse every run of whitespace in a quote to a single space and trim the ends
complete['quote'] = complete['quote'].map(lambda raw: " ".join(raw.split()))

# missing speakers become 'NA'; then strip double quotes and tabs from the names
complete['character'] = (
    complete['character']
    .fillna('NA')
    .map(lambda name: name.replace('"', '').replace('\t', ''))
)

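I use a sentiment mapping dictionary because it takes a long time to run the sentiment analysis. This way, quotes that are already in the map are skipped, and the map is saved to `sent_map.csv` as the run progresses so the work can be resumed later.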

In [None]:
# open the current sentiment map
# (stored as a one-row CSV: the header holds the quote indices, the row holds the scores)
try:
    with open("sent_map.csv", mode="r", newline="") as map_file:
        # take the single data row; an empty file has no row, so fall back to {}
        sent_data = next(csv.DictReader(map_file), {})
except FileNotFoundError:
    # first run: there is no saved map yet, so start from an empty one
    sent_data = {}

# csv yields strings; convert keys back to int indices and values to float scores
sent_data = {int(k): float(v) for k, v in sent_data.items()}

In [21]:
# scratch check of the % (modulo) operator; the value is displayed but not assigned to anything
8%6

2

In [None]:
def save_sentiment_map(sent_map, path="sent_map.csv"):
    """Write the sentiment map to ``path`` as a one-row CSV.

    The header row holds the map's keys and the single data row holds the
    corresponding values, which is the layout the loading cell reads back.

    Args:
        sent_map: Dict mapping quote index to sentiment score.
        path: Destination file; overwritten on every call.
    """
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, sent_map.keys())
        writer.writeheader()
        writer.writerow(sent_map)


# for each index, check if it is in the sentiment map. If not, calculate it
SAVE_EVERY = 100  # checkpoint after this many newly scored quotes
counter = 0       # number of quotes scored during this run

# 'index' comes from reset_index() on a freshly read frame, so it equals the row
# position; pairing the two columns directly gives the same quote that
# complete.iloc[x]['quote'] would, without building a full row per quote
for x, quote in zip(complete['index'].tolist(), complete['quote'].tolist()):
    # skip quotes that are already mapped
    if x in sent_data:
        continue

    # get the sentiment for this quote
    sent_data[x] = sentiment_analysis(quote)
    counter += 1

    # checkpoint periodically so an interrupted run can be resumed
    if counter % SAVE_EVERY == 0:
        print(f"scored {counter} new quotes (last index {x})")
        save_sentiment_map(sent_data)

# once we are done, save it
save_sentiment_map(sent_data)

<class 'int'>


In [None]:
# look up every row's score in the sentiment map by its 'index' value
# (a missing index raises KeyError), then write the combined table to disk
complete['sentiment'] = complete['index'].map(sent_data.__getitem__)
complete.to_csv('complete_sentiment.csv')