# Sentiment analysis

For the sentiment analysis, we tried out several different models and pre-processing pipelines. Especially for dealing with comments or descriptions in the lines, like [laughing] or [to camera], we tried out different methods to see which resulted in the best score for the sentiment analysis.

We mainly used sentiment analysis based on pre-trained models, and then tested the accuracy by comparing the predicted sentiment with the sentiments given by us in the annotated sample (of 300 lines).

## 1. Pre-processing pipeline

In [2]:
import pandas as pd
df = pd.read_csv("The_Office_lines.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'The_Office_lines.csv'

In [2]:
relevant_columns = ["id","speaker", "line_text"]
df = df[relevant_columns]

In [3]:
import re

# deals with descriptions in lines, e.g. [laughs] or [to camera]
def deal_with_description(line, mode):
    if mode=="remove":
        # remove text that is between brackets
        line = re.sub(r'\[.*?\]', '', line)
    elif mode=="end":
        # move all the text that is in the brackets to the end of the line
        line = re.sub(r'\[.*?\]', '', line) + " " + ", ".join(re.findall(r"\[(.*?)\]", line))
    elif mode=="start":
        # move all the text that is in the brackets to the start of the line
        line = ", ".join(re.findall(r"\[(.*?)\]", line)) + " " + re.sub(r'\[.*?\]', '', line)
    elif mode=="keep":
        # remove all brackets from the line but keep text in place
        line = re.sub(r"[\([{})\]]", '', line)
    return line

def preprocess_sentiment(df, description_mode):
    # deal with descriptions in lines
    df_pre = df.copy()
    df_pre["line_text"] = df_pre["line_text"].apply(lambda x: deal_with_description(x, mode=description_mode))
    
    return df_pre

## 2. Sentiment analysis

I applied the sentiment analysis first only the the sample labeled by us, and then applied the best performing combination of pipeline and model to the whole dataset.

#### Function to extract ids that have been annotated by us:

In [4]:
def annotated_ids():
    df_luuk = pd.read_csv("annotated_data/sample_Luuk.csv")
    df_shan = pd.read_csv("annotated_data/sample_Shantanu.csv")
    df_elin = pd.read_csv("annotated_data/sample_Eline.csv")

    # combine annotations
    df_combined = pd.concat([df_luuk, df_shan, df_elin], axis=0)

    # filter out only columns that have something in "Sentiment" column
    df_annotated = df_combined[df_combined["Sentiment"].notna()]
    df_annotated.reset_index(drop=True, inplace=True)

    return df_annotated

### Function to test the accuracy of the sentiment analysis

In [5]:
# import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error

# translating strings of sentiment to integers
trans_dict_roberta = {
    "NEGATIVE": -1,
    "POSITIVE": 1
}

trans_dict_bert = {
    "LABEL_0": -1,
    "LABEL_1": 0,
    "LABEL_2": 1
}

# extract predicted values from dataframe
def extract_ypred(df, source_column, transdict, write=True, target_column="temp"):
    df[target_column] = df[source_column].apply(lambda x: transdict[x[0]["label"]])
    Y_pred = df[target_column].values
    if not write:
        df = df.drop(columns=[target_column])
    return Y_pred

def result_score(Y_val, Y_pred, name, binary=False):
    # make new list replacing 0 with 1 if binary
    if binary:
        Y_val_used = [1 if x==0 else x for x in Y_val]
    else:
        Y_val_used = Y_val

    # calculate metrics
    accuracy = accuracy_score(Y_val_used, Y_pred)
    precision = precision_score(Y_val_used, Y_pred, average="macro")
    recall = recall_score(Y_val_used, Y_pred, average="macro")
    f1 = f1_score(Y_val_used, Y_pred, average="macro")
    MSE = mean_squared_error(Y_val_used, Y_pred)

    # print results
    print(f"Analysis with {name}:\
          \n- - - - - - - - - - \
          \nAccuracy: {accuracy}\
          \nPrecision: {precision}\
          \nRecall: {recall}\
          \nF1: {f1}\
          \nMSE: {MSE}\n")

### Function to fit sentiment analysis model

In [6]:
# find current time
import time
#supress SettingWithCopyWarning
pd.options.mode.chained_assignment = None

def fit_sentiment(df_filt, method, name, progress=True, ret=False):
    # set start time
    start_time = time.time()

    # apply sentiment analysis to each line, track progress
    df_filtered = df_filt.copy()
    df_filtered[name] = ""

    # apply sentiment analysis to each line and track progress
    if progress:
        print(f"Fit sentiment analysis {name}")
    k = len(df_filtered)
    i = 0
    for iter, row in df_filtered.iterrows():
        df_filtered[name][iter] = method(row["line_text"])
        if progress:
            print(f"sample {iter+1} out of {k}. {round((iter+1)/k*100, 2)}%  ", end='\x1b[1K\r')
        i += 1
    if ret:
        #drop name column
        y_pred = df_filtered[name].values
        df_filtered = df_filtered.drop(columns=[name])
        return y_pred
    else:
        df_filt[name] = df_filtered[name]

### Sentiment analysis models

In [7]:
# import first pre-trained sentiment analysis pipeline
from transformers import pipeline
sentiment_analysis_roberta = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
sentiment_analysis_bert  = pipeline("sentiment-analysis",model="sbcBI/sentiment_analysis_model")
sentiment_analysis_distilbert = pipeline("sentiment-analysis",model="distilbert-base-uncased-finetuned-sst-2-english")
sentiment_analysis_bert_uncased = pipeline("sentiment-analysis",model="Seethal/sentiment_analysis_generic_dataset")

In [8]:
sentiment_analysis_distilbert("I hate you")

[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]

In [9]:
sentiment_analysis_bert_uncased("I hate you")

[{'label': 'LABEL_0', 'score': 0.9952951073646545}]

For testing, for now, take only the lines that have been annotated by us

In [10]:
# take only annotated lines
df_annotated = annotated_ids()

# preprocess the lines
df_filtered = preprocess_sentiment(df_annotated, description_mode="keep")

In [12]:
fit_sentiment(df_filtered, sentiment_analysis_roberta, "sentiment_analysis_roberta", )
fit_sentiment(df_filtered, sentiment_analysis_bert, "sentiment_analysis_bert")
fit_sentiment(df_filtered, sentiment_analysis_distilbert, "sentiment_analysis_distilbert")
fit_sentiment(df_filtered, sentiment_analysis_bert_uncased, "sentiment_analysis_bert_uncased")

Fit sentiment analysis sentiment_analysis_roberta
Fit sentiment analysis sentiment_analysis_bert
Fit sentiment analysis sentiment_analysis_distilbert
Fit sentiment analysis sentiment_analysis_bert_uncased
sample 300 out of 300. 100.0%  [1K

In [13]:
# get values of Y_val
Y_val = df_filtered["Sentiment"].values

y_pred_roberta = extract_ypred(df_filtered, "sentiment_analysis_roberta", trans_dict_roberta, write=True, target_column="pred_sentiment_label_roberta")
y_pred_bert = extract_ypred(df_filtered, "sentiment_analysis_bert", trans_dict_bert,  write=True, target_column="pred_sentiment_label_bert")
y_pred_bert_uncased = extract_ypred(df_filtered, "sentiment_analysis_bert_uncased", trans_dict_bert, write=True, target_column="pred_sentiment_label_bert_uncased")
y_pred_distilbert = extract_ypred(df_filtered, "sentiment_analysis_distilbert", trans_dict_roberta, write=True, target_column="pred_sentiment_label_distilbert")

result_score(Y_val, y_pred_roberta, "Roberta", binary=True)
result_score(Y_val, y_pred_distilbert, "DistilBERT", binary=True)
result_score(Y_val, y_pred_bert, "BERT", binary=False)
result_score(Y_val, y_pred_bert_uncased, "BERT Uncased", binary=False)

Analysis with Roberta:          
- - - - - - - - - -           
Accuracy: 0.6533333333333333          
Precision: 0.6769065424745868          
Recall: 0.7383984068553497          
F1: 0.6368038740920097          
MSE: 1.3866666666666667

Analysis with DistilBERT:          
- - - - - - - - - -           
Accuracy: 0.64          
Precision: 0.655961461260538          
Recall: 0.711001146581377          
F1: 0.6205022488755623          
MSE: 1.44

Analysis with BERT:          
- - - - - - - - - -           
Accuracy: 0.55          
Precision: 0.5586803842381936          
Recall: 0.5925873014044264          
F1: 0.5535982008995503          
MSE: 0.66

Analysis with BERT Uncased:          
- - - - - - - - - -           
Accuracy: 0.6266666666666667          
Precision: 0.6295574458267306          
Recall: 0.605125369156384          
F1: 0.6125925925925926          
MSE: 0.47333333333333333



In [14]:
df_filtered.head()

Unnamed: 0,id,speaker,line_text,Sentiment,sentiment_analysis_roberta,sentiment_analysis_bert,sentiment_analysis_distilbert,sentiment_analysis_bert_uncased,pred_sentiment_label_roberta,pred_sentiment_label_bert,pred_sentiment_label_bert_uncased,pred_sentiment_label_distilbert
0,47252,Gabe,"Ok, but once this starts, it's going to be mov...",-1.0,"[{'label': 'POSITIVE', 'score': 0.983961701393...","[{'label': 'LABEL_0', 'score': 0.7348992228507...","[{'label': 'NEGATIVE', 'score': 0.768441677093...","[{'label': 'LABEL_1', 'score': 0.9918521046638...",1,-1,0,-1
1,15710,Andy,What?,0.0,"[{'label': 'NEGATIVE', 'score': 0.997746646404...","[{'label': 'LABEL_0', 'score': 0.4067431092262...","[{'label': 'NEGATIVE', 'score': 0.993637084960...","[{'label': 'LABEL_1', 'score': 0.9106979966163...",-1,-1,0,-1
2,44150,Dwight,"Just a little announcement folks, remember, th...",1.0,"[{'label': 'POSITIVE', 'score': 0.998121678829...","[{'label': 'LABEL_0', 'score': 0.3897318840026...","[{'label': 'NEGATIVE', 'score': 0.903421878814...","[{'label': 'LABEL_0', 'score': 0.5951375961303...",1,-1,-1,-1
3,45628,Phyllis,Is it true that you're making Dwight the manager?,-1.0,"[{'label': 'NEGATIVE', 'score': 0.993801534175...","[{'label': 'LABEL_1', 'score': 0.4273286163806...","[{'label': 'POSITIVE', 'score': 0.998319804668...","[{'label': 'LABEL_1', 'score': 0.9912397265434...",-1,0,0,1
4,27785,Pam,"Oh, damn. Pam looks down at her salad They've ...",0.0,"[{'label': 'NEGATIVE', 'score': 0.999487280845...","[{'label': 'LABEL_0', 'score': 0.6776348948478...","[{'label': 'NEGATIVE', 'score': 0.991238892078...","[{'label': 'LABEL_0', 'score': 0.9609617590904...",-1,-1,-1,-1


We get a descent accuracy, but a major limitation is that the classifier predicts into two classes (either positive or negative), while we have 3 classes (positive, neutral, negative). I solved this by setting the neutral class labeled by us to positive, but this obviously reduces the accuracy of the model by a lot.

### Combining measures

In [15]:
df_compare = df_filtered[["line_text", "Sentiment", "pred_sentiment_label_roberta", "pred_sentiment_label_bert", "pred_sentiment_label_bert_uncased", "pred_sentiment_label_distilbert"]]

# rename columns
df_compare.columns = ["line_text", "Annotated", "Roberta", "Bert", "Bert_uncased", "Distilbert"]

#### Functions to combine measures

1. Voting: take the majority vote of the different sentiment analyses

In [16]:
# make a new column with the majority vote
df_compare["Majority"] = df_compare[["Roberta", "Bert", "Bert_uncased", "Distilbert"]].mode(axis=1)[0]
df_compare["Majority_minus_BERT"] = df_compare[["Roberta", "Bert_uncased", "Distilbert"]].mode(axis=1)[0]

# make a new column with the average vote
df_compare["Average"] = df_compare[["Roberta", "Bert", "Bert_uncased", "Distilbert"]].mean(axis=1)
df_compare["Average_minus_BERT"] = df_compare[["Roberta", "Bert_uncased", "Distilbert"]].mean(axis=1)

Y_pred_majority = df_compare["Majority"].values
Y_pred_majority_minus = df_compare["Majority_minus_BERT"].values
Y_pred_average = df_compare["Average"].values
Y_pred_average_minus = df_compare["Average_minus_BERT"].values

result_score(Y_val, Y_pred_majority, "Majority", binary=False)
result_score(Y_val, Y_pred_majority_minus, "Majority_minus_BERT", binary=False)

print(f"MSE for Average:\
      \n- - - - - - - - - -\
      \n{mean_squared_error(Y_val, Y_pred_average)}\n")

print(f"MSE for Average minus BERT:\
      \n- - - - - - - - - -\
      \n{mean_squared_error(Y_val, Y_pred_average_minus)}\n")

Analysis with Majority:          
- - - - - - - - - -           
Accuracy: 0.5066666666666667          
Precision: 0.5630272051840679          
Recall: 0.6010088674339743          
F1: 0.500207232201319          
MSE: 0.6933333333333334

Analysis with Majority_minus_BERT:          
- - - - - - - - - -           
Accuracy: 0.43          
Precision: 0.3011531241619737          
Recall: 0.5584697627798195          
F1: 0.3829059829059829          
MSE: 0.83

MSE for Average:      
- - - - - - - - - -      
0.39875

MSE for Average minus BERT:      
- - - - - - - - - -      
0.4288888888888889



  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
df_compare.head()

Unnamed: 0,line_text,Annotated,Roberta,Bert,Bert_uncased,Distilbert,Majority,Majority_minus_BERT,Average,Average_minus_BERT
0,"Ok, but once this starts, it's going to be mov...",-1.0,1,-1,0,-1,-1.0,-1.0,-0.25,0.0
1,What?,0.0,-1,-1,0,-1,-1.0,-1.0,-0.75,-0.666667
2,"Just a little announcement folks, remember, th...",1.0,1,-1,-1,-1,-1.0,-1.0,-0.5,-0.333333
3,Is it true that you're making Dwight the manager?,-1.0,-1,0,0,1,0.0,-1.0,0.0,0.0
4,"Oh, damn. Pam looks down at her salad They've ...",0.0,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0


### Grid search to find best combination of pipeline and model

In [18]:
# Grid search to find the best combination of pipeline and model
pipelines = ["keep", "remove", "start", "end"]
models = {
    "Roberta":sentiment_analysis_roberta,
    "Bert":sentiment_analysis_bert,
    "Bert_uncased":sentiment_analysis_bert_uncased,
    "Distilbert":sentiment_analysis_distilbert
    }
trans_dict_comb = {
    "NEGATIVE": -1,
    "POSITIVE": 1,
    "LABEL_0": -1,
    "LABEL_1": 0,
    "LABEL_2": 1
}
# make dataframe to store results
df_grid = pd.DataFrame(columns=["pipeline", "model", "Accuracy", "Precision", "Recall", "F1", "MSE"])

def grid_search(df, pipelines, models):
    row = 0
    for pipeline in pipelines:
        df_piped = preprocess_sentiment(df, description_mode=pipeline)
        for model in models:
            fit_sentiment(df_piped, models[model], f"{pipeline}, {model}")
            y_pred = extract_ypred(df_piped, f"{pipeline}, {model}", trans_dict_comb, write=False)
            if model in ["Roberta", "Disilbert"]:
                Y_val_used = [1 if x==0 else x for x in Y_val]
            else:
                Y_val_used = Y_val
            acc = accuracy_score(Y_val_used, y_pred)
            precision = precision_score(Y_val_used, y_pred, average="macro")
            recall = recall_score(Y_val_used, y_pred, average="macro")
            f1 = f1_score(Y_val_used, y_pred, average="macro")
            mse = mean_squared_error(Y_val_used, y_pred)
            df_grid.loc[row] = [pipeline, model, acc, precision, recall, f1, mse]
            row += 1

In [19]:
grid_search(df_annotated, pipelines, models)

Fit sentiment analysis keep, Roberta
Fit sentiment analysis keep, Bert1K
Fit sentiment analysis keep, Bert_uncased
Fit sentiment analysis keep, Distilbert
Fit sentiment analysis remove, Roberta


  _warn_prf(average, modifier, msg_start, len(result))


Fit sentiment analysis remove, Bert
Fit sentiment analysis remove, Bert_uncased
Fit sentiment analysis remove, Distilbert
sample 300 out of 300. 100.0%  [1K

  _warn_prf(average, modifier, msg_start, len(result))


Fit sentiment analysis start, Roberta
Fit sentiment analysis start, BertK
Fit sentiment analysis start, Bert_uncased
Fit sentiment analysis start, Distilbert
Fit sentiment analysis end, Roberta


  _warn_prf(average, modifier, msg_start, len(result))


Fit sentiment analysis end, Bert[1K
Fit sentiment analysis end, Bert_uncased
Fit sentiment analysis end, Distilbert
sample 300 out of 300. 100.0%  [1K

  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
df_grid.sort_values(by=['MSE'])

Unnamed: 0,pipeline,model,Accuracy,Precision,Recall,F1,MSE
6,remove,Bert_uncased,0.633333,0.634672,0.609723,0.617883,0.466667
2,keep,Bert_uncased,0.626667,0.629557,0.605125,0.612593,0.473333
14,end,Bert_uncased,0.623333,0.626544,0.598793,0.607373,0.476667
10,start,Bert_uncased,0.62,0.619408,0.59826,0.605123,0.49
5,remove,Bert,0.543333,0.546398,0.57892,0.54687,0.656667
13,end,Bert,0.553333,0.56123,0.594385,0.556685,0.656667
1,keep,Bert,0.55,0.55868,0.592587,0.553598,0.66
9,start,Bert,0.543333,0.551262,0.585722,0.547028,0.666667
11,start,Distilbert,0.426667,0.285481,0.551398,0.375266,0.843333
3,keep,Distilbert,0.423333,0.283643,0.547333,0.372481,0.856667


So we find that Bert_uncased is in fact the best model for this task. Even though the accuracy is slightly lower than the Roberta model, the MSE is way lower. Since we are trying to model sentiment analysis over a longer period of time, and take the average, A lower MSE is more important than a slightly higher accuracy.

### Applying best model to entire dataset

In [23]:
# read in full dataset
df_final = pd.read_csv("The_Office_lines.csv")
# preprocess line_text
df_final["preprocessed"] = df_final["line_text"].apply(lambda x: deal_with_description(x, mode="remove"))

In [26]:
# fit sentiment
fit_sentiment(df_final, sentiment_analysis_bert_uncased, "BERT_uncased_raw")

Fit sentiment analysis BERT_uncased_raw
sample 59909 out of 59909. 100.0%  [1K

In [28]:
# convert to -1, 0, 1
extract_ypred(df_final, "BERT_uncased_raw", trans_dict_comb, write=True, target_column="BERT_uncased_sentiment")

array([ 1,  0,  0, ...,  1, -1,  1])

In [32]:
df_final.head()

Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted,preprocessed,BERT_uncased_raw,BERT_uncased_sentiment
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False,All right Jim. Your quarterlies look very good...,"[{'label': 'LABEL_2', 'score': 0.9752480983734...",1
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False,"Oh, I told you. I couldn't close it. So...","[{'label': 'LABEL_1', 'score': 0.6630714535713...",0
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False,So you've come to the master for guidance? Is ...,"[{'label': 'LABEL_1', 'score': 0.9957032799720...",0
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False,"Actually, you called me in here, but yeah.","[{'label': 'LABEL_1', 'score': 0.9955984354019...",0
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False,"All right. Well, let me show you how it's done.","[{'label': 'LABEL_1', 'score': 0.9973990917205...",0


In [31]:
# save annotated file
df_final.to_csv(f"sample_{annotators[i]}.csv", index=False)

59909

In [3]:
# read in sentiment_labeled data
df_sentiment = pd.read_csv("Sentiment_labeled_data.csv")

In [10]:
#list value counts of BERT_uncased_sentiment column, make into table with percentages
df_sentiment["BERT_uncased_sentiment"].value_counts(normalize=True).to_frame()

Unnamed: 0,BERT_uncased_sentiment
0,0.583986
-1,0.216061
1,0.199953
