# Sentiment analysis

For the sentiment analysis, we tried out several different models and pre-processing pipelines. In particular, for comments or descriptions embedded in the lines, such as [laughing] or [to camera], we tried several ways of handling them to see which one gave the best sentiment-analysis score.

We mainly used sentiment analysis based on pre-trained models, and then measured the accuracy by comparing the predicted sentiment with the sentiment labels we assigned ourselves in the annotated sample (of 300 lines).

## 1. Pre-processing pipeline

In [3]:
import pandas as pd
# load the lines dataset; the cells below use its "id", "speaker" and "line_text" columns
df = pd.read_csv("The_Office_lines.csv")

In [4]:
# columns that are kept for the sentiment analysis; everything else is dropped
relevant_columns = ["id","speaker", "line_text"]
df = df[relevant_columns]

In [5]:
import re

# deals with descriptions in lines, e.g. [laughs] or [to camera]
def deal_with_description(line, mode):
    """Handle bracketed descriptions such as [laughs] inside a spoken line.

    Args:
        line: the text of the line.
        mode: how descriptions are treated:
            "remove" - drop every [ ... ] segment;
            "end"    - drop the segments and append their texts (comma
                       separated) after the line;
            "start"  - drop the segments and put their texts (comma
                       separated) in front of the line;
            "keep"   - leave the description text where it is but delete
                       all (, ), [, ], { and } characters.
            Any other value returns the line untouched.

    Returns:
        The transformed line. Whitespace around removed segments is not
        collapsed.
    """
    if mode == "keep":
        # strip every kind of bracket character, keep the words
        return re.sub(r"[\([{})\]]", '', line)

    if mode not in ("remove", "end", "start"):
        return line

    # the line without any [ ... ] segments
    spoken = re.sub(r'\[.*?\]', '', line)
    if mode == "remove":
        return spoken

    # texts found inside the square brackets, in order of appearance
    descriptions = ", ".join(re.findall(r"\[(.*?)\]", line))
    if mode == "end":
        return spoken + " " + descriptions
    return descriptions + " " + spoken

def preprocess_sentiment(df, relevant_columns, description_mode):
    """Select the relevant columns and normalise descriptions in "line_text".

    Args:
        df: DataFrame that contains at least the columns listed in
            ``relevant_columns``, including "line_text".
        relevant_columns: list of column names to keep.
        description_mode: mode passed on to ``deal_with_description``
            ("remove", "end", "start" or "keep").

    Returns:
        A new DataFrame with only ``relevant_columns``; the frame passed in
        is left unchanged.
    """
    # take an explicit copy of the column selection: writing to a column of
    # the bare selection is what triggers pandas' SettingWithCopyWarning
    df = df[relevant_columns].copy()
    # deal with descriptions in lines
    df["line_text"] = df["line_text"].apply(
        lambda x: deal_with_description(x, mode=description_mode)
    )

    return df

In [6]:
# "keep" mode: bracket characters are stripped, the description text stays in place
df_processed = preprocess_sentiment(df, relevant_columns, description_mode="keep")

## 2. Sentiment analysis

I first applied the sentiment analysis only to the sample labeled by us, and then applied the best-performing combination of pipeline and model to the whole dataset.

#### Function to extract ids that have been annotated by us:

In [7]:
def annotated_ids():
    """Load the three annotation samples and return the annotated rows.

    Reads the per-annotator CSV files from the ``annotated_data`` folder,
    stacks them on top of each other and keeps only the rows whose
    "Sentiment" cell is filled in.

    Returns:
        DataFrame with the annotated rows. The index is taken over from the
        individual files, so it can contain duplicate labels.
    """
    sample_paths = (
        "annotated_data/sample_Luuk.csv",
        "annotated_data/sample_Shantanu.csv",
        "annotated_data/sample_Eline.csv",
    )

    # one frame per annotator, stacked row-wise
    stacked = pd.concat([pd.read_csv(path) for path in sample_paths], axis=0)

    # rows without a value in "Sentiment" were not annotated
    is_annotated = stacked["Sentiment"].notna()
    return stacked[is_annotated]

### Function to test the accuracy of the sentiment analysis

In [8]:
# import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# translating strings of sentiment to integers
# (-1 = negative, 0 = neutral, 1 = positive, the same coding as the "Sentiment" annotations)

# two-label output; used below for the RoBERTa and DistilBERT pipelines
trans_dict_roberta = {
    "NEGATIVE": -1,
    "POSITIVE": 1
}

# three-label output; used below for both BERT pipelines
trans_dict_bert = {
    "LABEL_0": -1,
    "LABEL_1": 0,
    "LABEL_2": 1
}

# extract predicted values from dataframe
def extract_ypred(df, source_column, transdict, write=True, target_column="temp"):
    """Translate raw model output into integer sentiment labels.

    Args:
        df: DataFrame holding the raw model output.
        source_column: column whose cells can be indexed as
            ``cell[0]["label"]`` to obtain the predicted label string.
        transdict: mapping from label string to integer sentiment.
        write: if True, the translated labels are also stored in ``df``
            under ``target_column``; if False, ``df`` is left untouched.
        target_column: name of the column written when ``write`` is True.

    Returns:
        Array with one translated label per row of ``df``.

    Raises:
        KeyError: if a predicted label is missing from ``transdict``.
    """
    labels = df[source_column].apply(lambda x: transdict[x[0]["label"]])
    # only touch the caller's frame when asked to; previously the column was
    # always written and the "drop" for write=False acted on a local copy only
    if write:
        df[target_column] = labels
    return labels.values

def result_score(Y_val, Y_pred, name, binary=False):
    """Print accuracy, precision, recall and F1 of a set of predictions.

    Args:
        Y_val: true labels.
        Y_pred: predicted labels, in the same order as ``Y_val``.
        name: model name shown in the report header.
        binary: if True, every 0 in ``Y_val`` is counted as 1 before
            scoring, for models that cannot predict the 0 class.

    Returns:
        None; the report is printed.
    """
    if binary:
        # fold label 0 into label 1 so the truth uses the same two classes
        Y_val = [x if x != 0 else 1 for x in Y_val]

    accuracy = accuracy_score(Y_val, Y_pred)
    # precision, recall and F1 are macro-averaged over the classes
    precision, recall, f1 = (
        metric(Y_val, Y_pred, average="macro")
        for metric in (precision_score, recall_score, f1_score)
    )

    # report
    print(f"Analysis with {name}:\
          \n- - - - - - - - - - \
          \nAccuracy: {accuracy}\
          \nPrecision: {precision}\
          \nRecall: {recall}\
          \nF1: {f1}\n")

### Function to fit sentiment analysis model

In [43]:
# find current time
import time
#supress SettingWithCopyWarning
pd.options.mode.chained_assignment = None

def fit_sentiment(df_filtered, method, name, progress=True):
    """Apply a sentiment model to every line and store the raw output.

    Args:
        df_filtered: DataFrame with a "line_text" column. It is modified in
            place: the column ``name`` is added (or overwritten) with the
            value returned by ``method`` for each row.
        method: callable that takes the text of one line and returns the
            model output for it.
        name: name of the result column; also shown in the progress header.
        progress: if True, print a header and a progress line that is
            rewritten for every processed sample.

    Returns:
        None.
    """
    if progress:
        print(f"Fit sentiment analysis {name}")

    k = len(df_filtered)
    results = []
    # count by position (1-based) rather than by index label, so the progress
    # display is right for any index
    for position, text in enumerate(df_filtered["line_text"], start=1):
        results.append(method(text))
        if progress:
            # percentage is position / k; the escape sequence plus \r makes
            # the next message overwrite this one
            print(f"sample {position} out of {k}. {round(position / k * 100, 2)}%", end='\x1b[1K\r')

    # assign the whole column at once instead of cell-by-cell chained
    # assignment, which pandas does not guarantee to write through;
    # dtype=object keeps each model output as a single cell value
    df_filtered[name] = pd.Series(results, index=df_filtered.index, dtype=object)

### Sentiment analysis models

In [1]:
# import first pre-trained sentiment analysis pipeline
from transformers import pipeline
# the four pre-trained models that are compared below; each pipeline is called
# with the text of a line and returns a list like [{'label': ..., 'score': ...}]
sentiment_analysis_roberta = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
sentiment_analysis_bert  = pipeline("sentiment-analysis",model="sbcBI/sentiment_analysis_model")
sentiment_analysis_distilbert = pipeline("sentiment-analysis",model="distilbert-base-uncased-finetuned-sst-2-english")
sentiment_analysis_bert_uncased = pipeline("sentiment-analysis",model="Seethal/sentiment_analysis_generic_dataset")

In [36]:
# quick check of the output format: this model labels with 'NEGATIVE' / 'POSITIVE' strings
sentiment_analysis_distilbert("I hate you")

[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]

In [39]:
# quick check of the output format: this model labels with 'LABEL_n' strings
sentiment_analysis_bert_uncased("I hate you")

[{'label': 'LABEL_0', 'score': 0.9952951073646545}]

For testing, we use only the lines that have been annotated by us for now.

In [10]:
# keep just the annotated lines and renumber the rows 0..n-1
# (the index labels carried over from the separate sample files are discarded)
df_filtered = annotated_ids().reset_index(drop=True)

In [45]:
# run every model on the annotated sample; each call stores the raw model
# output in a column of df_filtered named after the model
fit_sentiment(df_filtered, sentiment_analysis_roberta, "sentiment_analysis_roberta")
fit_sentiment(df_filtered, sentiment_analysis_bert, "sentiment_analysis_bert")
fit_sentiment(df_filtered, sentiment_analysis_distilbert, "sentiment_analysis_distilbert")
fit_sentiment(df_filtered, sentiment_analysis_bert_uncased, "sentiment_analysis_bert_uncased")

Fit sentiment analysis sentiment_analysis_roberta
Fit sentiment analysis sentiment_analysis_bert
Fit sentiment analysis sentiment_analysis_distilbert
Fit sentiment analysis sentiment_analysis_bert_uncased
sample 217 out of 217. 216.46%[1K

In [48]:
# get values of Y_val
# (the labels from our own annotation, used as ground truth)
Y_val = df_filtered["Sentiment"].values

# translate each model's raw output into integer labels; with write=True the
# labels are also stored in df_filtered under the given target_column
y_pred_roberta = extract_ypred(df_filtered, "sentiment_analysis_roberta", trans_dict_roberta, write=True, target_column="pred_sentiment_label_roberta")
y_pred_bert = extract_ypred(df_filtered, "sentiment_analysis_bert", trans_dict_bert,  write=True, target_column="pred_sentiment_label_bert")
y_pred_bert_uncased = extract_ypred(df_filtered, "sentiment_analysis_bert_uncased", trans_dict_bert, write=True, target_column="pred_sentiment_label_bert_uncased")
y_pred_distilbert = extract_ypred(df_filtered, "sentiment_analysis_distilbert", trans_dict_roberta, write=True, target_column="pred_sentiment_label_distilbert")

# RoBERTa and DistilBERT are translated with the two-label dict, so they are
# scored with binary=True (annotated 0 is counted as 1); the two BERT models
# are translated with the three-label dict and scored on the labels as annotated
result_score(Y_val, y_pred_roberta, "Roberta", binary=True)
result_score(Y_val, y_pred_distilbert, "DistilBERT", binary=True)
result_score(Y_val, y_pred_bert, "BERT", binary=False)
result_score(Y_val, y_pred_bert_uncased, "BERT Uncased", binary=False)

Analysis with Roberta:          
- - - - - - - - - -           
Accuracy: 0.6359447004608295          
Precision: 0.6626650660264106          
Recall: 0.7210955710955711          
F1: 0.6188157338847753

Analysis with DistilBERT:          
- - - - - - - - - -           
Accuracy: 0.6129032258064516          
Precision: 0.636748844375963          
Recall: 0.6861888111888111          
F1: 0.593850267379679

Analysis with BERT:          
- - - - - - - - - -           
Accuracy: 0.5668202764976958          
Precision: 0.5795834989383376          
Recall: 0.633154960981048          
F1: 0.5713713425978206

Analysis with BERT Uncased:          
- - - - - - - - - -           
Accuracy: 0.6221198156682027          
Precision: 0.605795265792266          
Recall: 0.6024191750278707          
F1: 0.602829144934408



We get a decent accuracy, but a major limitation is that the classifier predicts only two classes (either positive or negative), while we have 3 classes (positive, neutral, negative). I solved this by setting the neutral class labeled by us to positive, but this obviously reduces the accuracy of the model by a lot.