# Sentiment analysis

For the sentiment analysis, we tried out several different models and pre-processing pipelines. In particular, for comments or descriptions embedded in the lines, such as [laughing] or [to camera], we compared several handling methods to see which one gave the best sentiment-analysis score.

We mainly used pre-trained sentiment analysis models, and tested their accuracy by comparing the predicted sentiment with the sentiment labels we assigned ourselves in the annotated sample (of 300 lines).

## 1. Pre-processing pipeline

In [64]:
import pandas as pd
# read the raw lines from the CSV file into a DataFrame
df = pd.read_csv("The_Office_lines.csv")

In [65]:
# columns that are used in the rest of the analysis
relevant_columns = ["id","speaker", "line_text"]
# restrict the DataFrame to those columns
df = df[relevant_columns]

In [170]:
import re

# deals with descriptions in lines, e.g. [laughs] or [to camera]
def deal_with_description(line, mode):
    """Handle bracketed descriptions such as ``[laughs]`` or ``[to camera]``.

    Args:
        line: The raw line of dialogue (a string).
        mode: How the descriptions are treated:

            * ``"remove"`` -- delete every ``[...]`` span.
            * ``"end"`` -- move the bracketed texts, comma-separated, behind
              the spoken text.
            * ``"start"`` -- move the bracketed texts, comma-separated, in
              front of the spoken text.
            * ``"keep"`` -- leave the text in place and only strip the bracket
              characters ``( ) [ ] { }``.

            Any other value returns the line unchanged.

    Returns:
        The transformed line.
    """
    bracketed = r"\[(.*?)\]"

    if mode == "remove":
        # remove text that is between brackets
        return re.sub(bracketed, "", line)

    if mode == "keep":
        # remove all brackets from the line but keep text in place
        return re.sub(r"[\([{})\]]", "", line)

    if mode in ("end", "start"):
        # spoken text without the descriptions, trimmed so that a removed
        # leading/trailing description leaves no dangling whitespace
        spoken = re.sub(bracketed, "", line).strip()
        descriptions = ", ".join(re.findall(bracketed, line))
        if mode == "end":
            ordered = (spoken, descriptions)
        else:
            ordered = (descriptions, spoken)
        # join only the non-empty parts: a line without descriptions (or
        # without spoken text) must not gain a stray separator space
        return " ".join(part for part in ordered if part)

    return line

def preprocess_sentiment(df, relevant_columns, description_mode):
    """Select the relevant columns and pre-process the ``line_text`` column.

    Args:
        df: DataFrame holding at least the columns in ``relevant_columns``.
        relevant_columns: List of column names to keep; must include
            ``"line_text"``.
        description_mode: Mode passed on to ``deal_with_description`` for
            every line.

    Returns:
        A new DataFrame with only ``relevant_columns``, whose ``line_text``
        values have been run through ``deal_with_description``. The input
        DataFrame is left untouched.
    """
    # take an explicit copy of the selection: assigning a column below then
    # acts on an independent frame instead of on a selection of the caller's
    # DataFrame (the pattern that triggers SettingWithCopyWarning)
    df = df[relevant_columns].copy()
    # deal with descriptions in lines
    df["line_text"] = df["line_text"].apply(
        lambda text: deal_with_description(text, mode=description_mode)
    )

    return df

In [171]:
# run the pre-processing in "keep" mode: description text stays in place, only the bracket characters are stripped
df_filtered = preprocess_sentiment(df, relevant_columns, description_mode="keep")

## 2. Sentiment analysis

I first applied the sentiment analysis only to the sample labeled by us, and then applied the best-performing combination of pipeline and model to the whole dataset.

#### Function to extract ids that have been annotated by us:

In [172]:
def annotated_ids():
    """Load the three annotators' sample files and return the labelled rows.

    Returns:
        A DataFrame with the rows of all three sample files (stacked in the
        order Luuk, Shantanu, Eline, each keeping its original index) whose
        "Sentiment" cell is not empty.
    """
    sample_paths = (
        "annotated_data/sample_Luuk.csv",
        "annotated_data/sample_Shantanu.csv",
        "annotated_data/sample_Eline.csv",
    )

    # stack the individual annotation files row-wise
    stacked = pd.concat([pd.read_csv(path) for path in sample_paths], axis=0)

    # keep only the rows that have something in the "Sentiment" column
    has_label = stacked["Sentiment"].notna()
    return stacked[has_label]

### Function to test the accuracy of the sentiment analysis

In [173]:
# import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# translating strings of sentiment to integers
trans_dict = {
    "NEGATIVE": -1,
    "POSITIVE": 1
}

# extract predicted values from dataframe
def extract_ypred(df, source_column,  write=True, target_column="temp"):
    """Turn the stored model output into an array of integer labels.

    Each cell of ``df[source_column]`` is indexed as ``cell[0]["label"]`` and
    that label string is mapped to an integer through ``trans_dict``.

    Args:
        df: DataFrame holding the model output.
        source_column: Name of the column with the model output.
        write: If true, the integer labels are also stored in
            ``df[target_column]``. If false, ``df`` is left unchanged.
        target_column: Name of the column written when ``write`` is true.

    Returns:
        The integer labels as an array (``Series.values``).

    Raises:
        KeyError: If a label is not a key of ``trans_dict``.
    """
    labels = df[source_column].apply(lambda x: trans_dict[x[0]["label"]])
    # only touch the caller's DataFrame when asked to; writing first and
    # dropping from a rebound local afterwards would leave the column behind
    if write:
        df[target_column] = labels
    return labels.values

def result_score(Y_val, Y_pred, binary=False):
    """Print accuracy, precision, recall and F1 of the predictions.

    Precision, recall and F1 are macro-averaged.

    Args:
        Y_val: Array-like of true labels.
        Y_pred: Array-like of predicted labels.
        binary: If true, neutral labels (0) in ``Y_val`` are counted as
            positive (1) before scoring. The caller's ``Y_val`` is not
            modified.

    Returns:
        None; the metrics are printed.
    """
    # replace neutral values with positive
    if binary:
        import numpy as np

        # work on a private copy: Y_val may be a view on a DataFrame column,
        # and rewriting it in place would alter the annotations themselves
        Y_val = np.array(Y_val)
        Y_val[Y_val == 0] = 1

    # calculate metrics
    accuracy = accuracy_score(Y_val, Y_pred)
    precision = precision_score(Y_val, Y_pred, average="macro")
    recall = recall_score(Y_val, Y_pred, average="macro")
    f1 = f1_score(Y_val, Y_pred, average="macro")

    # print results
    print(f"Accuracy: {accuracy}\
          \nPrecision: {precision}\
          \nRecall: {recall}\
          \nF1: {f1}")

### Function to fit sentiment analysis model

In [174]:
# find current time
import time
#supress SettingWithCopyWarning
pd.options.mode.chained_assignment = None

def fit_sentiment(df_filtered, method, name, progress=True):
    """Apply a sentiment model to every line and store the raw output.

    Args:
        df_filtered: DataFrame with a ``"line_text"`` column. It is modified
            in place: a column ``name`` is added (or overwritten).
        method: Callable that takes one line of text and returns the model
            output for it.
        name: Name of the column that receives the model output.
        progress: If true, print a progress indicator while running.

    Returns:
        None.
    """
    if progress:
        print(f"Fit sentiment analysis {name}")

    total = len(df_filtered)
    outputs = []
    # count by position (1-based) rather than by index label, so the progress
    # is correct for any index and ends at 100%
    for position, text in enumerate(df_filtered["line_text"], start=1):
        outputs.append(method(text))
        if progress:
            print(f"sample {position} out of {total}. {round(position / total * 100, 2)}%", end='\x1b[1K\r')

    # assign the whole column at once instead of cell by cell through chained
    # indexing, which is not guaranteed to write into the DataFrame; the
    # explicit object dtype keeps each model output as a single cell value
    df_filtered[name] = pd.Series(outputs, index=df_filtered.index, dtype=object)

### Sentiment analysis models

In [175]:
# import first pre-trained sentiment analysis pipeline
from transformers import pipeline
# build a transformers "sentiment-analysis" pipeline backed by the siebert/sentiment-roberta-large-english model
# NOTE(review): later code reads its output as x[0]["label"] and maps "NEGATIVE"/"POSITIVE" -- presumably the pipeline returns a list with one dict per input; confirm
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")

For testing, we currently use only the lines that have been annotated by us.

In [178]:
# take only annotated lines
df_filtered = annotated_ids()
# reset index
df_filtered = df_filtered.reset_index(drop=True)

In [179]:
# run the model on the "line_text" of every annotated row; the raw output goes into the "sentiment_analysis" column
fit_sentiment(df_filtered, sentiment_analysis, "sentiment_analysis")

Fit sentiment analysis sentiment_analysis
sample 216 out of 217. 99.54%[1K

In [166]:
# get values of Y_val
Y_val = df_filtered["Sentiment"].values

y_pred = extract_ypred(df_filtered, "sentiment_analysis", write=True, target_column="pred_sentiment_label")
result_score(Y_val, y_pred, binary=True)

Accuracy: 0.6359447004608295          
Precision: 0.6626650660264106          
Recall: 0.7210955710955711          
F1: 0.6188157338847753
