<a href="https://colab.research.google.com/github/rsidorchuk93/sentiment/blob/main/Messages_Sentiment_analysis_with_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment analysis of messages using pretrained Large Language Models - RoBERTa and BERT

In [None]:
# installing package dependencies
pip install transformers

In [None]:
pip install sentencepiece

In [2]:
# importing libraries
import numpy as np
import pandas as pd
from scipy.special import expit
import re

from transformers import pipeline
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

In [3]:
# Multilingual RoBERTa sentiment model trained on Twitter data.
# The same checkpoint id is used for both the model weights and the tokenizer.
SENTIMENT_CHECKPOINT = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model=SENTIMENT_CHECKPOINT,
    tokenizer=SENTIMENT_CHECKPOINT,
)

In [4]:
# Smoke test on a clearly positive sentence; the result is a list with one {'label', 'score'} dict
sentiment_analyzer("I love this!")

[{'label': 'positive', 'score': 0.9384338855743408}]

In [5]:
# RoBERTa-based emotion recognizer: 6 emotions + neutral, trained on multiple datasets.
# top_k=1 keeps only the highest-scoring emotion together with its score.
emotions_6_basic = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=1,
)

In [6]:
# Smoke test; note that the result is a list containing a one-element list of {'label', 'score'} dicts
emotions_6_basic("I love this!")

[[{'label': 'joy', 'score': 0.9771687984466553}]]

In [7]:
# Distilled BERT emotion recognizer with granular emotions, based on Reddit data.
# top_k=1 keeps only the highest-scoring emotion together with its score.
emotions_granular = pipeline(
    task="text-classification",
    model="joeddav/distilbert-base-uncased-go-emotions-student",
    top_k=1,
)

In [8]:
# Smoke test; the top-1 score of this model is low for this sentence (about 0.18 in the recorded output)
emotions_granular("I love this!")

[[{'label': 'excitement', 'score': 0.17616184055805206}]]

In [9]:
# Tweet topic model (multi-label topic classification), followed by a one-sentence example.
MODEL = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# PyTorch ("PT") variant of the model; id2label maps a class index to its topic name
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
class_mapping = model.config.id2label

text = "It is great to see athletes promoting awareness for climate change."
tokens = tokenizer(text, return_tensors='pt')
output = model(**tokens)

# Scores of the first (only) input, squashed through a sigmoid independently per class
scores = expit(output[0][0].detach().numpy())
# 0/1 vector marking every class whose score reaches 0.5
predictions = (scores >= 0.5) * 1

# Only the single best class is reported: its index, its name and its score
max_idx = np.argmax(scores)
category = class_mapping[max_idx]
score = scores[max_idx]

# Show the predicted category and its score
print(f"Category: {category}, Score: {score}")

Category: sports, Score: 0.9753940105438232


In [10]:
# Read the exported WhatsApp chat into a list of strings, one entry per line of the file
# (a message that spans several lines therefore produces several entries).
# The chat file can be downloaded from your WhatsApp chat.
# The chat contains emoji and other non-ASCII characters, so the encoding is passed explicitly
# instead of relying on the platform default (exports are expected to be UTF-8 — confirm).
with open('_chat_friend.txt', 'r', encoding='utf-8') as f:
    messages = f.readlines()

In [11]:
# Split each raw line into timestamp, sender and message text.
# Expected line shape: "[<timestamp>] <sender>: <message>".
# The timestamp and sender groups are non-greedy, so they stop at the FIRST "] " and the
# FIRST ": ". With greedy groups a message such as "Note: call me" was cut at its last ": "
# and the beginning of the text ended up in the sender column.
# Lines that do not match the pattern (for example the continuation lines of a multi-line
# message) are skipped.
message_pattern = re.compile(r'^\[(.+?)\] (.+?): (.+)$')
data = []
for message in messages:
    match = message_pattern.match(message)
    if match:
        timestamp, sender, message_text = match.groups()
        data.append([timestamp, sender, message_text])

In [12]:
# Build a pandas DataFrame from the parsed rows, one column per parsed field.
column_names = ['timestamp', 'sender', 'message']
df = pd.DataFrame(data=data, columns=column_names)

In [13]:
# Expand the maximum displayed column width to 300 so that more of each message is visible.
# The full option name is used: abbreviated names are resolved by pattern matching and can
# become ambiguous if pandas introduces options with similar names.
pd.set_option('display.max_colwidth', 300)
# Peek at the last three parsed messages
df.tail(3)

Unnamed: 0,timestamp,sender,message
370,"12/31/22, 18:21:08",Friend,Happy new year Roman
371,"12/31/22, 18:21:25",Roman,Thank you 🙏
372,"2/5/23, 19:50:12",Friend,‎‎Your security code with Friend changed. ‎Tap to learn more.


In [23]:
# Keep only the longer messages; word counting is done by the helper below
def count_words(message):
    """Return the number of whitespace-separated words in `message`."""
    return len(message.split())

# A message is kept only when it has strictly more words than this threshold
MIN_WORDS = 10

# Count the words of every entry in the 'message' column and turn the counts into a Boolean mask
mask = df['message'].apply(count_words) > MIN_WORDS

# Keep the rows selected by the mask
filtered_df = df[mask]

# Show the first rows of the filtered DataFrame
filtered_df.head(5)

Unnamed: 0,timestamp,sender,message
0,"5/21/22, 14:20:25",Friend,"‎Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them."
15,"5/25/22, 18:13:15",Roman,What time you finish barbecue tonight? I will be in Tesoro area
19,"5/25/22, 18:14:52",Roman,I might be either by myself or with a friend or date
24,"5/25/22, 20:55:39",Roman,How is barbecue going? I’ll come with a friend in 15-20
30,"5/26/22, 15:29:39",Roman,I‘ll text you next time I’m in Tesoro area to pick it up


In [24]:
# Calculate sentiment and sentiment score for the messages
def sentiment_analysis(row):
    """Classify the sentiment of a single message row.

    Parameters
    ----------
    row : mapping with a 'message' entry (e.g. one DataFrame row)

    Returns
    -------
    pd.Series with 'sentiment' (predicted label) and 'score_score' (score of that label).
    """
    text = row['message']
    # Truncate over-long messages instead of letting them overflow the model's input limit.
    # max_length is given explicitly so truncation does not depend on the tokenizer config;
    # 512 is the usual limit of BERT/RoBERTa-style checkpoints — TODO confirm for this one.
    result = sentiment_analyzer(text, truncation=True, max_length=512)
    top = result[0]
    return pd.Series({'sentiment': top['label'], 'score_score': top['score']})

# Run the sentiment classifier on every row and attach the results as new columns.
sentiment_df = filtered_df.apply(sentiment_analysis, axis=1)
# Drop result columns left over from an earlier run of this cell before attaching the fresh
# ones; a plain concat would append duplicate columns every time the cell is re-run.
filtered_df = filtered_df.drop(columns=sentiment_df.columns, errors='ignore').join(sentiment_df)

In [25]:
# Emotion analysis with the basic emotions, applied to one row of the DataFrame
def classify_emotion(row):
    """Classify the top basic emotion of a single message row.

    Parameters
    ----------
    row : mapping with a 'message' entry (e.g. one DataFrame row)

    Returns
    -------
    pd.Series with 'top_emotion' (predicted label) and 'score_top_emotion' (its score).
    """
    text = row['message']
    # Truncate over-long messages instead of letting them overflow the model's input limit.
    # max_length is given explicitly so truncation does not depend on the tokenizer config;
    # 512 is the usual limit of BERT/RoBERTa-style checkpoints — TODO confirm for this one.
    result = emotions_6_basic(text, truncation=True, max_length=512)
    # The pipeline result is nested: first input -> first (top) prediction
    top = result[0][0]
    return pd.Series({'top_emotion': top['label'], 'score_top_emotion': top['score']})

# Run the basic-emotion classifier on every row and attach the results as new columns.
emotion_df = filtered_df.apply(classify_emotion, axis=1)
# Drop result columns left over from an earlier run of this cell before attaching the fresh
# ones; a plain concat would append duplicate columns every time the cell is re-run.
filtered_df = filtered_df.drop(columns=emotion_df.columns, errors='ignore').join(emotion_df)

In [26]:
# Granular emotion analysis, applied to one row of the DataFrame
def classify_granular_emotion(row):
    """Classify the top granular emotion of a single message row.

    Parameters
    ----------
    row : mapping with a 'message' entry (e.g. one DataFrame row)

    Returns
    -------
    pd.Series with 'top_emotion_granular' (predicted label) and
    'score_top_emotion_granular' (its score).
    """
    text = row['message']
    # Truncate over-long messages instead of letting them overflow the model's input limit.
    # max_length is given explicitly so truncation does not depend on the tokenizer config;
    # 512 is the usual limit of BERT-style checkpoints — TODO confirm for this one.
    result = emotions_granular(text, truncation=True, max_length=512)
    # The pipeline result is nested: first input -> first (top) prediction
    top = result[0][0]
    return pd.Series({'top_emotion_granular': top['label'], 'score_top_emotion_granular': top['score']})

# Run the granular-emotion classifier on every row and attach the results as new columns.
emotion_granular_df = filtered_df.apply(classify_granular_emotion, axis=1)
# Drop result columns left over from an earlier run of this cell before attaching the fresh
# ones; a plain concat would append duplicate columns every time the cell is re-run.
filtered_df = filtered_df.drop(columns=emotion_granular_df.columns, errors='ignore').join(emotion_granular_df)

In [27]:
# Topic classification, applied to one row of the DataFrame
def classify_topic(row):
    """Return the highest-scoring topic of a single message row.

    Parameters
    ----------
    row : mapping with a 'message' entry (e.g. one DataFrame row)

    Returns
    -------
    pd.Series with 'top_cat' (topic name looked up in `class_mapping`) and
    'score_top_cat' (sigmoid score of that topic).
    """
    text = row['message']
    # Truncate over-long messages instead of letting them overflow the model's input limit.
    # 512 is the usual limit of RoBERTa-style checkpoints — TODO confirm for this one.
    tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    output = model(**tokens)
    # Scores of the first (only) input, squashed through a sigmoid independently per class
    scores = output[0][0].detach().numpy()
    scores = expit(scores)
    # Only the single best class is reported
    max_idx = np.argmax(scores)
    category = class_mapping[max_idx]
    score = scores[max_idx]
    return pd.Series({'top_cat': category, 'score_top_cat': score})
    
# Run the topic classifier on every row and attach the results as new columns.
topic_df = filtered_df.apply(classify_topic, axis=1)
# Drop result columns left over from an earlier run of this cell before attaching the fresh
# ones, so that a re-run replaces them instead of adding '_topic'-suffixed duplicates.
filtered_df = filtered_df.drop(columns=topic_df.columns, errors='ignore').join(topic_df)

In [29]:
# Keep only the messages whose top basic emotion is not 'neutral' and whose
# score for that emotion is at least 0.4
is_emotional = filtered_df['top_emotion'].ne('neutral')
is_confident = filtered_df['score_top_emotion'].ge(0.4)
filtered_df2 = filtered_df.loc[is_emotional & is_confident]
filtered_df2.head(5)

Unnamed: 0,timestamp,sender,message,sentiment,score_score,top_emotion,score_top_emotion,top_emotion_granular,score_top_emotion_granular,top_cat,score_top_cat
19,"5/25/22, 18:14:52",Roman,I might be either by myself or with a friend or date,neutral,0.724858,joy,0.864997,realization,0.101312,diaries_&_daily_life,0.887935
32,"5/26/22, 15:30:12",Roman,"Thank you again for hosting the barbecue, I enjoyed it a lot",positive,0.942568,joy,0.97919,gratitude,0.352599,food_&_dining,0.877189
62,"6/7/22, 21:13:36",Friend,Thanks for info but i’m already celebrating today “special” day :D,positive,0.833026,joy,0.835772,gratitude,0.234464,diaries_&_daily_life,0.944799
65,"6/7/22, 21:15:16",Roman,I have a birthday on Thursday and then fly back to Miami,neutral,0.745027,joy,0.888197,excitement,0.10641,travel_&_adventure,0.826996
81,"6/8/22, 15:17:18",Roman,"I’m flying back to Miami Fri evening, will start a new job next week and then will feel comfortable there might come back in the summer",positive,0.63637,joy,0.988632,optimism,0.160265,travel_&_adventure,0.83076


# Insights
*   Each message was classified by sentiment, by emotion - the 6 major emotions (+ neutral) and granular emotions - and by message topic

