In [2]:
# ! pip install transformers

In [1]:
import torch

# Pick the compute device once: the first CUDA GPU when PyTorch reports one as
# available, otherwise the CPU. The choice is echoed so it shows in the cell output.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")
print("using GPU" if use_gpu else "using CPU")

using GPU


## Importing the data

In [2]:
# compile all reddit posts together

import pandas as pd
import json
import tqdm

# Location of the Reddit posts CSV, relative to the notebook's working directory.
datapath = "data/userReddit.csv"
# Load the whole file into a DataFrame; later cells read its 'content' and
# 'preprocessed_content' columns.
reddit_data = pd.read_csv(datapath)

In [3]:
reddit_data.head()

Unnamed: 0,content,userId,createdTime,sentiment,preprocessed_content
0,Hi I'm Rick Astley. Good to be back here again...,ReallyRickAstley,1697728000.0,"{'neg': 0.0, 'neu': 0.806, 'pos': 0.194, 'comp...",hi im rick astley good back my new album are w...
1,Live stream chat with my new album and upcomin...,ReallyRickAstley,1697647000.0,"{'neg': 0.06, 'neu': 0.713, 'pos': 0.227, 'com...",live stream chat new album upcoming ama hi im ...
2,Coming soon…\n,ReallyRickAstley,1697149000.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",coming soon
3,Rick Astley (me) - Never Gonna Stop [Pop]\n,ReallyRickAstley,1693502000.0,"{'neg': 0.0, 'neu': 0.726, 'pos': 0.274, 'comp...",rick astley never gonna stop pop
4,Rick Astley (me) - Dippin My Feet [Pop]\n,ReallyRickAstley,1689269000.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",rick astley dippin my feet pop


In [4]:
reddit_data.shape

(5523, 5)

In [5]:
# Texts handed to the emotion classifiers below: the 'preprocessed_content' column.
sentences = reddit_data['preprocessed_content']
# Preview the first ten entries.
sentences.head(10)

0    hi im rick astley good back my new album are w...
1    live stream chat new album upcoming ama hi im ...
2                                          coming soon
3                     rick astley never gonna stop pop
4                       rick astley dippin my feet pop
5                                    john never giving
6    its great run ive decided devote next ten year...
7                             ready waiting james gunn
8    rick astley either way chris stapleton cover f...
9    never gonna give you up 35 years old today giv...
Name: preprocessed_content, dtype: object

## Classification Model

a. SamLowe/roberta-base-go_emotions \
b. joeddav/distilbert-base-uncased-go-emotions-student

In [6]:
# SamLowe/roberta-base-go_emotions
# Emotion classifier (a): a text-classification pipeline around the RoBERTa
# GoEmotions checkpoint. No `device` argument is passed here, so the `device`
# chosen at the top of the notebook is not used by this pipeline.
# NOTE(review): per the transformers pipeline docs, top_k=None should return a
# score for every label rather than only the best one — confirm for the
# installed version.
from transformers import pipeline

rb_classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)

2023-11-13 16:47:16.622177: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
 # i in tqdm.trange(0, len(a1_conversations)):

# Classify every post with the RoBERTa GoEmotions pipeline.
#   rb_outputs - first element of each successful pipeline result, in input order
#   rb_errors  - 0-based positions (within `sentences`) of posts the pipeline
#                failed on; the recorded run warned about an input longer than
#                the model's 512-token maximum.
rb_outputs = []
rb_errors = []
for idx, sent in enumerate(sentences):
    try:
        rb_output = rb_classifier(sent)
        rb_outputs.append(rb_output[0])
    except Exception:
        # Best-effort: remember the failing position and carry on. Catching
        # Exception (rather than a bare `except:`) lets KeyboardInterrupt and
        # SystemExit stop this long-running loop instead of being logged as a
        # failed post.
        rb_errors.append(idx)

Token indices sequence length is longer than the specified maximum sequence length for this model (6638 > 512). Running this sequence through the model will result in indexing errors


In [8]:
# Serialize the JSON object to a file
# Persist the RoBERTa run as JSON: the per-post outputs (pretty-printed with a
# 4-space indent) and the list of failed positions (compact, default formatting).
for payload, target, dump_options in (
    (rb_outputs, 'User_Reddit_Classification_Results/rb_outputs.json', {'indent': 4}),
    (rb_errors, 'User_Reddit_Classification_Results/rb_errors.json', {}),
):
    with open(target, 'w') as outfile:
        json.dump(payload, outfile, **dump_options)

In [9]:
# joeddav/distilbert-base-uncased-go-emotions-student
# Emotion classifier (b): a text-classification pipeline around the DistilBERT
# GoEmotions student checkpoint. No `device` argument is passed here, so the
# `device` chosen at the top of the notebook is not used by this pipeline.
# NOTE(review): per the transformers pipeline docs, top_k=None should return a
# score for every label rather than only the best one — confirm for the
# installed version.
from transformers import pipeline

db_classifier = pipeline(task="text-classification", model="joeddav/distilbert-base-uncased-go-emotions-student", top_k=None)

In [10]:
# Classify every post with the DistilBERT GoEmotions pipeline.
#   db_outputs - first element of each successful pipeline result, in input order
#   db_errors  - 0-based positions (within `sentences`) of posts the pipeline
#                failed on; the recorded run warned about an input longer than
#                the model's 512-token maximum.
db_outputs = []
db_errors = []
for idx, sent in enumerate(sentences):
    try:
        db_output = db_classifier(sent)
        db_outputs.append(db_output[0])
    except Exception:
        # Best-effort: remember the failing position and carry on. Catching
        # Exception (rather than a bare `except:`) lets KeyboardInterrupt and
        # SystemExit stop this long-running loop instead of being logged as a
        # failed post.
        db_errors.append(idx)

Token indices sequence length is longer than the specified maximum sequence length for this model (5797 > 512). Running this sequence through the model will result in indexing errors


In [11]:
# Serialize the JSON object to a file
# Persist the DistilBERT run as JSON: the per-post outputs (pretty-printed with a
# 4-space indent) and the list of failed positions (compact, default formatting).
for payload, target, dump_options in (
    (db_outputs, 'User_Reddit_Classification_Results/db_outputs.json', {'indent': 4}),
    (db_errors, 'User_Reddit_Classification_Results/db_errors.json', {}),
):
    with open(target, 'w') as outfile:
        json.dump(payload, outfile, **dump_options)

In [12]:
len(rb_outputs), len(rb_errors), len(db_outputs), len(db_errors), 

(5495, 28, 5491, 32)

## Topic Modelling

In [6]:
# !pip install bertopic

In [38]:
# Remove duplicated rows from reddit_data, modifying the frame in place.
# NOTE(review): in-place mutation means any earlier cell that displayed
# reddit_data now shows a stale view; no reset_index follows, so index labels
# presumably keep gaps where rows were dropped — verify if positions matter.
reddit_data.drop_duplicates(inplace=True)

In [39]:
import nltk
import re
nltk.download('stopwords')
# Define a function to remove stop words
def remove_stop_words(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, every token found in ``stop_words`` is
    dropped, and the surviving tokens are joined back with single spaces.
    Matching is exact and case-sensitive.

    Args:
        text: String to filter.
        stop_words: Optional collection of tokens to drop. When omitted, NLTK's
            English stop-word list is used; it is loaded once and cached on the
            function object so that applying this function row by row does not
            reload the list for every row.

    Returns:
        The filtered text as a single space-separated string.
    """
    if stop_words is None:
        stop_words = getattr(remove_stop_words, "_english_stop_words", None)
        if stop_words is None:
            # A frozenset gives constant-time membership tests; the original
            # scanned a freshly loaded list once per word.
            stop_words = frozenset(nltk.corpus.stopwords.words('english'))
            remove_stop_words._english_stop_words = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

# Define a function to lowercase the text
def lowercase_text(text):
    """Return ``text`` converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

# Remove emojis & punctuation
def replace_emojis(text):
    """Strip every character that is neither a word character nor whitespace.

    The input is first converted with ``str()``, so non-string values are
    stringified rather than rejected (``None`` becomes ``"None"``). Despite the
    name, punctuation is removed along with emojis and other symbols.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', str(text))


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/msaxena4/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
# Clean the text in three passes: strip punctuation/emojis, lowercase, then drop
# stop words. Lowercasing must come before stop-word removal because that
# removal is an exact, case-sensitive token match; in the previous order a
# capitalised stop word ("The", "And") was not matched and was merely lowercased
# afterwards, leaving it in the output.
# NOTE(review): this relies on the stop-word list itself being lowercase — confirm.
reddit_data['preprocessed_content1'] = (
    reddit_data['preprocessed_content']
    .apply(replace_emojis)
    .apply(lowercase_text)
    .apply(remove_stop_words)
)


In [41]:
def drop_rows_with_empty_text(df):
    """Return the rows of ``df`` whose 'content' value is present and non-empty.

    Rows with a missing 'content' are discarded first; of the remainder, only
    rows whose 'content' has a string length of at least 1 are kept. The
    filtered frame is returned; the caller's ``df`` is not reassigned.
    """
    with_content = df.dropna(subset=['content'])
    has_text = with_content['content'].str.len() >= 1
    return with_content[has_text]

In [42]:
# Keep only rows whose 'content' is present and non-empty; reddit_data is rebound
# to the filtered frame, so the unfiltered frame is no longer reachable by name.
reddit_data = drop_rows_with_empty_text(reddit_data)

In [43]:
# Input for the topic model: the first 1000 entries of the raw 'content' column.
# The reddit_data.head() call that used to precede this line was removed: it was
# not the cell's last expression, so its result was computed and then discarded.
# NOTE(review): this rebinds `sentences`, which the classification cells bound to
# 'preprocessed_content' — later cells see whichever binding ran last.
sentences = reddit_data['content'][:1000]

In [44]:
from bertopic import BERTopic
# topic_model = BERTopic.load("heyitskim1912/TopicModelling")

# Fit a BERTopic model on `sentences` and keep both results of fit_transform.
# NOTE(review): presumably `topics` holds one topic id per document and `probs`
# the per-document topic probabilities enabled by calculate_probabilities=True —
# confirm against the BERTopic docs.
# NOTE(review): no random seed is set in this notebook, so the topics may differ
# from run to run — confirm whether reproducibility matters here.
# NOTE(review): the topics printed below are dominated by stop words ("the",
# "to", "and"); the model is fed the raw 'content' column, not the cleaned
# 'preprocessed_content1' column built earlier.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(sentences)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

2023-11-13 15:49:48,327 - BERTopic - Transformed documents to Embeddings
2023-11-13 15:49:53,250 - BERTopic - Reduced dimensionality
2023-11-13 15:49:53,304 - BERTopic - Clustered reduced embeddings


In [45]:
freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,223,-1_the_to_and_it,"[the, to, and, it, is, this, in, of, do, an]",[[LW] Ethical savescumming; a how-to.\nI thoug...
1,0,203,0_the_and_it_to,"[the, and, it, to, in, that, of, they, is, was]",[i found a game online similar to not pron tha...
2,1,109,1_dollars_albums_coins_x200b,"[dollars, albums, coins, x200b, type, dansco, ...","[[WTB] Odd, Foreign, Out of Print, or Custom D..."
3,2,73,2_rick_astley_video_my,"[rick, astley, video, my, pop, me, love, you, ...",[Rick Astley - Love This Christmas (Official V...
4,3,64,3_in_ernesto_she_who,"[in, ernesto, she, who, liberals, dog, her, to...",[Retirement account beneficiary to children vs...


In [46]:
topic_model.get_topic(0)  # Select the most frequent topic

[('the', 0.03780199106289492),
 ('and', 0.031439268293961),
 ('it', 0.03069503266552148),
 ('to', 0.02749363310180649),
 ('in', 0.026807278280429808),
 ('that', 0.025279121559162458),
 ('of', 0.023042890068609736),
 ('they', 0.02218742148748981),
 ('is', 0.020874491143482696),
 ('was', 0.020087955815640036)]

## transformers-interpret

In [49]:
# ! pip install tqdm

In [6]:
# Load the GoEmotions CSV and show its column names; in the cells visible here the
# frame is used only for this inspection (the emotion label names appear among
# the columns).
goemotions1 = pd.read_csv("data/goemotions_1.csv")
goemotions1.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [7]:
# The 28 emotion labels, in the order in which they appear among the GoEmotions
# CSV columns. Per-emotion score/count columns are created from this list.
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]
len(emotions)

28

In [8]:
# Rebind `sentences` to the 'preprocessed_content' column for the attribution run.
# NOTE(review): the topic-modelling section bound `sentences` to a 1000-entry slice
# of 'content'; the attribution loop silently uses whichever binding ran last, so
# this cell has to be executed before it.
sentences = reddit_data['preprocessed_content']

In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers_interpret import SequenceClassificationExplainer


2023-11-16 14:25:54.155892: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
def sequenceclassification(model, tokenizer, sentence):
    """Explain the model's prediction for a single sentence.

    Builds a fresh ``SequenceClassificationExplainer`` from ``model`` and
    ``tokenizer``, runs it on ``sentence`` and returns a 2-tuple
    ``(word_attributions, label)``: the explainer's result for the sentence and
    the explainer's ``predicted_class_name`` after that run.
    """
    explainer = SequenceClassificationExplainer(model, tokenizer)
    attributions = explainer(sentence)
    return attributions, explainer.predicted_class_name

In [11]:
# Column store for the per-word attribution table: a 'word' list plus, for each
# emotion, a '<emotion>_score' list followed by a '<emotion>_count' list. Every
# list starts empty; the key order here becomes the column order of the table.
word_attributions_data = {'word': []}
for emotion_name in emotions:
    for suffix in ("_score", "_count"):
        word_attributions_data[emotion_name + suffix] = []

In [12]:
from tqdm import tqdm

# Aggregate word attributions over all sentences.
#
# For every sentence the explainer yields (word, score) items and a predicted
# label. Each distinct word owns one row in word_attributions_data; a word's
# score is added to, and its count incremented in, the columns of the label
# predicted for the sentence it occurred in. All other emotion columns of a new
# row start at 0.0 / 0.
#
# NOTE(review): word_attributions_data is filled cumulatively, so re-running this
# cell without re-running the cell that creates the empty lists double-counts.
model_name = "SamLowe/roberta-base-go_emotions"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 0-based positions (within `sentences`) of sentences that could not be
# explained. This matches the convention of rb_errors / db_errors; the previous
# version recorded 1-based positions because its counter was incremented first.
rb_ti_errors = []     # roberta-base transformer interpret

# word -> row position. Replaces the `in` / `.index()` scans over the growing
# word list, which were linear per token and made the loop quadratic overall.
# setdefault keeps the first position, which is what `.index()` returned.
word_to_row = {}
for row, known_word in enumerate(word_attributions_data['word']):
    word_to_row.setdefault(known_word, row)

for position, sentence in enumerate(tqdm(sentences, desc="Processing sentences", unit="sentence")):
    try:
        word_attributions, label = sequenceclassification(model, tokenizer, sentence)
        # Resolve the label's columns before touching any list: an unexpected
        # label raises KeyError here, while every column still has equal length.
        label_scores = word_attributions_data[label + "_score"]
        label_counts = word_attributions_data[label + "_count"]
        for item in word_attributions:
            word = item[0]
            score = item[1]
            row = word_to_row.get(word)
            if row is None:
                # First sighting: open a zero-filled row across all columns.
                row = len(word_attributions_data['word'])
                word_to_row[word] = row
                word_attributions_data['word'].append(word)
                for emotion in emotions:
                    word_attributions_data[emotion + "_score"].append(0.0)
                    word_attributions_data[emotion + "_count"].append(0)
            label_scores[row] += score
            label_counts[row] += 1
    except Exception:
        # Best-effort: record the failing sentence and continue. Catching
        # Exception (not a bare `except:`) keeps KeyboardInterrupt able to stop
        # this long-running loop.
        rb_ti_errors.append(position)

Processing sentences:   1%|▏         | 71/5523 [00:33<3:13:32,  2.13s/sentence]Token indices sequence length is longer than the specified maximum sequence length for this model (6636 > 512). Running this sequence through the model will result in indexing errors
Processing sentences: 100%|██████████| 5523/5523 [1:22:09<00:00,  1.12sentence/s]  


In [13]:
# Turn the column lists into a table: one row per entry of the 'word' list, with a
# '<emotion>_score' and '<emotion>_count' column for every emotion.
word_attributions_df = pd.DataFrame(word_attributions_data)
# word_attributions_df

Unnamed: 0,word,admiration_score,admiration_count,amusement_score,amusement_count,anger_score,anger_count,annoyance_score,annoyance_count,approval_score,...,relief_score,relief_count,remorse_score,remorse_count,sadness_score,sadness_count,surprise_score,surprise_count,neutral_score,neutral_count
0,<s>,0.000000,281,0.000000,113,0.000000,55,0.000000,83,0.000000,...,0.0,0,0.000000,45,0.000000,155,0.000000,37,0.000000,3405
1,hi,0.002015,2,0.035689,2,0.000000,0,0.000000,0,0.000000,...,0.0,0,0.471796,4,-0.019609,6,0.000000,0,1.322971,46
2,im,3.178083,51,1.657004,97,-0.002995,11,4.864147,69,0.624838,...,0.0,0,0.775257,33,5.088185,231,0.931619,18,-28.446197,439
3,r,-0.347111,21,0.080260,11,0.008351,1,0.266318,10,0.000000,...,0.0,0,0.177270,4,0.081016,8,0.090215,2,-1.290583,317
4,ick,-0.104215,4,0.000000,0,0.000000,0,0.000000,0,0.000000,...,0.0,0,0.000000,0,0.005500,1,0.000000,0,-0.787130,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13968,¿,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,...,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
13969,ı,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,...,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
13970,Ë,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,...,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
13971,¢,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,...,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0


In [59]:
# word_attributions_df['admiration_score'].sort_values(ascending = False, inplace = False)

1. Drop rows with 0 attribution values
2. Word cloud for each emotion - is there a better way to analyse 28 word clouds?
3. Bar plot for each emotion - number of words contributing
4. Example words - plot
5. Make a tree-like structure for each emotion - Venn diagram of the top 10 words in each emotion

In [34]:
# Unconditionally switch `device` to the CPU, overriding the GPU/CPU choice made
# at the top of the notebook, then echo and display it.
device = torch.device("cpu")  # always the CPU here, whether or not a GPU exists
print("using CPU")
device

using CPU


device(type='cpu')

In [35]:
import pandas as pd

# Destination of the aggregated word-attribution table, relative to the
# notebook's working directory.
file_path = 'data/word_attributions_rb.csv'

# Write the table as CSV; index=False leaves the row index out of the file.
word_attributions_df.to_csv(file_path, index=False)


In [101]:
# for k, v in top_10_words.items():
#     print(k)
#     print(v)