In [1]:
!pip install datasets pandas torch openpyxl banglanltk bangla-stemmer gensim pyLDAvis

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting banglanltk
  Downloading banglanltk-0.0.4-py3-none-any.whl.metadata (3.9 kB)
Collecting bangla-stemmer
  Downloading bangla_stemmer-1.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading datasets-3.2.0-py3-none-any

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from datasets import Dataset
import numpy as np
import wandb
from bangla_stemmer.stemmer import stemmer
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models

In [3]:
# Load the dataset
df = pd.read_excel("dataset_rv.xlsx")
stopwords_df = pd.read_excel('stopwords_bangla.xlsx')

stopwords_df = stopwords_df[['words']]

# Map string labels to numerical labels
label_map = {"outrage": 0, "despair": 1, "hope": 2}
df["Sentiment"] = df["Sentiment"].map(label_map)

# Rename the column to text
df.rename(columns={'Headline':'text', 'Sentiment':'labels'}, inplace=True)
stopwords_df.head()

  and should_run_async(code)


Unnamed: 0,words
0,‡¶Ö‡¶á
1,‡¶Ö‡¶ó‡¶§‡ßç‡¶Ø‡¶æ
2,‡¶Ö‡¶§: ‡¶™‡¶∞
3,‡¶Ö‡¶§‡¶è‡¶¨
4,‡¶Ö‡¶•‡¶ö


In [4]:
def english_to_bangla_number_text(number):
    """Converts an English number to Bangla textual representation."""
    bangla_numbers = {
        0: "‡¶∂‡ßÇ‡¶®‡ßç‡¶Ø", 1: "‡¶è‡¶ï", 2: "‡¶¶‡ßÅ‡¶á", 3: "‡¶§‡¶ø‡¶®", 4: "‡¶ö‡¶æ‡¶∞",
        5: "‡¶™‡¶æ‡¶Å‡¶ö", 6: "‡¶õ‡¶Ø‡¶º", 7: "‡¶∏‡¶æ‡¶§", 8: "‡¶Ü‡¶ü", 9: "‡¶®‡¶Ø‡¶º",
        10: "‡¶¶‡¶∂", 11: "‡¶è‡¶ó‡¶æ‡¶∞‡ßã", 12: "‡¶¨‡¶æ‡¶∞‡ßã", 13: "‡¶§‡ßá‡¶∞‡ßã", 14: "‡¶ö‡ßå‡¶¶‡ßç‡¶¶",
        15: "‡¶™‡¶®‡ßá‡¶∞‡ßã", 16: "‡¶∑‡ßã‡¶≤‡ßã", 17: "‡¶∏‡¶§‡ßá‡¶∞‡ßã", 18: "‡¶Ü‡¶†‡¶æ‡¶∞‡ßã", 19: "‡¶â‡¶®‡¶ø‡¶∂",
        20: "‡¶¨‡¶ø‡¶∂", 21: "‡¶è‡¶ï‡ßÅ‡¶∂", 22: "‡¶¨‡¶æ‡¶á‡¶∂", 23: "‡¶§‡ßá‡¶á‡¶∂", 24: "‡¶ö‡¶¨‡ßç‡¶¨‡¶ø‡¶∂",
        25: "‡¶™‡¶Å‡¶ö‡¶ø‡¶∂", 26: "‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂", 27: "‡¶∏‡¶æ‡¶§‡¶æ‡¶∂", 28: "‡¶Ü‡¶ü‡¶æ‡¶∂", 29: "‡¶ä‡¶®‡¶§‡ßç‡¶∞‡¶ø‡¶∂",
        30: "‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 31: "‡¶è‡¶ï‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 32: "‡¶¨‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 33: "‡¶§‡ßá‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 34: "‡¶ö‡ßå‡¶§‡ßç‡¶∞‡¶ø‡¶∂",
        35: "‡¶™‡¶Å‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 36: "‡¶õ‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 37: "‡¶∏‡¶æ‡¶Å‡¶á‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 38: "‡¶Ü‡¶ü‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 39: "‡¶ä‡¶®‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂",
        40: "‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 41: "‡¶è‡¶ï‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 42: "‡¶¨‡¶ø‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 43: "‡¶§‡ßá‡¶§‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 44: "‡¶ö‡ßÅ‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂",
        45: "‡¶™‡¶Å‡¶á‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 46: "‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 47: "‡¶∏‡¶æ‡¶§‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 48: "‡¶Ü‡¶ü‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 49: "‡¶ä‡¶®‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶∂",
        50: "‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶∂", 51: "‡¶è‡¶ï‡¶æ‡¶®‡ßç‡¶®", 52: "‡¶¨‡¶æ‡¶π‡¶æ‡¶®‡ßç‡¶®", 53: "‡¶§‡¶ø‡¶™‡ßç‡¶™‡¶æ‡¶®‡ßç‡¶®", 54: "‡¶ö‡ßÅ‡¶Ø‡¶º‡¶æ‡¶®‡ßç‡¶®",
        55: "‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶®‡ßç‡¶®", 56: "‡¶õ‡¶æ‡¶™‡ßç‡¶™‡¶æ‡¶®‡ßç‡¶®", 57: "‡¶∏‡¶æ‡¶§‡¶æ‡¶®‡ßç‡¶®", 58: "‡¶Ü‡¶ü‡¶æ‡¶®‡ßç‡¶®", 59: "‡¶ä‡¶®‡¶∑‡¶æ‡¶ü",
        60: "‡¶∑‡¶æ‡¶ü", 61: "‡¶è‡¶ï‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 62: "‡¶¨‡¶æ‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 63: "‡¶§‡ßá‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 64: "‡¶ö‡ßå‡¶∑‡¶ü‡ßç‡¶ü‡¶ø",
        65: "‡¶™‡¶Å‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 66: "‡¶õ‡ßá‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 67: "‡¶∏‡¶æ‡¶§‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 68: "‡¶Ü‡¶ü‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 69: "‡¶ä‡¶®‡¶∏‡¶§‡ßç‡¶§‡¶∞",
        70: "‡¶∏‡¶§‡ßç‡¶§‡¶∞", 71: "‡¶è‡¶ï‡¶æ‡¶§‡ßç‡¶§‡¶∞", 72: "‡¶¨‡¶æ‡¶π‡¶æ‡¶§‡ßç‡¶§‡¶∞", 73: "‡¶§‡¶ø‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞", 74: "‡¶ö‡ßÅ‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞",
        75: "‡¶™‡¶Å‡¶ö‡¶æ‡¶§‡ßç‡¶§‡¶∞", 76: "‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞", 77: "‡¶∏‡¶æ‡¶§‡¶æ‡¶§‡ßç‡¶§‡¶∞", 78: "‡¶Ü‡¶ü‡¶æ‡¶§‡ßç‡¶§‡¶∞", 79: "‡¶ä‡¶®‡¶Ü‡¶∂‡¶ø",
        80: "‡¶Ü‡¶∂‡¶ø", 81: "‡¶è‡¶ï‡¶æ‡¶∂‡¶ø", 82: "‡¶¨‡¶ø‡¶∞‡¶æ‡¶∂‡¶ø", 83: "‡¶§‡¶ø‡¶∞‡¶æ‡¶∂‡¶ø", 84: "‡¶ö‡ßÅ‡¶∞‡¶æ‡¶∂‡¶ø",
        85: "‡¶™‡¶Å‡¶ö‡¶æ‡¶∂‡¶ø", 86: "‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶∂‡¶ø", 87: "‡¶∏‡¶æ‡¶§‡¶æ‡¶∂‡¶ø", 88: "‡¶Ü‡¶ü‡¶æ‡¶∂‡¶ø", 89: "‡¶ä‡¶®‡¶®‡¶¨‡ßç‡¶¨‡¶á",
        90: "‡¶®‡¶¨‡ßç‡¶¨‡¶á", 91: "‡¶è‡¶ï‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 92: "‡¶¨‡¶ø‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 93: "‡¶§‡¶ø‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 94: "‡¶ö‡ßÅ‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á",
        95: "‡¶™‡¶Å‡¶ö‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 96: "‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 97: "‡¶∏‡¶æ‡¶§‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 98: "‡¶Ü‡¶ü‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 99: "‡¶®‡¶ø‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á",
    }

    def convert_two_digit_number(n):
        """Handles numbers from 0 to 99 based on direct mapping."""
        if n in bangla_numbers:
            return bangla_numbers[n]
        tens = (n // 10) * 10
        units = n % 10
        return f"{bangla_numbers[tens]} {bangla_numbers[units]}" if units > 0 else bangla_numbers[tens]

    if number < 100:
        return convert_two_digit_number(number)
    elif number < 1000:
        hundreds = number // 100
        remainder = number % 100
        if remainder == 0:
            return f"{bangla_numbers[hundreds]} ‡¶∂‡¶§"
        return f"{bangla_numbers[hundreds]} ‡¶∂‡¶§ {convert_two_digit_number(remainder)}"
    elif number < 100000: #Handle numbers from 1000 to 99,999
        parts = []
        if number >= 1000:
            parts.append(f"{english_to_bangla_number_text(number // 1000)} ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞")
            number %= 1000
        if number > 0:
            if number >= 100:
              parts.append(english_to_bangla_number_text(number))
            else:
              parts.append(convert_two_digit_number(number))  #handles cases less than 100
        return " ".join(parts)
    elif number < 10000000: #handles numbers from 100,000 to 9,999,999
      parts = []
      if number >= 100000:
          parts.append(f"{english_to_bangla_number_text(number // 100000)} ‡¶≤‡¶ï‡ßç‡¶∑")
          number %= 100000
      if number > 0:
          if number >= 1000:
            parts.append(english_to_bangla_number_text(number))
          elif number > 0:
            if number >= 100:
                parts.append(english_to_bangla_number_text(number))
            else:
              parts.append(convert_two_digit_number(number)) #handle numbers less than 100
      return " ".join(parts)
    else: # Handle numbers >= 10,000,000 (Crore)
        parts = []
        if number >= 10000000:
            parts.append(f"{english_to_bangla_number_text(number // 10000000)} ‡¶ï‡ßã‡¶ü‡¶ø")
            number %= 10000000
        if number > 0:
            parts.append(english_to_bangla_number_text(number))
        return " ".join(parts)

  and should_run_async(code)


In [5]:
import re

def text_to_word_list(text):
    text = text.split()
    return text

def replace_strings(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\u2600-\u26FF"          # miscellaneous symbols
                               u"\u2700-\u27BF"          # dingbats
                               u"\u2000-\u206F"          # general punctuations
                               "]+", flags=re.UNICODE)
    english_pattern = re.compile('[a-zA-Z]+', flags=re.I)

    text = emoji_pattern.sub(r'', text)
    text = english_pattern.sub(r'', text)

    return text

def remove_punctuations(my_str):
    punctuations = '''````¬£|¬¢|√ë+-*/=EROero‡ß≥‡ß¶‡ßß‡ß®‡ß©‡ß™‡ß´‡ß¨‡ß≠‡ßÆ‡ßØ012‚Äì34567‚Ä¢89‡•§!()-[]{};:'"‚Äú\‚Äô,<>./?@#$%^&*_~‚Äò‚Äî‡••‚Äù‚Ä∞ü§£‚öΩÔ∏è‚úåÔøΩÔø∞‡ß∑Ôø∞'''
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct += char

    return no_punct

def convert_numbers_to_bangla(text):
    words = text.split()
    converted_words = []
    for word in words:
        if word.isdigit():  # Check if the word is an integer
            bangla_number = english_to_bangla_number_text(int(word))
            converted_words.append(bangla_number)
        else:
            converted_words.append(word)
    return ' '.join(converted_words)

def preprocessing(text):
    text = replace_strings(text)
    text = convert_numbers_to_bangla(text)  # Convert numbers to Bangla
    text = remove_punctuations(text)
    return text

  and should_run_async(code)


In [6]:
# Remove Stopwords
stopwords = set(stopwords_df['words'].tolist())
def stopwordRemoval(text):
    return ' '.join([word for word in text.split() if word not in stopwords])

# stemmer function
def stem_text(x):
  stmr = stemmer.BanglaStemmer()
  words=x.split(' ')
  stm = stmr.stem(words)
  words=(' ').join(stm)
  return words

  and should_run_async(code)


In [7]:
df['text'] =df['text'].apply(lambda x: preprocessing(str(x)))
df.head()

  and should_run_async(code)


Unnamed: 0,text,labels
0,‡¶¢‡¶æ‡¶ï‡¶æ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶¨‡¶ø‡¶¶‡ßç‡¶Ø‡¶æ‡¶≤‡ßü ‡¶ï‡ßã‡¶ü‡¶æ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶®‡ßá‡¶§‡¶æ‡¶ï‡ßá...,0
1,‡¶ï‡ßã‡¶ü‡¶æ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá ‡¶¢‡¶æ‡¶ï‡¶æ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶¨‡¶ø‡¶¶‡ßç‡¶Ø‡¶æ‡¶≤‡ßü‡ßá‡¶∞ ‡¶¨‡¶ø‡¶è‡¶®‡¶™...,2
2,‡¶ï‡ßã‡¶ü‡¶æ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶Ü‡¶ú‡¶ì ‡¶ú‡¶ø‡¶∞‡ßã ‡¶™‡ßü‡ßá‡¶®‡ßç‡¶ü ‡¶Ö‡¶¨‡¶∞‡ßã‡¶ß ‡¶ï‡¶∞‡ßá ...,0
3,‡¶∏‡¶∞‡ßç‡¶¨‡¶ú‡¶®‡ßÄ‡¶® ‡¶™‡ßá‡¶®‡¶∂‡¶® ‡¶∏‡¶∞‡¶ï‡¶æ‡¶∞ ‡¶Ö‡¶®‡ßú ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶ö‡¶æ‡¶≤‡¶ø‡ßü‡ßá ‡¶Ø‡¶æ‡¶¨‡ßá‡¶® ...,0
4,‡¶Ü‡¶ú ‡¶∂‡¶®‡¶ø‡¶¨‡¶æ‡¶∞ ‡¶∏‡¶ï‡¶æ‡¶≤ ‡¶∏‡¶æ‡ßú‡ßá ‡¶ü‡¶æ‡¶∞ ‡¶¶‡¶ø‡¶ï‡ßá ‡¶Æ‡¶π‡¶æ‡¶∏‡ßú‡¶ï‡ßá‡¶∞ ‡¶∂‡¶π‡¶∞ ‡¶¨‡¶æ‡¶á‡¶™...,0


In [8]:
df['text'] =df['text'].apply(lambda x: stopwordRemoval(str(x))) # remove stopwords
df['text'] =df['text'].apply(lambda x: stem_text(str(x))) # stem the text
texts = df['text'].tolist()


  and should_run_async(code)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
applied fourth rules..
applied first rules..
applied first rules..
applied fourth rules..
applied first rules..
applied fourth rules..
applied first rules..
applied third rules..
applied first rules..
applied fourth rules..
applied first rules..
applied second rules..
applied fourth rules..
applied fourth rules..
applied fourth rules..
applied first rules..
applied fourth rules..
applied second rules..
applied fourth rules..
applied second rules..
applied fourth rules..
applied fourth rules..
applied first rules..
applied fourth rules..
applied first rules..
applied fourth rules..
applied fourth rules..
applied fourth rules..
applied first rules..
applied fourth rules..
applied second rules..
applied third rules..
applied first rules..
applied second rules..
applied first rules..
applied fourth rules..
applied fourth rules..
applied first rules..
applied fourth rules..
applied fourth rules..
applied first rules..
applied 

In [9]:
# Tokenize
def tokenized_data(sent):
    tokenized_text = sent.split()
    return tokenized_text

texts = df['text'].tolist()
df['text'] = [tokenized_data(sent) for sent in texts]

  and should_run_async(code)


In [10]:
#Create dictionary
dictionary = Dictionary(df['text'])

  and should_run_async(code)


In [11]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.get_topics()):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

def calculate_topic_diversity(model, num_topics):
    """Calculates topic diversity."""
    topic_terms = []
    for topic_id in range(num_topics):
        topic_terms.append(set([word for word in model.show_topic(topic_id, topn=10)])) #10 from top 10 words
    if len(topic_terms) <= 1:
       return 0  #Diversity is 0 if less than 2 topics
    else:
      return len(set.union(*topic_terms)) / (num_topics * 10)

def run_topic_modeling(data, num_topics, description):
    print(f"\n--- Topic Modeling for: {description} ---")

    # Create Corpus (Bag of Words)
    corpus = [dictionary.doc2bow(text) for text in data]

    # LDA
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)

    # Display topics
    display_topics(lda_model, list(dictionary.token2id.keys()), 10) # Pass dictionary words as features

    # Calculate coherence
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['text'], dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence Score: {coherence_lda:.4f}")


    # Calculate Perplexity
    perplexity = lda_model.log_perplexity(corpus)
    print(f"Perplexity: {perplexity:.4f}")

    # Calculate Log-Likelihood
    log_likelihood = lda_model.bound(corpus)
    print(f"Log-Likelihood: {log_likelihood:.4f}")

    #Calculate Topic Diversity
    topic_diversity = calculate_topic_diversity(lda_model, num_topics)
    print(f"Topic Diversity: {topic_diversity:.4f}")

    # Save the model
    model_filename = f"{description.replace(' ', '_')}_lda_model.model"
    lda_model.save(model_filename)
    print(f"LDA model saved to: {model_filename}")

    return lda_model, corpus # return the corpus and model

  and should_run_async(code)


In [12]:
data_subset = df['text'] # Full dataset model

num_topics = 10
description = "Full DataFrame"

# run topic modeling
lda_model, corpus = run_topic_modeling(data_subset, num_topics, description)

  and should_run_async(code)



--- Topic Modeling for: Full DataFrame ---
Topic 0:
‡¶ï‡¶æ‡¶≤‡ßã‡¶ü‡¶æ‡¶ï‡¶æ ‡¶¨‡¶æ‡¶∏ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶¨‡ßÅ‡¶ü ‡¶®‡¶ó‡¶∞‡¶¨‡¶æ‡¶∏‡ßÄ ‡¶ó‡ßç‡¶Ø‡¶æ‡¶∞‡ßá‡¶ú ‡¶¨‡ßá‡¶°‡¶º‡ßá ‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶ï‡ßÄ‡¶≠‡¶æ‡¶¨‡ßá ‡¶§‡ßà‡¶∞‡¶ø
Topic 1:
‡¶ú‡¶æ‡¶Æ‡¶æ‡ßü‡¶æ‡¶§ ‡¶Ø‡ßá‡¶≠‡¶æ‡¶¨‡ßá ‡¶¶‡¶æ‡ßü ‡¶¨‡¶®‡ßç‡¶Ø‡¶æ‡¶∞‡ßç‡¶§ ‡¶ó‡ßá‡¶ü ‡¶∏‡¶æ‡¶¶‡¶æ ‡¶™‡ßú‡ßá‡¶õ‡¶ø ‡¶ó‡¶§‡¶ï‡¶æ‡¶≤ ‡¶ñ‡ßÅ‡¶≤‡ßá ‡¶´‡¶æ‡¶∞‡¶æ‡¶ï‡ßç‡¶ï‡¶æ
Topic 2:
‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶Ü‡¶ì‡ßü‡¶æ‡¶Æ‡ßÄ ‡¶∏‡¶∞‡¶ï ‡¶ï‡ßã‡¶ü‡¶æ ‡¶∏‡¶∞‡¶ï‡¶æ‡¶∞‡¶ø ‡¶∞‡¶ø‡¶Æ‡¶æ‡¶®‡ßç‡¶°‡ßá ‡¶Ø‡ßÅ‡¶ó ‡¶∂‡¶´‡¶ø‡¶ï‡ßÅ‡¶∞ ‡¶∑‡ßú‡¶Ø‡¶®‡ßç‡¶§‡ßç‡¶∞ ‡¶¨‡¶¶
Topic 3:
‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá ‡¶ö‡¶æ‡¶Å‡¶¶‡¶æ ‡¶•‡¶æ‡¶ï‡¶æ ‡¶∏‡¶Æ‡ßç‡¶™ ‡¶´‡¶≤‡¶ï ‡¶ü‡ßÅ‡¶∞‡ßç‡¶ï ‡¶®‡¶ú‡¶∞‡ßÅ‡¶≤ ‡¶Ö‡¶¨‡ßà‡¶ß ‡¶¶‡¶æ‡¶¨‡¶ø‡¶∞ ‡¶∂‡¶ø‡¶ï
Topic 4:
‡¶ö‡¶ø‡¶ï‡¶ø‡ßé‡¶∏‡¶ï ‡¶§‡¶¶‡¶®‡ßç ‡¶ï‡¶≤‡ßá‡¶ú ‡¶Æ‡ßá‡¶°‡¶ø‡¶ï‡ßá‡¶≤ ‡¶ö‡¶≤ ‡¶Æ‡¶ø‡¶∞‡ßç‡¶ú‡¶æ ‡¶´‡¶ñ‡¶∞‡ßÅ‡¶≤ ‡¶∏‡¶æ‡¶Å‡¶á‡¶§‡ßç‡¶∞‡¶ø‡¶∂ ‡¶≤‡¶ô‡ßç‡¶ò‡¶® ‡¶∑‡ßú‡¶Ø‡¶®‡ßç‡¶§‡ßç‡¶∞
Topic 5:
‡¶´‡ßá‡¶∞‡¶æ‡¶∞ ‡¶Ü‡¶π‡¶§ ‡¶§‡ßç‡¶∞‡¶æ‡¶£ ‡¶®‡¶ø

In [13]:
import gensim
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.corpora import Dictionary

# Load the saved LDA model and corpus
model_filename = "Full_DataFrame_lda_model.model"

lda_model = gensim.models.LdaModel.load(model_filename)

# Load the saved dictionary
# Create dictionary from the preprocessed text from previous cell.
dictionary = Dictionary(df['text'])

# Load the corpus - it can be regenerated from the dataset
corpus = [dictionary.doc2bow(text) for text in df['text']]



# Prepare the visualization
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Enable notebook mode and display the visualization
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

  and should_run_async(code)


In [14]:
data_subset = df[df['labels']==0]['text'] # outrage model

num_topics = 10
description = "Sentiment_outrage"

# run topic modeling
lda_model, corpus = run_topic_modeling(data_subset, num_topics, description)

  and should_run_async(code)



--- Topic Modeling for: Sentiment_outrage ---
Topic 0:
‡¶¶‡¶æ‡¶¨‡¶ø ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶¨‡¶ø‡¶¶‡ßç‡¶Ø‡¶æ‡¶≤‡ßü ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶∞‡ßç‡¶•‡ßÄ ‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡¶ï‡¶æ‡¶∞‡ßÄ ‡¶ï‡ßã‡¶ü‡¶æ ‡¶∏‡¶Ç‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶ï‡ßç‡¶∑‡ßã‡¶≠ ‡¶∏‡¶Ç‡¶ò‡¶∞‡ßç‡¶∑ ‡¶∏‡¶∞‡¶ï‡¶æ‡¶∞‡¶ø
Topic 1:
‡¶∏‡¶Ç‡¶ò‡¶∞‡ßç‡¶∑ ‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶õ‡¶æ‡¶§‡ßç‡¶∞ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡¶ï‡¶æ‡¶∞‡ßÄ ‡¶¨‡ßà‡¶∑‡¶Æ‡ßç‡¶Ø‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶π‡¶æ‡¶Æ‡¶≤‡¶æ ‡¶®‡¶æ ‡¶∏‡¶ø‡¶≤‡ßá‡¶ü ‡¶ö‡¶æ‡¶≤‡¶æ
Topic 2:
‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶∞‡ßç‡¶•‡ßÄ ‡¶¶‡¶æ‡¶¨‡¶ø ‡¶≤‡ßÄ‡¶ó ‡¶ï‡ßã‡¶ü‡¶æ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡¶ï‡¶æ‡¶∞‡ßÄ ‡¶∏‡¶Æ‡¶®‡ßç‡¶¨‡ßü‡¶ï ‡¶ï‡¶∞‡ßç‡¶Æ‡¶∏‡ßÇ‡¶ö‡¶ø ‡¶®‡¶æ ‡¶Ü‡¶ì‡ßü‡¶æ‡¶Æ‡ßÄ
Topic 3:
‡¶¶‡¶æ‡¶¨‡¶ø ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶¨‡¶ø‡¶¶‡ßç‡¶Ø‡¶æ‡¶≤‡ßü ‡¶¢‡¶æ‡¶ï‡¶æ ‡¶¨‡¶ø‡¶ï‡ßç‡¶∑‡ßã‡¶≠ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡¶ï‡¶æ‡¶∞‡ßÄ ‡¶Ü‡¶ì‡ßü‡¶æ‡¶Æ‡ßÄ ‡¶≤‡ßÄ‡¶ó ‡¶è‡¶≤‡¶æ‡¶ï‡¶æ‡ßü ‡¶Æ‡¶ø‡¶õ‡¶ø‡¶≤
Topic 4:
‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶∞‡ßç‡¶•‡ßÄ ‡¶¶‡¶æ‡¶¨‡¶ø ‡¶Ö‡¶¨‡¶∞‡ßã‡¶ß ‡¶∏‡¶Ç‡¶ò‡¶∞‡ßç‡¶∑ 

In [15]:
# Load the saved LDA model and corpus
model_filename = "Sentiment_outrage_lda_model.model"


lda_model = gensim.models.LdaModel.load(model_filename)

# Load the saved dictionary (Assuming you have saved it)
# Create dictionary from the preprocessed text from previous cell.
dictionary = Dictionary(df['text'])

# Load the corpus - it can be regenerated from the dataset
corpus = [dictionary.doc2bow(text) for text in df['text']]



# Prepare the visualization
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Enable notebook mode and display the visualization
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

  and should_run_async(code)


In [16]:
data_subset = df[df['labels']==1]['text'] # despair model

num_topics = 10
description = "Sentiment_despair"

# run topic modeling
lda_model, corpus = run_topic_modeling(data_subset, num_topics, description)

  and should_run_async(code)



--- Topic Modeling for: Sentiment_despair ---
Topic 0:
‡¶¨‡¶®‡ßç‡¶Ø‡¶æ ‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶™‡¶∞‡¶ø‡¶∏‡ßç‡¶•‡¶ø‡¶§‡¶ø‡¶∞ ‡¶Ö‡¶¨‡¶®‡¶§‡¶ø ‡¶ï‡ßã‡¶ü‡¶æ ‡¶∂‡ßá‡¶ñ ‡¶´‡ßá‡¶®‡ßÄ ‡¶¨‡¶®‡ßç‡¶Ø ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶¨‡¶ø‡¶¶‡ßç‡¶Ø‡¶æ‡¶≤‡ßü
Topic 1:
‡¶®‡¶¶‡ßÄ‡¶∞ ‡¶®‡¶ø‡¶π‡¶§ ‡¶¨‡¶æ‡¶Å‡¶ß ‡¶õ‡¶æ‡¶§‡ßç‡¶∞‡¶≤‡ßÄ‡¶ó ‡¶¨‡¶®‡ßç‡¶Ø‡¶æ ‡¶π‡¶æ‡¶Æ‡¶≤‡¶æ ‡¶ó‡ßç‡¶∞‡¶æ‡¶Æ ‡¶ó‡ßã‡¶Æ‡¶§‡ßÄ ‡¶ß‡¶æ‡¶ì‡ßü‡¶æ ‡¶¨‡¶®‡ßç‡¶Ø‡¶æ‡ßü
Topic 2:
‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶®‡¶æ ‡¶∏‡¶∞‡¶ï ‡¶¨‡¶®‡ßç‡¶Ø‡¶æ ‡¶ï‡ßã‡¶ü‡¶æ ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶∞‡ßç‡¶•‡ßÄ ‡¶≤‡ßÄ‡¶ó ‡¶∏‡¶Æ‡ßü ‡¶™‡¶§‡¶® ‡¶¶‡ßá‡¶∂‡ßá‡¶∞
Topic 3:
‡¶®‡¶æ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶¨‡¶ø‡¶¶‡ßç‡¶Ø‡¶æ‡¶≤‡ßü ‡¶¢‡¶æ‡¶ï‡¶æ ‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶π‡¶æ‡¶Æ‡¶≤‡¶æ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡¶ï‡¶æ‡¶∞‡ßÄ ‡¶ï‡ßã‡¶ü‡¶æ ‡¶∂‡ßá‡¶ñ ‡¶Ü‡¶ì‡ßü‡¶æ‡¶Æ‡ßÄ ‡¶õ‡¶æ‡¶§‡ßç‡¶∞‡¶≤‡ßÄ‡¶ó
Topic 4:
‡¶¢‡¶æ‡¶ï‡¶æ ‡¶®‡¶æ ‡¶π‡¶æ‡¶Æ‡¶≤‡¶æ ‡¶¶‡ßá‡¶∂ ‡¶®‡ßá‡¶§‡¶æ‡¶ï‡¶∞‡ßç‡¶Æ‡ßÄ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡¶ï‡¶æ‡¶∞‡ßÄ ‡¶õ‡¶æ‡¶§‡ßç‡¶∞ ‡¶ï‡ßã‡¶ü‡¶æ ‡¶∂‡ßá‡¶ñ ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶∞‡ßç‡¶•‡ßÄ
Topic 5:
‡¶™‡¶æ‡¶®‡¶ø ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶

In [17]:
# Load the saved LDA model and corpus
model_filename = "Sentiment_despair_lda_model.model"


lda_model = gensim.models.LdaModel.load(model_filename)

# Load the saved dictionary (Assuming you have saved it)
# Create dictionary from the preprocessed text from previous cell.
dictionary = Dictionary(df['text'])

# Load the corpus - it can be regenerated from the dataset
corpus = [dictionary.doc2bow(text) for text in df['text']]



# Prepare the visualization
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Enable notebook mode and display the visualization
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

  and should_run_async(code)


In [18]:
data_subset = df[df['labels']==2]['text'] # hope model

num_topics = 10
description = "Sentiment_hope"

# run topic modeling
lda_model, corpus = run_topic_modeling(data_subset, num_topics, description)

  and should_run_async(code)



--- Topic Modeling for: Sentiment_hope ---
Topic 0:
‡¶â‡¶™‡¶¶‡ßá‡¶∑‡ßç ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶° ‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂ ‡¶õ‡¶æ‡¶§‡ßç‡¶∞ ‡¶¶‡¶≤ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶∂‡ßá‡¶ñ ‡¶®‡¶ø‡¶∑‡¶ø‡¶¶‡ßç‡¶ß
Topic 1:
‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶∞‡ßç‡¶•‡ßÄ ‡¶â‡¶™‡¶¶‡ßá‡¶∑‡ßç ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶¨‡¶ø‡¶∞‡ßÅ‡¶¶‡ßç‡¶ß‡ßá ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶ï‡¶•‡¶æ ‡¶Ü‡¶≤‡ßã ‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶¨‡¶∞‡ßç‡¶§‡ßÄ ‡¶ï‡¶æ‡¶∞‡ßç‡¶Ø‡¶ï‡ßç‡¶∞‡¶Æ ‡¶â‡¶¶‡ßç‡¶Ø‡ßã‡¶ó
Topic 2:
‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶â‡¶™‡¶¶‡ßá‡¶∑‡ßç ‡¶∏‡¶∞‡¶ï‡¶æ‡¶∞ ‡¶∂‡ßá‡¶ñ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶¨‡¶∞‡ßç‡¶§‡ßÄ‡¶ï‡¶æ‡¶≤‡ßÄ‡¶® ‡¶®‡¶æ ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶∞‡ßç‡¶•‡ßÄ ‡¶õ‡¶æ‡¶§‡ßç‡¶∞ ‡¶ï‡ßã‡¶ü‡¶æ
Topic 3:
‡¶∏‡¶∞‡¶ï‡¶æ‡¶∞ ‡¶®‡¶æ ‡¶° ‡¶á‡¶â‡¶®‡ßÇ‡¶∏ ‡¶â‡¶™‡¶¶‡ßá‡¶∑‡ßç ‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶¨‡¶∞‡ßç‡¶§‡ßÄ‡¶ï‡¶æ‡¶≤‡ßÄ‡¶® ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶¨‡¶ø‡¶¶‡ßç‡¶Ø‡¶æ‡¶≤‡ßü ‡¶∏‡¶ø‡¶¶‡ßç‡¶ß‡¶æ‡¶®‡ßç‡¶§ ‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶¨‡¶∞‡ßç‡¶§‡ßÄ
Topic 4:
‡¶∏‡¶∞‡¶ï‡¶æ‡¶∞ ‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶¨‡¶∞‡ßç‡¶§‡ßÄ‡¶ï‡¶æ‡¶≤‡ßÄ‡¶® ‡¶° ‡¶∂‡ßá‡¶ñ ‡¶â‡¶™‡¶¶‡ßá

In [19]:
# Load the saved LDA model and corpus
model_filename = "Sentiment_hope_lda_model.model"


lda_model = gensim.models.LdaModel.load(model_filename)

# Load the saved dictionary (Assuming you have saved it)
# Create dictionary from the preprocessed text from previous cell.
dictionary = Dictionary(df['text'])

# Load the corpus - it can be regenerated from the dataset
corpus = [dictionary.doc2bow(text) for text in df['text']]


# Prepare the visualization
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Enable notebook mode and display the visualization
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

  and should_run_async(code)
