In [8]:
import pandas as pd
from gensim import models, corpora
from langdetect import detect
from nltk.tokenize import word_tokenize
import emoji
import re
import numpy as np
import ast
from nltk.stem import WordNetLemmatizer
import spacy
import gensim
from gensim import corpora


**STEPS**

1) Removing links, emojis, punctuation, numbers and missing values, and converting the text to lowercase

2) Language detection

3) Tokenization

4) Removal of Stop Words (finding libraries for all languages)

5) Lemmatization

-----    Preprocessing is over

6) LDA Topic Modeling

In [2]:
# Read the exported chat log and keep the raw "message" column as a plain
# Python list; all later preprocessing steps start from this list.
data = pd.read_csv("a_chat.csv")
message_column = data['message']
my_list = message_column.to_list()

**Removing links, emojis, punctuation, numbers and missing values**

In [3]:
# Clean every raw message:
#   * skip missing values (checked with pd.isna, so a message whose text is
#     literally "nan" is no longer discarded by accident),
#   * strip http/https URLs while keeping the text before AND after them,
#   * convert emojis to their textual names with emoji.demojize
#     (e.g. ":red_heart:"); the ":" and "_" delimiters are then removed with
#     the rest of the punctuation, so the emoji name survives as plain words,
#   * replace digits and the listed punctuation characters with spaces,
#   * collapse runs of whitespace and lowercase the result,
#   * drop messages that end up empty or a single character long.
URL_PATTERN = re.compile(r"https?://\S+")
NOISE_PATTERN = re.compile(r"[0-9.,!?:;\-='\"@#_]")

cleaned_list = []
for x in my_list:
    if pd.isna(x):  # missing message, nothing to clean
        continue
    x = URL_PATTERN.sub(" ", str(x))
    x = NOISE_PATTERN.sub(" ", emoji.demojize(x))
    x = " ".join(x.split()).lower()
    if len(x) > 1:
        cleaned_list.append(x)


**Language Detection**

In [None]:
# Language detection: a language is declared dominant when it accounts for
# more than 50% of the messages that were actually inspected.
from collections import Counter

# Inspect every 100th message (about 1% of the chat) to keep detection fast;
# set to 1 to inspect every message.
LANG_SAMPLE_STEP = 100


def detect_lang(text):
    """Return the language code that `detect` assigns to `text`.

    Returns the string 'unknown' when detection raises, so one undetectable
    message does not abort the whole pass.
    """
    try:
        return detect(text)
    except Exception:  # e.g. text with no usable features; lets KeyboardInterrupt through
        return 'unknown'


# detect the language of an evenly spaced subset of the messages
lang_list = [detect_lang(x) for x in cleaned_list[::LANG_SAMPLE_STEP]]
# count number of inspected messages in each language
lang_count = Counter(lang_list)

if lang_list:
    most_common_lang = lang_count.most_common(1)[0][0]
    # share is relative to the inspected messages, not to the full list
    most_common_lang_percentage = lang_count[most_common_lang] / len(lang_list)
else:
    # nothing to inspect: avoid indexing an empty Counter / dividing by zero
    most_common_lang = 'unknown'
    most_common_lang_percentage = 0.0

# failed detections must not be reported as the chat's language
if most_common_lang_percentage > 0.5 and most_common_lang != 'unknown':
    print("Language of messages is", most_common_lang)
else:
    print("No dominant language detected")

Language of messages is en


**Tokenization**

In [4]:
from nltk.tokenize import word_tokenize
# Split every cleaned message into word tokens and keep only the messages
# that yield at least two tokens (one-token messages are discarded whole).
tokenized_list = []
for message in cleaned_list:
    tokens = word_tokenize(message)
    if len(tokens) > 1:
        tokenized_list.append(tokens)

**Removing Stopwords**

In [9]:

# Load the per-language stop-word table. String cells of the "stop_words"
# column are parsed back into Python objects with ast.literal_eval
# (presumably lists of words - confirm against lang_codes.csv); non-string
# cells such as missing values are left untouched.
STOPWORD_LANG = "en"

lang_codes = pd.read_csv("lang_codes.csv")
lang_codes["stop_words"] = lang_codes["stop_words"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Select the stop words of the chosen language and fail with a readable
# message (instead of a bare IndexError) when the table has no such row.
stop_word_rows = lang_codes.loc[lang_codes["languages"] == STOPWORD_LANG, "stop_words"]
if stop_word_rows.empty:
    raise ValueError(f"lang_codes.csv has no stop words for language {STOPWORD_LANG!r}")
stop_words_list = stop_word_rows.iloc[0]

In [10]:
# Filter every tokenized message in a single pass. A token is kept only if it
# is not a stop word, is longer than one character, and is purely alphabetic.
# The stop words are copied into a set so each membership test is a hash
# lookup instead of a scan over the whole stop-word list.
stop_words = set(stop_words_list)
tokenized_list = [
    [x for x in tokens if x not in stop_words and len(x) > 1 and x.isalpha()]
    for tokens in tokenized_list
]

**Lemmatization**

In [11]:
import random

# Fraction of the tokenized messages handed to lemmatization. 1.0 keeps every
# message (what this cell has always done); lower it, e.g. to 0.01 for 1%,
# to iterate faster on a subset.
SAMPLE_FRACTION = 1.0
# Fixed seed so the selection and its order are identical on every run.
SAMPLE_SEED = 42

# Number of sublists to select
num_sublists_to_select = int(len(tokenized_list) * SAMPLE_FRACTION)
# sample() draws without replacement, so no message is selected twice
selected_sublists = random.Random(SAMPLE_SEED).sample(tokenized_list, num_sublists_to_select)


In [18]:
# Lemmatize each selected message with spaCy's small English pipeline.
# The tokens of a message are re-joined into one string because the pipeline
# works on raw text; spaCy re-tokenizes that string itself.
nlp = spacy.load("en_core_web_sm")

# nlp.pipe streams the texts through the pipeline in batches and yields the
# parsed documents in input order, which is much faster than one nlp() call
# per message.
joined_docs = (" ".join(doc) for doc in selected_sublists)
lemmatized_list = [[token.lemma_ for token in parsed] for parsed in nlp.pipe(joined_docs)]

**LDA TOPIC MODELING**

In [None]:

# Tunable settings for the topic model
NUM_TOPICS = 10    # number of topics to extract
NUM_PASSES = 10    # number of passes over the corpus during training
TOPIC_WORDS = 20   # words shown per topic
LDA_SEED = 42      # fixed seed so the learned topics are the same on every run

# Map each lemma to an integer id and turn every document into bag-of-words form
dictionary = corpora.Dictionary(lemmatized_list)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in lemmatized_list]

# Create the LDA model
lda_model = models.LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=NUM_PASSES,
    random_state=LDA_SEED,
)

# Ask explicitly for all topics instead of relying on print_topics' default count
topics = lda_model.print_topics(num_topics=NUM_TOPICS, num_words=TOPIC_WORDS)

for topic_id, topic_repr in topics:
    # topic_repr is a " + "-separated list of weight*word terms; keep only the word part
    words = [term.split("*")[1] for term in topic_repr.split(" + ")]
    print("Topic {}: {}".format(topic_id, ", ".join(words)))

In [51]:
import nltk
# Fetch the 'vader_lexicon' resource into the local nltk data directory; it is
# presumably what the SentimentIntensityAnalyzer created in the next cell needs.
# The call's return value (True in the recorded output) is shown as the cell result.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/suhedayildirim/nltk_data...


True

In [55]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# VADER sentiment analyzer from nltk
sid = SentimentIntensityAnalyzer()

# One polarity-score dict per document, in the same order as lemmatized_list.
# Each document's lemmas are joined back into a single string before scoring.
sentiments = [sid.polarity_scores(" ".join(doc)) for doc in lemmatized_list]


In [None]:
# Per-topic sentiment: assign every document to its most probable LDA topic,
# then average the sentiment scores of the documents assigned to each topic.
SENTIMENT_KEYS = ('compound', 'pos', 'neg', 'neu')

# One bucket per topic, sized from the trained model so it can never fall out
# of sync with the number of topics the model was built with.
topic_sentiments = {i: [] for i in range(lda_model.num_topics)}

# Associate documents with topics based on the LDA model.
# minimum_probability=0.0 keeps every topic in the returned distribution, so
# max() always has candidates even for models with many low-weight topics.
topic_assignments = [
    max(lda_model.get_document_topics(doc, minimum_probability=0.0), key=lambda x: x[1])[0]
    for doc in doc_term_matrix
]

# Both lists must describe the same documents in the same order; a mismatch
# means the cells were run out of order against stale variables.
assert len(topic_assignments) == len(sentiments), "sentiments and doc_term_matrix are out of sync"

# Group documents by topic and aggregate sentiments
for topic, sentiment in zip(topic_assignments, sentiments):
    topic_sentiments[topic].append(sentiment)

# Calculate average sentiment scores for each topic (topics without documents are skipped)
topic_average_sentiments = {}
for topic, topic_sent_list in topic_sentiments.items():
    if topic_sent_list:
        topic_average_sentiments[topic] = {
            key: sum(s[key] for s in topic_sent_list) / len(topic_sent_list)
            for key in SENTIMENT_KEYS
        }

# Print average sentiment scores for each topic
for topic, sentiment in topic_average_sentiments.items():
    print(f"Topic {topic}: Compound: {sentiment['compound']}, Pos: {sentiment['pos']}, Neg: {sentiment['neg']}, Neu: {sentiment['neu']}")
