In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Load the LLM sentiment predictions gathered before Clove's release.
# The frame must be bound to `Predictions_BeforeClove`: every later "before" cell
# reads that name, and binding it to `Predictions_AfterClove` (as previously done)
# makes the notebook fail with NameError on a fresh kernel.
# NOTE(review): the absolute path is machine-specific; consider a configurable data directory.
Predictions_BeforeClove = pd.read_csv(r"C:\Users\saisa\OneDrive - Arizona State University\Documents\Courseworks\CIS 518 Big Data\Group Project\llm_predictions-before-clove.csv")

In [5]:
# Split the raw 'Sentence' column by predicted sentiment.
# A row is selected when its 'Label' contains the keyword (case-insensitive);
# rows with a missing label are excluded from both subsets.
positive_sentences_BeforeClove = Predictions_BeforeClove.loc[
    Predictions_BeforeClove['Label'].str.contains('positive', case=False, na=False),
    'Sentence',
]

negative_sentences_BeforeClove = Predictions_BeforeClove.loc[
    Predictions_BeforeClove['Label'].str.contains('negative', case=False, na=False),
    'Sentence',
]

In [6]:
# Matches tokens made up solely of word characters that are not digits.
# Same pattern as before, compiled once instead of on every token.
_ALPHA_TOKEN_RE = re.compile(r'[^\W\d]*$')


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it on first use only."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def text_processing(message):
    """Clean a single message and return it as a space-separated string of tokens.

    Steps, in order:
      1. Lower-case the message and tokenize it with ``TextBlob(...).words``.
      2. Keep only tokens that fully match ``[^\\W\\d]*`` (word characters, no digits).
      3. Drop tokens found in NLTK's English stopword list.
      4. Lemmatize each remaining token as a verb with ``WordNetLemmatizer``,
         except the token ``'clove'``, which is kept verbatim.

    Parameters
    ----------
    message : str
        Raw text; must support ``.lower()``.

    Returns
    -------
    str
        The processed tokens joined by single spaces (``''`` if nothing survives).
    """
    # Joining and re-splitting guarantees whitespace-free tokens, as the original did.
    tokens = ' '.join(TextBlob(message.lower()).words).split()

    # The stopword list was previously re-read from the corpus for every token;
    # a cached set makes each membership test O(1) with the same result.
    stop_set = _english_stopwords()
    kept_tokens = [
        token for token in tokens
        if _ALPHA_TOKEN_RE.match(token) and token not in stop_set
    ]

    lem = WordNetLemmatizer()
    normalized_tokens = [
        # 'clove' bypasses the lemmatizer so the token is always emitted unchanged.
        'clove' if token.lower() == 'clove' else lem.lemmatize(token, 'v')
        for token in kept_tokens
    ]
    return ' '.join(normalized_tokens)

In [7]:
# Apply preprocessing with safety checks
def safe_text_processing(text):
    """Run ``text_processing`` on ``text``, mapping missing or blank input to ``""``.

    A value counts as blank when ``pd.isna`` reports it missing or when it
    contains nothing but whitespace.
    """
    is_blank = pd.isna(text) or not text.strip()
    return "" if is_blank else text_processing(text)

In [8]:
# Row masks by predicted sentiment (case-insensitive match on 'Label'; missing labels excluded).
# Computed once and reused for every selection below.
positive_mask_BeforeClove = Predictions_BeforeClove['Label'].str.contains('positive', case=False, na=False)
negative_mask_BeforeClove = Predictions_BeforeClove['Label'].str.contains('negative', case=False, na=False)

# Always preprocess from the raw 'Sentence' column. The previous version read from
# `positive_sentences_BeforeClove` / `negative_sentences_BeforeClove`, which this same
# cell overwrites with preprocessed text, so re-running the cell double-processed the data.
positive_sentences_preprocessed = Predictions_BeforeClove.loc[positive_mask_BeforeClove, 'Sentence'].apply(safe_text_processing)
negative_sentences_preprocessed = Predictions_BeforeClove.loc[negative_mask_BeforeClove, 'Sentence'].apply(safe_text_processing)

# Add preprocessed text to DataFrame (assignment aligns on the row index)
Predictions_BeforeClove.loc[positive_mask_BeforeClove, 'Preprocessed_Sentence'] = positive_sentences_preprocessed
Predictions_BeforeClove.loc[negative_mask_BeforeClove, 'Preprocessed_Sentence'] = negative_sentences_preprocessed

# Expose the preprocessed subsets under the names the downstream cells use
positive_sentences_BeforeClove = Predictions_BeforeClove.loc[positive_mask_BeforeClove, 'Preprocessed_Sentence']
negative_sentences_BeforeClove = Predictions_BeforeClove.loc[negative_mask_BeforeClove, 'Preprocessed_Sentence']

In [9]:
# Preview the first five entries of the positive-label subset (sanity check).
positive_sentences_BeforeClove.head(5)

2     completely agree lock another smoke support cl...
4     worry last week use time unlock clove test cus...
5                      clove main comp hours click head
8     interest take upon see agent zellsis say defin...
12    work incredibly well viper attack throw usual ...
Name: Preprocessed_Sentence, dtype: object

In [10]:
# Preview the first five entries of the negative-label subset (sanity check).
negative_sentences_BeforeClove.head(5)

0    sorry anyone want see actual clove footage can...
1      open spoil post say footage clove footage clove
3    saw concepts cloves hair like damn go one wors...
6    personally still plan stick omen trust plat te...
7    dont think viable pro play agent stall retake ...
Name: Preprocessed_Sentence, dtype: object

In [11]:
def _make_topic_pipeline():
    """Build an unfitted bag-of-words -> LDA pipeline.

    The vectorizer uses ``max_df=0.8``, ``min_df=2`` and scikit-learn's built-in
    English stop words; the LDA step has 10 components and a fixed
    ``random_state`` of 42.
    """
    return Pipeline([
        ('vectorizer', CountVectorizer(max_df=0.8, min_df=2, stop_words='english')),
        ('lda', LatentDirichletAllocation(n_components=10, random_state=42)),
    ])


# One independent pipeline per sentiment label, both with identical settings
positive_pipeline = _make_topic_pipeline()
negative_pipeline = _make_topic_pipeline()

In [12]:
# Train each pipeline on its own sentiment subset; the values are cast to
# unicode strings before being handed to the vectorizer.
positive_pipeline.fit(positive_sentences_BeforeClove.to_numpy().astype('U'))
negative_pipeline.fit(negative_sentences_BeforeClove.to_numpy().astype('U'))

# Pull the fitted steps out of each pipeline for inspection
positive_vectorizer, positive_lda_model = (
    positive_pipeline.named_steps[step] for step in ('vectorizer', 'lda')
)
negative_vectorizer, negative_lda_model = (
    negative_pipeline.named_steps[step] for step in ('vectorizer', 'lda')
)

In [13]:
# Function to display the top words for each topic
def display_top_words(model, feature_names, n_top_words=10):
    """Print the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object
        Must expose ``components_``, iterated row by row; each row must
        support ``argsort()`` (e.g. a fitted LDA model).
    feature_names : sequence
        Vocabulary indexed by the positions returned from ``argsort()``.
    n_top_words : int, default 10
        Number of terms to print per topic. Values ``<= 0`` print an empty list.

    Notes
    -----
    Terms are listed in ascending weight order, so the heaviest term comes last.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Report the number of words actually requested rather than a hard-coded 10.
        print(f"Top {n_top_words} words for topic #{topic_idx}:")
        # A `[-0:]` slice would return the entire vocabulary, so guard n <= 0 explicitly.
        top_indices = topic.argsort()[-n_top_words:] if n_top_words > 0 else []
        print([feature_names[i] for i in top_indices])
        print("\n")

In [14]:
# Ten highest-weighted terms of topic #0 in the positive-label model (heaviest last)
print('Most "Important" words for forming topic distribution in Positive Labels Before_Clove:')
positive_top_words = [
    positive_vectorizer.get_feature_names_out()[term_idx]
    for term_idx in positive_lda_model.components_[0].argsort()[-10:]
]
print(positive_top_words)

# Top terms of every topic in the positive-label model
print("\nTop 10 Topics in Positive Labels Before_Clove:")
display_top_words(
    positive_lda_model,
    positive_vectorizer.get_feature_names_out(),
)

Most "Important" words for forming topic distribution in Positive Labels Before_Clove:
['week', 'pronouns', 'make', 'think', 'like', 'smoke', 'use', 'non', 'binary', 'play']

Top 10 Topics in Positive Labels Before_Clove:
Top 10 words for topic #0:
['week', 'pronouns', 'make', 'think', 'like', 'smoke', 'use', 'non', 'binary', 'play']


Top 10 words for topic #1:
['extra', 'say', 'idk', 'playstyle', 'overheal', 'reyna', 'time', 'really', 'play', 'need']


Top 10 words for topic #2:
['multiple', 'shoot', 'omen', 'site', 'jett', 'say', 'think', 'refer', 'smoke', 'play']


Top 10 words for topic #3:
['base', 'agents', 'site', 'agent', 'think', 'roles', 'come', 'shoot', 'smoke', 'play']


Top 10 words for topic #4:
['free', 'attack', 'agent', 'extra', 'throw', 'lurk', 'death', 'team', 'smoke', 'viper']


Top 10 words for topic #5:
['try', 'probably', 'main', 'ult', 'point', 'people', 'players', 'role', 'controller', 'play']


Top 10 words for topic #6:
['far', 'pronouns', 'know', 'play', 'd

In [15]:
# Ten highest-weighted terms of topic #0 in the negative-label model (heaviest last)
print('Most "Important" words for forming topic distribution in Negative Labels Before_Clove:')
Negative_top_words = [
    negative_vectorizer.get_feature_names_out()[term_idx]
    for term_idx in negative_lda_model.components_[0].argsort()[-10:]
]
print(Negative_top_words)

# Top terms of every topic in the negative-label model
print("\nTop 10 Topics in Negative Labels Before_Clove:")
display_top_words(
    negative_lda_model,
    negative_vectorizer.get_feature_names_out(),
)


Most "Important" words for forming topic distribution in Negative Labels Before_Clove:
['agree', 'useless', 'die', 'like', 'actual', 'want', 'people', 'footage', 'say', 'wait']

Top 10 Topics in Negative Labels Before_Clove:
Top 10 words for topic #0:
['agree', 'useless', 'die', 'like', 'actual', 'want', 'people', 'footage', 'say', 'wait']


Top 10 words for topic #1:
['footage', 'comms', 'use', 'team', 'actual', 'want', 'retake', 'pronouns', 'multiple', 'say']


Top 10 words for topic #2:
['agree', 'useless', 'die', 'like', 'actual', 'want', 'people', 'footage', 'say', 'wait']


Top 10 words for topic #3:
['death', 'useless', 'run', 'agent', 'limit', 'pretty', 'util', 'op', 'think', 'smoke']


Top 10 words for topic #4:
['footage', 'say', 'wait', 'util', 'players', 'meta', 'hype', 'strong', 'duelist', 'fuck']


Top 10 words for topic #5:
['agree', 'useless', 'die', 'like', 'actual', 'want', 'people', 'footage', 'say', 'wait']


Top 10 words for topic #6:
['say', 'people', 'players', '

In [17]:
# Load the LLM predictions CSV for the after-Clove period into its own frame.
# NOTE(review): the absolute path is machine-specific; consider a configurable data directory.
Predictions_AfterClove = pd.read_csv(r"C:\Users\saisa\OneDrive - Arizona State University\Documents\Courseworks\CIS 518 Big Data\Group Project\llm_predictions-after-clove.csv")

In [20]:
# Split the raw 'Sentence' column by predicted sentiment.
# A row is selected when its 'Label' contains the keyword (case-insensitive);
# rows with a missing label are excluded from both subsets.
positive_sentences_AfterClove = Predictions_AfterClove.loc[
    Predictions_AfterClove['Label'].str.contains('positive', case=False, na=False),
    'Sentence',
]
negative_sentences_AfterClove = Predictions_AfterClove.loc[
    Predictions_AfterClove['Label'].str.contains('negative', case=False, na=False),
    'Sentence',
]

In [21]:
# Row masks by predicted sentiment (case-insensitive match on 'Label'; missing labels excluded).
# Computed once and reused for every selection below.
positive_mask_AfterClove = Predictions_AfterClove['Label'].str.contains('positive', case=False, na=False)
negative_mask_AfterClove = Predictions_AfterClove['Label'].str.contains('negative', case=False, na=False)

# Always preprocess from the raw 'Sentence' column. The previous version read from
# `positive_sentences_AfterClove` / `negative_sentences_AfterClove`, which this same
# cell overwrites with preprocessed text, so re-running the cell double-processed the data.
positive_sentences_preprocessed = Predictions_AfterClove.loc[positive_mask_AfterClove, 'Sentence'].apply(safe_text_processing)
negative_sentences_preprocessed = Predictions_AfterClove.loc[negative_mask_AfterClove, 'Sentence'].apply(safe_text_processing)

# Add preprocessed text to DataFrame (assignment aligns on the row index)
Predictions_AfterClove.loc[positive_mask_AfterClove, 'Preprocessed_Sentence'] = positive_sentences_preprocessed
Predictions_AfterClove.loc[negative_mask_AfterClove, 'Preprocessed_Sentence'] = negative_sentences_preprocessed

# Expose the preprocessed subsets under the names the downstream cells use
positive_sentences_AfterClove = Predictions_AfterClove.loc[positive_mask_AfterClove, 'Preprocessed_Sentence']
negative_sentences_AfterClove = Predictions_AfterClove.loc[negative_mask_AfterClove, 'Preprocessed_Sentence']

In [22]:
# Re-train the existing pipelines on the after-Clove subsets; the values are
# cast to unicode strings before being handed to the vectorizer.
positive_pipeline.fit(positive_sentences_AfterClove.to_numpy().astype('U'))
negative_pipeline.fit(negative_sentences_AfterClove.to_numpy().astype('U'))

# Pull the fitted steps out of each pipeline for inspection
positive_vectorizer, positive_lda_model = (
    positive_pipeline.named_steps[step] for step in ('vectorizer', 'lda')
)
negative_vectorizer, negative_lda_model = (
    negative_pipeline.named_steps[step] for step in ('vectorizer', 'lda')
)

In [23]:
# Ten highest-weighted terms of topic #0 in the positive-label model (heaviest last)
print('Most "Important" words for forming topic distribution in Positive Labels After_Clove:')
positive_top_words = [
    positive_vectorizer.get_feature_names_out()[term_idx]
    for term_idx in positive_lda_model.components_[0].argsort()[-10:]
]
print(positive_top_words)

# Top terms of every topic in the positive-label model
print("\nTop 10 Topics in Positive Labels After_Clove:")
display_top_words(
    positive_lda_model,
    positive_vectorizer.get_feature_names_out(),
)

Most "Important" words for forming topic distribution in Positive Labels After_Clove:
['really', 'die', 'ult', 'heal', 'know', 'play', 'people', 'sage', 'round', 'kill']

Top 10 Topics in Positive Labels After_Clove:
Top 10 words for topic #0:
['really', 'die', 'ult', 'heal', 'know', 'play', 'people', 'sage', 'round', 'kill']


Top 10 words for topic #1:
['death', 'use', 'try', 'entry', 'team', 'better', 'duelist', 'die', 'smoke', 'say']


Top 10 words for topic #2:
['correct', 'person', 'control', 'identify', 'birth', 'nonbinary', 'like', 'use', 'assign', 'make']


Top 10 words for topic #3:
['agent', 'round', 'agents', 'ability', 'team', 'like', 'play', 'good', 'omen', 'smoke']


Top 10 words for topic #4:
['self', 'better', 'omen', 'ult', 'second', 'probably', 'revive', 'team', 'play', 'smoke']


Top 10 words for topic #5:
['feel', 'way', 'na', 'gon', 'im', 'ppl', 'game', 'people', 'like', 'play']


Top 10 words for topic #6:
['use', 'ult', 'smoke', 'good', 'isnt', 'like', 'sec', 'g

In [24]:
# Ten highest-weighted terms of topic #0 in the negative-label model (heaviest last)
print('Most "Important" words for forming topic distribution in Negative Labels After_Clove:')
Negative_top_words = [
    negative_vectorizer.get_feature_names_out()[term_idx]
    for term_idx in negative_lda_model.components_[0].argsort()[-10:]
]
print(Negative_top_words)

# Top terms of every topic in the negative-label model
print("\nTop 10 Topics in Negative Labels After_Clove:")
display_top_words(
    negative_lda_model,
    negative_vectorizer.get_feature_names_out(),
)


Most "Important" words for forming topic distribution in Negative Labels After_Clove:
['people', 'chamber', 'duelist', 'im', 'play', 'omen', 'let', 'theyre', 'like', 'smoke']

Top 10 Topics in Negative Labels After_Clove:
Top 10 words for topic #0:
['people', 'chamber', 'duelist', 'im', 'play', 'omen', 'let', 'theyre', 'like', 'smoke']


Top 10 words for topic #1:
['game', 'map', 'life', 'ur', 'smoke', 'like', 'omen', 'dont', 'think', 'play']


Top 10 words for topic #2:
['need', 'reyna', 'correct', 'game', 'try', 'say', 'like', 'use', 'play', 'people']


Top 10 words for topic #3:
['think', 'cloves', 'pronouns', 'game', 'character', 'say', 'like', 'fuck', 'people', 'feel']


Top 10 words for topic #4:
['way', 'cloves', 'harm', 'kill', 'like', 'game', 'smoke', 'say', 'think', 'people']


Top 10 words for topic #5:
['omen', 'pro', 'pick', 'good', 'like', 'agent', 'smoke', 'agents', 'team', 'play']


Top 10 words for topic #6:
['play', 'bug', 'round', 'result', 'die', 'game', 'kill', 'ul