In [224]:
import pandas as pd
import numpy as np
import unicodedata

import re
import string

import spacy

import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
!pip install spacy

In [225]:
df_fl = pd.read_excel("Florida_Senate_Sample_data.xlsx")
df_Ca = pd.read_excel("California_Senate_Sample_data.xlsx",sheet_name="Sheet1")

In [226]:
df_fl.head()

Unnamed: 0,user_name,text,created_at,favorite_count
0,Senator Scott Wiener,#GoldBlooded forever,2023-04-30,36
1,Senator Scott Wiener,I had the honor of awarding @SpeakerPelosi the...,2023-04-30,127
2,Senator Scott Wiener,National ban on assault weapons. Now.,2023-04-30,202
3,Senator Scott Wiener,Trip #6 on Muni today for community events — t...,2023-04-30,191
4,Senator Scott Wiener,"RT @Esqueer_: Happening now in Columbus, Ohio....",2023-04-30,0


In [227]:
#create a function to clean tweets
def cleanTxt(text):
    text = text.replace('\u2026','')
    text = re.sub(r'@[A-Za-z0-9_:]+', '', text) #remove @mentions
    text = re.sub(r'RT[\s]+', '', text) # remove ReTweets
    text = re.sub(r'https?:\/\/\S+', '', text) # remove hyperlinks 
    text = re.sub(r'(\n)', '', text)
    text = text.lower()
    
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    text = re.sub(r'[!]+', '', text)  # remove exclamation marks
    # remove emoticons
    text = re.sub(r'[:;=][\)\(\[\]DPOp]([pPoODsS/\\]+[vV])?|[\)\(\[\]DPOp][:=;][vV]?|[Xx][:;=][\)\(\[\]DPOp]|<3', '', text)
    # Remove special characters and symbols
    text = ''.join(c for c in text if unicodedata.category(c) != 'So' and unicodedata.category(c) != 'Sk')
    # Remove additional patterns
    text = re.sub(r'\[[^\]]*\]', '', text)  # remove square brackets and their contents
    text = re.sub(r'’|‘|“|”', '', text)  # remove specific quotation marks
    text = re.sub(r'—|–', '', text)  # remove dashes
    text = re.sub(r'\u2066|\u2069|\u200d', '', text)  # remove specific Unicode characters
    text = re.sub(r'¡|¢|‼️|•', '', text)  # remove specific symbols
    text = re.sub(r'𝟮𝟬𝟮𝟯|️⃣', '', text)  # remove specific Unicode characters
    # remove specific Chinese text
    text = re.sub(r'我和我的团队与加州理发和美容委员会合作，提供简体中文考试！我们的目标是为理发和美容行业的学生和被许可人提供更好的社区访问并减少语言障碍。我们很自豪地宣布这些考试现已生效，所以请分享这个消息', '', text)
    text = re.sub(r'我向蒙特利公园', '', text)  # remove specific Chinese text
    text = re.sub(r'枪击案的受害者和他们的家人表示哀悼。蒙特利公园兰利老年中心', '', text)  # remove specific Chinese text
    text = re.sub(r'提供受害者服务。请到那里寻求帮助', '', text)  # remove specific Chinese text
    text = re.sub(r'新年快乐', '', text)  # remove specific Chinese text
    text = re.sub(r'\u200dfunding', '', text)  # remove specific Unicode characters
    text = re.sub(r'️\u200d️\u200d️thank', '', text)  # remove specific Unicode characters
    text = re.sub(r'\u2066to', '', text)  # remove specific Unicode characters
    text = re.sub(r'\u2066', '', text)  # remove specific Unicode characters
    text = re.sub(r'ab–', '', text)  # remove specific text
    text = re.sub(r'¡ha', '', text)  # remove specific text
    text = re.sub(r'¡feliz', '', text)  # remove specific text
    text = re.sub(r'¡si', '', text)  # remove specific text
    text = re.sub(r'\u2066latest', '', text)  # remove specific Unicode characters
    text = re.sub(r'↓|→', '', text)  # remove arrows
    text = re.sub(r'backs—and|back—now', '', text)  # remove specific phrases
    text = re.sub(r'‼️', '', text)  # remove specific Unicode characters
    text = re.sub(r'•|•transitional|•national', '', text)  # remove specific phrases
    text = re.sub(r"['\[\],\u200b]", "", text)
    text = re.sub(r"['\[\],\u200b\uFE0F\u200D]", "", text)
    text = re.sub(r'[\[\]]', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = re.sub(r'\bu\b|\bca\b', '', text)
    return text

In [228]:
df_fl['text_clean'] = df_fl['text'].apply(cleanTxt)

In [229]:
from nltk.tokenize import word_tokenize

In [230]:
def tokenize_text(text):
    tokens = word_tokenize(text)
    return ' '.join(tokens)

In [231]:
df_fl['tokenized_text'] = df_fl['text_clean'].apply(tokenize_text)

In [232]:
df_fl.head()

Unnamed: 0,user_name,text,created_at,favorite_count,text_clean,tokenized_text
0,Senator Scott Wiener,#GoldBlooded forever,2023-04-30,36,goldblooded forever,goldblooded forever
1,Senator Scott Wiener,I had the honor of awarding @SpeakerPelosi the...,2023-04-30,127,i had the honor of awarding the equality cali...,i had the honor of awarding the equality calif...
2,Senator Scott Wiener,National ban on assault weapons. Now.,2023-04-30,202,national ban on assault weapons now,national ban on assault weapons now
3,Senator Scott Wiener,Trip #6 on Muni today for community events — t...,2023-04-30,191,trip on muni today for community events this...,trip on muni today for community events this o...
4,Senator Scott Wiener,"RT @Esqueer_: Happening now in Columbus, Ohio....",2023-04-30,0,happening now in columbus ohio neo nazis flyi...,happening now in columbus ohio neo nazis flyin...


In [233]:
from nltk.corpus import stopwords

In [234]:
stop_words = stopwords.words('english')
#english_alphabet = [chr(x) for x in range(ord('a'), ord('z')+1)]
stop_words.append('u')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [235]:
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [236]:
df_fl['stopwords_text'] = df_fl['tokenized_text'].apply(remove_stopwords)
df_fl.head()

Unnamed: 0,user_name,text,created_at,favorite_count,text_clean,tokenized_text,stopwords_text
0,Senator Scott Wiener,#GoldBlooded forever,2023-04-30,36,goldblooded forever,goldblooded forever,goldblooded forever
1,Senator Scott Wiener,I had the honor of awarding @SpeakerPelosi the...,2023-04-30,127,i had the honor of awarding the equality cali...,i had the honor of awarding the equality calif...,honor awarding equality california vanguard le...
2,Senator Scott Wiener,National ban on assault weapons. Now.,2023-04-30,202,national ban on assault weapons now,national ban on assault weapons now,national ban assault weapons
3,Senator Scott Wiener,Trip #6 on Muni today for community events — t...,2023-04-30,191,trip on muni today for community events this...,trip on muni today for community events this o...,trip muni today community events one californi...
4,Senator Scott Wiener,"RT @Esqueer_: Happening now in Columbus, Ohio....",2023-04-30,0,happening now in columbus ohio neo nazis flyi...,happening now in columbus ohio neo nazis flyin...,happening columbus ohio neo nazis flying swast...


In [237]:
from nltk.stem import WordNetLemmatizer

In [238]:
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    filtered_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [239]:
df_fl['lemma_text'] = df_fl['stopwords_text'].apply(preprocess_text)
df_fl.head()

Unnamed: 0,user_name,text,created_at,favorite_count,text_clean,tokenized_text,stopwords_text,lemma_text
0,Senator Scott Wiener,#GoldBlooded forever,2023-04-30,36,goldblooded forever,goldblooded forever,goldblooded forever,goldblooded forever
1,Senator Scott Wiener,I had the honor of awarding @SpeakerPelosi the...,2023-04-30,127,i had the honor of awarding the equality cali...,i had the honor of awarding the equality calif...,honor awarding equality california vanguard le...,honor awarding equality california vanguard le...
2,Senator Scott Wiener,National ban on assault weapons. Now.,2023-04-30,202,national ban on assault weapons now,national ban on assault weapons now,national ban assault weapons,national ban assault weapon
3,Senator Scott Wiener,Trip #6 on Muni today for community events — t...,2023-04-30,191,trip on muni today for community events this...,trip on muni today for community events this o...,trip muni today community events one californi...,trip muni today community event one california...
4,Senator Scott Wiener,"RT @Esqueer_: Happening now in Columbus, Ohio....",2023-04-30,0,happening now in columbus ohio neo nazis flyi...,happening now in columbus ohio neo nazis flyin...,happening columbus ohio neo nazis flying swast...,happening columbus ohio neo nazi flying swasti...


In [240]:
df_fl.shape

(4266, 8)

In [242]:
all_words = []
tweet_words = df_fl['lemma_text'].apply(word_tokenize)
[all_words.extend(x) for x in list(tweet_words)]
print(all_words)



In [243]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary([all_words])

# Create Corpus
texts = [all_words]

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [244]:
corpus

[[(0, 4),
  (1, 1),
  (2, 1),
  (3, 176),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 5),
  (8, 12),
  (9, 2),
  (10, 1),
  (11, 1),
  (12, 12),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 4),
  (17, 8),
  (18, 1),
  (19, 2),
  (20, 6),
  (21, 1),
  (22, 1),
  (23, 2),
  (24, 5),
  (25, 1),
  (26, 1),
  (27, 4),
  (28, 1),
  (29, 2),
  (30, 1),
  (31, 1),
  (32, 3),
  (33, 2),
  (34, 7),
  (35, 1),
  (36, 40),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 3),
  (45, 6),
  (46, 7),
  (47, 8),
  (48, 2),
  (49, 9),
  (50, 1),
  (51, 8),
  (52, 11),
  (53, 2),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 4),
  (61, 7),
  (62, 1),
  (63, 2),
  (64, 1),
  (65, 1),
  (66, 2),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 6),
  (72, 47),
  (73, 1),
  (74, 1),
  (75, 50),
  (76, 35),
  (77, 2),
  (78, 42),
  (79, 1),
  (80, 1),
  (81, 9),
  (82, 3),
  (83, 1),
  (84, 5),
  (85, 8),
  (86, 2),
  (87, 2),
  (88, 10),
  (89, 1),
  (90, 48

In [249]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=4, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [250]:
lda_model.show_topics()

[(0,
  '0.000*"amp" + 0.000*"thank" + 0.000*"today" + 0.000*"bill" + 0.000*"state" + 0.000*"california" + 0.000*"year" + 0.000*"day" + 0.000*"community" + 0.000*"ab"'),
 (1,
  '0.001*"amp" + 0.000*"today" + 0.000*"thank" + 0.000*"year" + 0.000*"bill" + 0.000*"california" + 0.000*"sb" + 0.000*"state" + 0.000*"day" + 0.000*"great"'),
 (2,
  '0.001*"amp" + 0.001*"today" + 0.001*"thank" + 0.001*"bill" + 0.000*"california" + 0.000*"state" + 0.000*"community" + 0.000*"new" + 0.000*"day" + 0.000*"ab"'),
 (3,
  '0.009*"amp" + 0.009*"today" + 0.008*"thank" + 0.007*"california" + 0.007*"state" + 0.006*"bill" + 0.005*"year" + 0.004*"day" + 0.004*"sb" + 0.004*"community"')]

In [248]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
from nltk.probability import FreqDist

word_freq = FreqDist(all_words)

#word_freq
word_freq.most_common(100)

In [None]:
#retrieve word and count from FreqDist tuples

most_common_count = [x[1] for x in word_freq.most_common(100)]
most_common_word = [x[0] for x in word_freq.most_common(100)]

#create dictionary mapping of word count
top_100_dictionary = dict(zip(most_common_word, most_common_count))
top_100_dictionary

In [None]:
from gensim.corpora import Dictionary
#create dictionary
text_dict = Dictionary(all_words)

#view integer mappings
text_dict.token2id

In [None]:
# all_words
tweet_words

In [None]:
tweets_bow = [text_dict.doc2bow(tweet) for tweet in tweet_words]
tweets_bow

In [None]:
from gensim.models.ldamodel import LdaModel

k = 10
tweets_lda = LdaModel(tweets_bow,
                      num_topics = k,
                      id2word = text_dict,
                      random_state = 1,
                      passes=10)



In [None]:
print(tweets_bow)

In [None]:
# Considering 1-15 topics, as the last is cut off
num_topics = list(range(16)[1:])
num_keywords = 15

LDA_models = {}
LDA_topics = {}
for i in num_topics:
    LDA_models[i] = LdaModel(corpus=tweets_bow,
                             id2word=text_dict,
                             num_topics=i,
                             update_every=1,
                             chunksize=len(tweets_bow),
                             passes=20,
                             alpha='auto',
                             random_state=42)

    shown_topics = LDA_models[i].show_topics(num_topics=i, 
                                             num_words=num_keywords,
                                             formatted=False)
    LDA_topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]

In [None]:
tweets_bow

In [None]:
def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = (A ∩ B)/(A ∪ B)
    - Goal is low Jaccard scores for coverage of the diverse elements
    """
    intersection = set(topic_1).intersection(set(topic_2))
    union = set(topic_1).union(set(topic_2))
                    
    return float(len(intersection))/float(len(union))

In [None]:
LDA_stability = {}
for i in range(0, len(num_topics)-1):
    jaccard_sims = []
    for t1, topic1 in enumerate(LDA_topics[num_topics[i]]): # pylint: disable=unused-variable
        sims = []
        for t2, topic2 in enumerate(LDA_topics[num_topics[i+1]]): # pylint: disable=unused-variable
            sims.append(jaccard_similarity(topic1, topic2))    
        
        jaccard_sims.append(sims)    
    
    LDA_stability[num_topics[i]] = jaccard_sims
                
mean_stabilities = [np.array(LDA_stability[i]).mean() for i in num_topics[:-1]]

In [None]:
from gensim.models import CoherenceModel

In [None]:
coherences = [CoherenceModel(model=LDA_models[i], corpus=tweets_bow, dictionary=text_dict, coherence='u_mass').get_coherence()\
              for i in num_topics[:-1]]

In [None]:
coherences = CoherenceModel(model=LDA_models[1], corpus=tweets_bow, dictionary=text_dict, coherence='u_mass').get_coherence()

In [None]:
coh_sta_diffs = [coherences[i] - mean_stabilities[i] for i in range(num_keywords)[:-1]] # limit topic numbers to the number of keywords
coh_sta_max = max(coh_sta_diffs)
coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max ]
ideal_topic_num_index = coh_sta_max_idxs[0] # choose less topics in case there's more than one max
ideal_topic_num = num_topics[ideal_topic_num_index]

In [None]:
print (coherences)

In [None]:
print(j)

In [None]:
plt.figure(figsize=(20,10))
ax = sns.lineplot(x=num_topics[:-1], y=mean_stabilities, label='Average Topic Overlap')
ax = sns.lineplot(x=num_topics[:-1], y=coherences, label='Topic Coherence')

ax.axvline(x=ideal_topic_num, label='Ideal Number of Topics', color='black')
ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

y_max = max(max(mean_stabilities), max(coherences)) + (0.10 * max(max(mean_stabilities), max(coherences)))
ax.set_ylim([0, y_max])
ax.set_xlim([1, num_topics[-1]-1])
                
ax.axes.set_title('Model Metrics per Number of Topics', fontsize=25)
ax.set_ylabel('Metric Level', fontsize=20)
ax.set_xlabel('Number of Topics', fontsize=20)
plt.legend(fontsize=20)
plt.show()   

In [None]:
tweets_lda.show_topics()

In [None]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(tweets_lda, tweets_bow, dictionary=tweets_lda.id2word)
vis