In [None]:
# tweet analysis using BERT
# https://medium.com/@hajar.zankadi/using-bertopic-and-bertweet-transformer-to-predict-interest-tag-from-tweets-67189f11b992
#

In [1]:

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# import tweet preprocessor
import preprocessor as p

import re
import pandas as pd
from nltk.corpus import stopwords
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
#from wordcloud import WordCloud
import matplotlib.pyplot as plt


2022-06-14 14:56:38,254 : INFO : Note: NumExpr detected 10 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-06-14 14:56:38,255 : INFO : NumExpr defaulting to 8 threads.


In [2]:
#import dataset
data_tweet= pd.read_csv('../data/clean/NPS_NATL_subset.csv')


In [3]:

data_tweet.shape
data_tweet.head()


Unnamed: 0,Location,Workforce,NPS® Breakdown,respid2,NPS_Code,NPSCommentCleaned,NPSCommentLemmatised,NPSCommentPolarity,NPSCommentSubjectivity,OverallCommentCleaned,OverallCommentLemmatised,OverallCommentPolarity,OverallCommentSubjectivity
0,1224,Precinct,Promoter,6702347,2,because the representative listened to my conc...,Because the representative listened to my conc...,0.0,0.0,did great work,Did great work,0.8,0.75
1,129,Precinct,Promoter,6589878,2,theyre so helpful and knowledgable,They're so helpful and knowledgable,0.0,0.0,katelyn feliciano was so wonderful she was war...,Katelyn Feliciano was so wonderful . She was w...,0.6,0.633333
2,247,Precinct,Promoter,6691534,2,the service requested was preformed quickly an...,The service I requested was preformed quickly ...,0.354167,0.5,xyxyxz,xyxyxz,0.0,0.0
3,377,Autotech,Promoter,7017148,2,cody mitchell is absolutely amazing hes very s...,Cody Mitchell is absolutely amazing . He's ver...,0.444082,0.730658,cody has been absolutely amazing and consider ...,Cody has been absolutely amazing and I conside...,0.8,0.6
4,216,Precinct,Promoter,6794996,2,the careful attention provided by tech staff,The careful attention provided by tech staff.,-0.1,1.0,responsive and respectful,Responsive and respectful.,0.5,0.7


In [4]:
data_tweet = data_tweet.filter(['NPS_Code','NPSCommentCleaned'], axis=1)

#remove duplicates
data_tweet=data_tweet.drop_duplicates()
data_tweet.reset_index(drop=True, inplace=True)

#work with a sample of 10000 tweets
data = data_tweet.iloc[0:10000]

data.head()


Unnamed: 0,NPS_Code,NPSCommentCleaned
0,2,because the representative listened to my conc...
1,2,theyre so helpful and knowledgable
2,2,the service requested was preformed quickly an...
3,2,cody mitchell is absolutely amazing hes very s...
4,2,the careful attention provided by tech staff


In [5]:

# Customizing the preprocessor to exclude removing hashtags since they are valuable as a rich information
p.set_options(p.OPT.URL, p.OPT.EMOJI,p.OPT.MENTION,p.OPT.SMILEY,p.OPT.NUMBER )

#forming a separate feature for cleaned tweets
for i,v in enumerate(data['NPSCommentCleaned']):
    data.loc[i,'text'] = p.clean(str(v))
    
data.head()


Unnamed: 0,NPS_Code,NPSCommentCleaned,text
0,2,because the representative listened to my conc...,because the representative listened to my conc...
1,2,theyre so helpful and knowledgable,theyre so helpful and knowledgable
2,2,the service requested was preformed quickly an...,the service requested was preformed quickly an...
3,2,cody mitchell is absolutely amazing hes very s...,cody mitchell is absolutely amazing hes very s...
4,2,the careful attention provided by tech staff,the careful attention provided by tech staff


In [6]:

#REMOVE ONLY THE '#'NOT THE WORD AFTER
def remove_hashtag_sign(text):
    text = re.sub(r'#', '', text)
    return text

data['text'] = data['text'].apply(lambda x:remove_hashtag_sign(x))


In [7]:

#Remove extra white spaces, punctuation and apply lower casing
data['text'] = data['text'].str.lower().str.replace('[^\w\s]',' ').str.replace('\s\s+', ' ')
data.head()


Unnamed: 0,NPS_Code,NPSCommentCleaned,text
0,2,because the representative listened to my conc...,because the representative listened to my conc...
1,2,theyre so helpful and knowledgable,theyre so helpful and knowledgable
2,2,the service requested was preformed quickly an...,the service requested was preformed quickly an...
3,2,cody mitchell is absolutely amazing hes very s...,cody mitchell is absolutely amazing hes very s...
4,2,the careful attention provided by tech staff,the careful attention provided by tech staff


In [8]:

#prepare tweet list
tweet_list = data.text.tolist()

#remove english stop words
stopwords = stopwords.words('english') 

#Use CountVectorizer to remove stopwords
vectorizer_model = CountVectorizer(stop_words= stopwords)


In [9]:
topic_model = BERTopic(embedding_model="vinai/bertweet-base", vectorizer_model=vectorizer_model,low_memory=True,calculate_probabilities=True,verbose=True, n_gram_range=(1, 3))


In [11]:
topics, probs = topic_model.fit_transform(tweet_list)


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

IndexError: index out of range in self

In [None]:
topic_model.get_topic_info()


In [None]:
topic_model.get_topic(1)


In [None]:

def get_bert_topics(topic_model, num_topics):
    word_dict = {}
    for i in range(num_topics):
        words=topic_model.get_topic(i)
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words]
    return pd.DataFrame(word_dict)

get_bert_topics(topic_model, len(set(topics))-1)


In [None]:
topic_model.visualize_barchart()


In [None]:

def create_wordcloud(topic_model, topic):
    text = {word: value for word, value in topic_model.get_topic(topic)}
    wc = WordCloud(background_color="white", max_words=1000)
    wc.generate_from_frequencies(text)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title("Topic"+" "+ str(topic))
    plt.show()
 
#visualize the top 3 topics
for i in range(1,4):
    create_wordcloud(topic_model, topic=i)


In [None]:

docs = tweet_list
# Preprocess Documents
documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

topic_words = [[words for words, _ in topic_model.get_topic(topic) if words!=''] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
coherence
