## Importing dependencies

In [None]:
import pandas as pd
import gensim
import multiprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

## Preprocessing

In [None]:
# Load the scraped tweets.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of final_tweets.csv before running.
df = pd.read_csv('C:/Users/Rachele/PycharmProjects/topicmodeling/final_tweets.csv', encoding="utf-8")
# Drop missing rows and coerce every entry to str: pandas reads empty cells as
# float NaN, and a float would break the str.replace() call in the language filter.
list_of_tweets = df['tweets'].dropna().astype(str).tolist()

## Removing non-English tweets from dataset

In [None]:
import fasttext as ft

# Load the pre-trained fastText language-identification model.
ft_model = ft.load_model("C:/Users/Rachele/Downloads/lid.176.bin")

# Keep only the tweets whose predicted label tuple is exactly ('__label__en',).
english_tweets = []
for raw_tweet in list_of_tweets:
    # Newlines are replaced by spaces so that fasttext does not throw an error.
    tweet = raw_tweet.replace('\n', " ")
    predicted_labels = ft_model.predict(tweet)[0]
    if predicted_labels == ('__label__en',):
        english_tweets.append(tweet)

## Data cleaning, tokenization, and stop-word removal

In [None]:
import re
import string
import emoji
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

# function to remove graphical emojis
def give_emoji_free_text(self, text=None):
    """Return ``text`` without the whitespace-separated words that contain an emoji.

    Parameters
    ----------
    self : object or str
        Leftover first parameter (this is a plain function, not a method). Its
        value is ignored whenever ``text`` is supplied. If ``text`` is omitted,
        this argument is treated as the text, so ``give_emoji_free_text("hi")``
        works as well as ``give_emoji_free_text(None, "hi")``.
    text : str, optional
        The text to clean.

    Returns
    -------
    str
        The surviving words joined by single spaces. The text is split on
        whitespace, so the original spacing is not preserved.
    """
    if text is None:
        text = self
    # emoji.UNICODE_EMOJI is gone from current releases of the emoji package;
    # EMOJI_DATA is the mapping keyed by emoji strings that replaces it.
    emoji_data = emoji.EMOJI_DATA
    # A word is dropped as soon as any one of its characters is a known emoji.
    return ' '.join(
        word for word in text.split()
        if not any(char in emoji_data for char in word)
    )

# function to remove urls, mentions, hashtags and punctuation, lowercasing, tokenizing and stopwording
def text_cleaning(text, min_token_length=4):
    """Clean, tokenize and stop-word-filter a collection of tweets.

    Each tweet is stripped of URLs, @mentions, hashtags, emojis, ASCII
    punctuation and digits, and is lowercased; runs of three or more identical
    letters are collapsed to one. The cleaned string is then tokenized with
    ``TweetTokenizer``; short tokens and English stop words are discarded.

    Parameters
    ----------
    text : iterable of str
        The raw tweets to process.
    min_token_length : int, default 4
        Tokens with fewer characters than this are dropped. The default keeps
        the original filter (``len(token) > 3``).

    Returns
    -------
    list of list of str
        One token list per tweet, in input order. Tweets left with no tokens
        are omitted, so the result may be shorter than the input.

    Notes
    -----
    As a side effect the module-level ``clean_tweets`` list is rebound to the
    cleaned (but not yet tokenized) strings, one per input tweet.
    """
    global clean_tweets
    # Built once instead of once per tweet.
    punctuation_table = str.maketrans('', '', string.punctuation)

    clean_tweets = []
    for raw_tweet in text:
        clean_tweet = re.sub(r'(?:\@|https?\://)\S+', '', raw_tweet).lower()  # remove urls and mentions, lowercase
        clean_tweet = re.sub('#[A-Za-z0-9_]+', '', clean_tweet)  # remove hashtags
        clean_tweet = re.sub(r'([A-Za-z])\1{2,}', r'\1', clean_tweet)  # collapse letters repeated 3+ times to one
        clean_tweet = emoji.replace_emoji(clean_tweet)  # remove emojis
        clean_tweet = clean_tweet.translate(punctuation_table)  # remove punctuation
        clean_tweet = re.sub('([0-9]+)', '', clean_tweet)  # remove numbers
        clean_tweets.append(clean_tweet)

    tokenizer = TweetTokenizer()
    stop_words = set(stopwords.words("english"))
    tweets = []
    for clean_tweet in clean_tweets:
        kept_tokens = [
            token for token in tokenizer.tokenize(clean_tweet)
            if len(token) >= min_token_length and token not in stop_words
        ]
        # Tweets that end up empty carry no information for the topic model.
        if kept_tokens:
            tweets.append(kept_tokens)
    return tweets

# Run the cleaning pipeline on the English-only tweets; the result is a list of token lists.
tweets = text_cleaning(english_tweets)

In [None]:
# Preview the first ten tokenized tweets as a sanity check.
print(tweets[:10])

## Splitting dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tokenized tweets. A fixed random_state makes the shuffle,
# and everything trained on `train` afterwards, reproducible across kernel restarts.
train, test = train_test_split(tweets, test_size=0.2, random_state=42)

In [None]:
# Preview the first ten training documents.
print(train[:10])

## TF-IDF

In [None]:
from gensim import models
from gensim.models import TfidfModel
from gensim import corpora
from gensim.corpora import Dictionary

# create dictionary: one document per training tweet, so that the dictionary's
# document count and per-token document frequencies describe the real corpus
# (a single flattened list would be registered as one giant document)
dictionary = corpora.Dictionary(train)

# flat list of every training token, kept for inspection in the next cell
corpus = [w for tweet in train for w in tweet]

# convert each tweet into a bag of words: a list of (token_id, count) pairs
bow = [dictionary.doc2bow(tweet) for tweet in train]

# fit the tf-idf weights on the bag-of-words corpus and wrap it as a tf-idf corpus
tfidf = models.TfidfModel(bow, id2word=dictionary)
corpus_tfidf = tfidf[bow]

In [None]:
# Spot-check a slice of the flattened training tokens.
print(corpus[300:500])

## Finding number of topics for highest coherence score

In [None]:
# compute coherence score
from gensim.models.coherencemodel import CoherenceModel

# Train one LDA model per candidate topic count (2..20) and record its c_v coherence.
# A fixed random_state keeps the scores comparable between runs, so the chosen
# number of topics does not change every time the notebook is re-executed.
coherence_scores = {}
for i in range(2, 21):
    lda_model = models.ldamodel.LdaModel(corpus_tfidf,
                                         num_topics=i,
                                         id2word=dictionary,
                                         update_every=1,
                                         chunksize=100,
                                         passes=2,
                                         alpha='auto',
                                         random_state=42)
    coherence_model = CoherenceModel(model=lda_model,
                                     texts=train,
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores[i] = coherence_score
    print('Coherence score with {} clusters: {}'.format(i, coherence_score))

# Report the winner explicitly instead of leaving it to be read off the log.
best_num_topics = max(coherence_scores, key=coherence_scores.get)
print('Highest coherence score with {} clusters: {}'.format(best_num_topics, coherence_scores[best_num_topics]))

## Creating model

In [None]:
# training lda gensim model with tf-idf
from gensim import models
from gensim.models.ldamodel import LdaModel

# Final model. random_state is fixed so the topics (and the scores and
# visualization computed from them) are the same on every run.
lda_model = models.ldamodel.LdaModel(corpus_tfidf,
                                     id2word=dictionary,
                                     num_topics=14, # num of topics with highest coherence score
                                     update_every=1,
                                     chunksize=100,
                                     passes=6,
                                     alpha='auto',
                                     random_state=42)

### Retrieving topics with top words

In [None]:
# print the 15 words with the strongest association to each derived topic
for topic_num, words in lda_model.print_topics(num_words=15):
    print('Words in {}: {}.'.format(topic_num, words))

### Computing coherence and perplexity scores

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# compute the c_v coherence score of the final model against the training tweets
coherence_model_lda = CoherenceModel(model=lda_model, texts=train, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# log_perplexity() returns the per-word likelihood bound (a log-scale value), not
# the perplexity itself; gensim defines the perplexity as 2 ** (-bound).
perplexity_bound = lda_model.log_perplexity(corpus_tfidf)
perplexity_lda = 2 ** (-perplexity_bound)
print('\nPer-word likelihood bound: ', perplexity_bound)
print('Perplexity: ', perplexity_lda)

## Saving model

In [None]:
from gensim.test.utils import datapath

# saving model to disk.
# The path is used directly: gensim.test.utils.datapath() is a helper for locating
# files in gensim's bundled test-data folder and is not meant for user files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
temp_file = "C:/Users/Rachele/Documents/GAlda_topics.model"

lda_model.save(temp_file)

## Loading model

In [None]:
from gensim import models
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath

# loading the previously saved model from disk.
# The path is used directly: gensim.test.utils.datapath() is a helper for locating
# files in gensim's bundled test-data folder and is not meant for user files.
temp_file = "C:/Users/Rachele/Documents/GAlda_topics.model"

lda_model = LdaModel.load(temp_file)

## Model visualization

In [None]:
import pyLDAvis.gensim_models

# Switch pyLDAvis to its notebook display mode.
pyLDAvis.enable_notebook()
# Prepare the visualization from the trained model, the tf-idf corpus and the dictionary.
p = pyLDAvis.gensim_models.prepare(lda_model, corpus_tfidf, dictionary)
# A bare expression on the last line of the cell is what the notebook displays.
p