In [1]:
# Importing required base libraries
import pandas as pd
import re
import numpy as np

# Importing required NLP libraries
import gensim
import spacy
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from spacy.tokenizer import Tokenizer


In [3]:
# Load and check datasets format
df = pd.read_csv('get_follower_data.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,hours,mins,secs,text
0,0,16,30,36,How to UNPIVOT multiple columns into tidy pair...
1,1,16,0,4,"Shortest Path Algorithms, Part 2: Floyd�Warsha..."
2,2,15,8,1,Code to run SQL queries 10 times faster than B...
3,3,14,41,43,New to Data Visualization? Start with New York...
4,4,4,57,47,I wrote a Colab notebook that introduces diffe...


In [4]:
# Clean the text column (striping html with white space)
df['text'] = df['text'].apply(lambda x: x[0:-1].replace('\n\n', ' '))


In [5]:
# Defining a function to clean texts from emojies
def clear_emoji(text):
    '''
    Extracting emojies from texts
    '''
    emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Applying the function to text column in the dataset
df['text'] = df['text'].apply(clear_emoji)


In [12]:
# Using Spacy's large dictionary to extract the most important models
nlp = spacy.load("en_core_web_lg")

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Defining extra stop words
STOP_WORDS = nlp.Defaults.stop_words.union([
    '&amp;', "don't", "i'm", "i've", "it’s", "&gt;", "i’m", 'de',
    'la', 'que', 'un', '=', 'con', 'y', 'like', "you're", 'en', 'el',
    'thank', '+', "don’t", "it's", "⦁", "we're", 'd…', 'los', 'fucking',
    'para', 'del', "here's", "can't", "aren’t"
])

# Tokenizing the tweets
tokens = []

for doc in tokenizer.pipe(df['text'], batch_size=500):
    """
    Update those tokens w/o stopwords
    """

    doc_tokens = []

    for token in doc:
        if (token.text.lower() not in STOP_WORDS)
        & (token.is_punct == False)
        & (token.is_space == False):
            doc_tokens.append(token.text.lower())

    tokens.append(doc_tokens)

df['tokens'] = tokens


In [14]:
# Getting rid of noise, keeping the most useful words
id2word = corpora.Dictionary(df['tokens'])


In [15]:
# Creating a corpus of words to preform the topic modeling 
corpus = [id2word.doc2bow(text) for text in df['tokens']]


In [1]:
# Initiate topic modeling model
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   random_state=723812,
                   num_topics=15,
                   passes=10,
                   workers=8)

lda.print_topics()


NameError: name 'LdaMulticore' is not defined

In [18]:
# Clearing the formating to show words
words = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]


In [19]:
# Filter number of the topics
topics = [' '.join(t[0:5]) for t in words]


In [20]:
# Print the topics
for id, t in enumerate(topics):
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
new good data learn learning

------ Topic 1 ------
data today new years going

------ Topic 2 ------
people new data free great

------ Topic 3 ------
need data #ai new use

------ Topic 4 ------
day place world r learning

------ Topic 5 ------
love ibm | new good

------ Topic 6 ------
new need covid-19 want learning

------ Topic 7 ------
new data use want great

------ Topic 8 ------
new work data people thanks

------ Topic 9 ------
people think know new time

------ Topic 10 ------
love code people day heard

------ Topic 11 ------
best great way i’ve new

------ Topic 12 ------
game working days people coronavirus

------ Topic 13 ------
time learning #ai data people

------ Topic 14 ------
people data know time ⠀

