In [56]:
import pandas as pd 
import re
import json
from nltk.tag import pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
import gensim
from gensim import corpora, models
import math
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
p_stemmer = PorterStemmer()

In [72]:
df = pd.read_csv("sample.csv")
df.info()

## FOR TESTING ONLY
# df = df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29047 entries, 0 to 29046
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    29047 non-null  int64 
 1   title         29047 non-null  object
 2   url           29047 non-null  object
 3   crawled_time  29047 non-null  object
 4   date          29047 non-null  object
 5   domain        29047 non-null  object
 6   author        19635 non-null  object
 7   content       29047 non-null  object
 8   topic_area    29047 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.0+ MB


In [73]:
## Remove stop words and tokenise and stem
def clean_content_SLOW(row):
    content = re.sub("[^0-9a-zA-Z\&]+", " ", row['content']).split(" ")
    processed = [stemmer.stem(lemmatizer.lemmatize(x.lower(), pos='v')) for x in content if x not in stopwords and x.strip() != ""]
    return processed


def clean_content_FAST(row):
    content = re.sub("[^a-zA-Z\&]+", " ", row['content']).split(" ")
    processed = [p_stemmer.stem(x.lower()) for x in content if x not in stopwords and x.strip() != ""]
    return processed


df["processed"] = df.apply(lambda row: clean_content_SLOW(row), axis=1)

In [74]:
dictionary = corpora.Dictionary(df["processed"])
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
print (dictionary)

Dictionary(17086 unique tokens: ['&', '0&placement', '0&userid', '02', '1']...)


In [75]:
bow_corpus = [dictionary.doc2bow(doc) for doc in df["processed"]]
# bow_corpus[69]

# TF-IDF TIME

In [76]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

## Print this if you want see 
# for doc in corpus_tfidf:
#     print (doc)

In [77]:
lda_model = models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [78]:
for idx, topic in lda_model.print_topics(-2):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"trump" + 0.003*"test" + 0.003*"case" + 0.003*"travel" + 0.002*"state" + 0.002*"countri" + 0.002*"flight" + 0.002*"presid" + 0.002*"china" + 0.002*"offici"
Topic: 1 Word: 0.004*"oil" + 0.004*"0" + 0.003*"bank" + 0.003*"market" + 0.003*"price" + 0.003*"1" + 0.003*"stock" + 0.002*"million" + 0.002*"2019" + 0.002*"quarter"
Topic: 2 Word: 0.002*"gm" + 0.002*"volkswagen" + 0.002*"china" + 0.002*"automak" + 0.002*"chines" + 0.002*"plant" + 0.001*"wuhan" + 0.001*"tesla" + 0.001*"car" + 0.001*"ford"
Topic: 3 Word: 0.003*"china" + 0.002*"chines" + 0.002*"de" + 0.002*"case" + 0.002*"wuhan" + 0.001*"que" + 0.001*"flight" + 0.001*"infect" + 0.001*"passeng" + 0.001*"airlin"
Topic: 4 Word: 0.002*"test" + 0.002*"patient" + 0.002*"symptom" + 0.002*"covid" + 0.002*"care" + 0.002*"i" + 0.002*"worker" + 0.002*"mask" + 0.002*"diseas" + 0.002*"infect"
Topic: 5 Word: 0.003*"i" + 0.003*"leagu" + 0.003*"uk" + 0.002*"newspap" + 0.002*"game" + 0.002*"player" + 0.002*"school" + 0.002*"club" 