In [1]:
import pandas as pd
import os

In [2]:
os.chdir('..')

In [3]:
articles_df = pd.read_csv("E:\Zemoso Personal\ParseHub Datasets\Final_NLP_Dataset.csv")

In [4]:
import re

In [5]:
# Remove punctuation
articles_df['article_text_processed'] = \
articles_df['Articles'].map(lambda x: re.sub('[,\.!?]', '', x))

In [6]:
# Convert the titles to lowercase
articles_df['article_text_processed']  = \
articles_df['article_text_processed'].map(lambda x: x.lower())

In [7]:
articles_df['article_text_processed'].head()

0    mark mcqueen thinks it’s a great time to have ...
1    tech-enabled fulfillment company darwynn close...
2    introhive cuts 16 percent of workforce citing ...
3    blume secures $2 million cad to scale latte re...
4    why now is the time for canadian startups to h...
Name: article_text_processed, dtype: object

In [8]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use','said','world','using','ask_hn','show_hn','new','first','high','could'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

In [10]:
data = articles_df.article_text_processed.values.tolist()
data_words = list(sent_to_words(data))

In [11]:
# remove stop words
data_words = remove_stopwords(data_words)

In [12]:
print(data_words[:1][0][:30])

['mark', 'mcqueen', 'thinks', 'great', 'time', 'chequebookwe', 'quite', 'vcs', 'guests', 'podcast', 'recently', 'reason', 'like', 'hearing', 'say', 'look', 'companies', 'today', 'reminded', 'like', 'bleak', 'summer', 'play', 'investor', 'entrepreneur', 'sometimes', 'create', 'perception', 'vcs', 'telling']


In [13]:
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

print (data_bigrams_trigrams)



In [14]:
pip install --upgrade gensim

Note: you may need to restart the kernel to use updated packages.


In [15]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel
from gensim import corpora

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
print (corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)
low_value = 0.03
words  = []
words_missing_in_tfidf = []
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 3), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1)]


In [16]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus[:-1],
                                           id2word=id2word,
                                           num_topics=17,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [17]:
test_doc = corpus[-1]

vector = lda_model[test_doc]
print (vector)

def Sort(sub_li):
    sub_li.sort(key = lambda x: x[1])
    sub_li.reverse()
    return (sub_li)
new_vector = Sort(vector)
print (new_vector)

[(0, 0.035575423), (1, 0.026607439), (2, 0.09079938), (3, 0.017799096), (4, 0.047756318), (6, 0.15946287), (7, 0.015679521), (8, 0.04499635), (9, 0.16938733), (10, 0.023395274), (11, 0.0526391), (12, 0.18311574), (13, 0.021976337), (15, 0.013719969), (16, 0.08758917)]
[(12, 0.18311574), (9, 0.16938733), (6, 0.15946287), (2, 0.09079938), (16, 0.08758917), (11, 0.0526391), (4, 0.047756318), (8, 0.04499635), (0, 0.035575423), (1, 0.026607439), (10, 0.023395274), (13, 0.021976337), (3, 0.017799096), (7, 0.015679521), (15, 0.013719969)]


In [18]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

In [19]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
