In [1]:
import pandas as pd
import os
import spacy
import re

In [2]:
os.chdir('..')

In [3]:
BioTech_df = pd.read_csv('E:\\Zemoso Personal\\NLP_Final\\BioTech\\Biotech_Corpora.csv')

In [4]:
BioTech_df['Articles']=BioTech_df['Articles'].apply(str)

In [5]:
# Remove punctuation
BioTech_df['article_text_processed'] = \
BioTech_df['Articles'].map(lambda x: re.sub('[,\.!?]', '', x))

In [6]:
# Convert the titles to lowercase
BioTech_df['article_text_processed'] = \
BioTech_df['article_text_processed'].map(lambda x: x.lower())

In [7]:
BioTech_df['article_text_processed'].head()

0    with $122m startup idrx intends to stop cancer...
1    axcella makes progress against long covid as n...
2    abbvie and sosei lead slate of early august r&...
3    kazia’s star asset suffers surprise letdown in...
4    aaic: amyloid-beta oligomers an intranasal ant...
Name: article_text_processed, dtype: object

In [8]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use','said','world','using','ask_hn','show_hn','new','first','high','could','development','company','patient','study','development','diseases','include','disease','cell','development','medicine','year','research','patient','include','disease','study','forward look', 'company','statement','treatment','cell','medicine','development','other','datum','vertex','research','therapy','clinical','mutation','potential','people','year','phase','age'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

In [10]:
data = BioTech_df.article_text_processed.values.tolist()
data_words = list(sent_to_words(data))

In [11]:
# remove stop words
data_words = remove_stopwords(data_words)

In [12]:
print(data_words[:1][0][:30])

['startup', 'idrx', 'intends', 'stop', 'cancer', 'mutations', 'tracks']


In [13]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)


lemmatized_texts = lemmatization(data)
print (lemmatized_texts[0][0:90])



startup idrx intend stop cancer mutation track


In [14]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(lemmatized_texts)

print (data_words[0][0:20])

['startup', 'idrx', 'intend', 'stop', 'cancer', 'mutation', 'track']


In [15]:
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=25)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

print (data_bigrams_trigrams)



In [16]:
pip install --upgrade gensim




In [17]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel
from gensim import corpora

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
print (corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)
low_value = 0.03
words  = []
words_missing_in_tfidf = []
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]


In [18]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus[:-1],
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [19]:
test_doc = corpus[-1]

vector = lda_model[test_doc]
print (vector)

def Sort(sub_li):
    sub_li.sort(key = lambda x: x[1])
    sub_li.reverse()
    return (sub_li)
new_vector = Sort(vector)
print (new_vector)

[(0, 0.16641265), (1, 0.067168586), (2, 0.04866797), (3, 0.2048982), (4, 0.046087854), (5, 0.029047718), (6, 0.09645673), (7, 0.09652922), (8, 0.033004243), (9, 0.2117268)]
[(9, 0.2117268), (3, 0.2048982), (0, 0.16641265), (7, 0.09652922), (6, 0.09645673), (1, 0.067168586), (2, 0.04866797), (4, 0.046087854), (8, 0.033004243), (5, 0.029047718)]


In [20]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

In [21]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
