In [1]:
import pandas as pd
import os
import spacy
import re

In [2]:
os.chdir('..')

In [3]:
EnTech_df = pd.read_csv("E:\\Zemoso Personal\\NLP_Final\\CleanTech\\NextEra.csv")

In [4]:
EnTech_df['Articles']=EnTech_df['Articles'].apply(str)

In [5]:
# Remove punctuation
EnTech_df['article_text_processed'] = \
EnTech_df['Articles'].map(lambda x: re.sub('[,\.!?]', '', x))

In [7]:
# Convert the titles to lowercase
EnTech_df['article_text_processed'] = \
EnTech_df['article_text_processed'].map(lambda x: x.lower())

In [8]:
EnTech_df['article_text_processed'].head()

0              e n v i r o n m e n t a l   s o c i a l
1              a n d  g o v e r n a n c e  r e p o r t
2    e n v i r o n m e n t a l   s o c i a l  a n d...
3                                        c on t en t s
4    cover photo: the wheatridge renewable energy f...
Name: article_text_processed, dtype: object

In [9]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use','said','world','using','ask_hn','show_hn','new','first','high','could','development','company','patient','study','development','diseases','include','disease','cell','development','medicine','year','research','patient','include','disease','study','forward look', 'company','statement','treatment','cell','medicine','development','other','datum','vertex','research','therapy','clinical','mutation','potential','people','year','phase','age'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

In [11]:
data = EnTech_df.article_text_processed.values.tolist()
data_words = list(sent_to_words(data))

In [12]:
# remove stop words
data_words = remove_stopwords(data_words)

In [13]:
print(data_words[:1][0][:30])

[]


In [14]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)


lemmatized_texts = lemmatization(data)
print (lemmatized_texts[0][0:90])



n v r o n m n t l s c i l


In [15]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(lemmatized_texts)

print (data_words[0][0:20])

[]


In [16]:
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=25)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

print (data_bigrams_trigrams)

[[], [], [], [], ['wheatridge', 'renewable', 'energy', 'facility', 'go', 'online'], ['combine', 'wind', 'solar', 'battery', 'energy'], ['storage', 'technology', 'location', 'wind', 'component', 'encompass'], ['megawatt', 'wind', 'capacity', 'produce', 'approximately', 'wind_turbine'], ['leave', 'catch', 'quick', 'glimpse', 'employee', 'appear'], ['solar_panel', 'equipment', 'check', 'blythe', 'mccoy', 'energy'], [], [], [], ['content'], ['vision_strategy', 'ision'], ['blueprint_reach_real', 'rom', 'ce'], ['report'], ['stakeholder'], ['confront_climate', 'change'], ['honor', 'professional', 'life', 'lead', 'good', 'team', 'industry'], ['team', 'clear', 'vision', 'help', 'lead', 'decarbonization', 'economy', 'vision'], ['bold', 'even', 'unprecedented', 'consistent', 'decade', 'long', 'record', 'performance'], ['portfolio', 'stakeholder', 'include', 'accomplishment', 'goal', 'relate', 'environmental'], ['social', 'governance', 'share', 'report'], ['build_lead_clean_energy', 'provider'], [

In [16]:
pip install --upgrade gensim




In [17]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel
from gensim import corpora

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
print (corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)
low_value = 0.03
words  = []
words_missing_in_tfidf = []
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow

[]


In [18]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus[:-1],
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


In [19]:
test_doc = corpus[-1]

vector = lda_model[test_doc]
print (vector)

def Sort(sub_li):
    sub_li.sort(key = lambda x: x[1])
    sub_li.reverse()
    return (sub_li)
new_vector = Sort(vector)
print (new_vector)

[(0, 0.06363138), (1, 0.091126606), (2, 0.09129958), (3, 0.058276303), (4, 0.12281322), (5, 0.14445771), (6, 0.079572), (7, 0.13410304), (8, 0.109997585), (9, 0.104722574)]
[(5, 0.14445771), (7, 0.13410304), (4, 0.12281322), (8, 0.109997585), (9, 0.104722574), (2, 0.09129958), (1, 0.091126606), (6, 0.079572), (0, 0.06363138), (3, 0.058276303)]


In [20]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

In [21]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
