In [7]:
#Scraper packages
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

#Data Cleaning
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import datetime 
import itertools
import re

#Data Science
from textblob import TextBlob

#Text analysis
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.draw.dispersion import dispersion_plot
from nltk.collocations import *
from nltk.tokenize import sent_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
import pyLDAvis.gensim
import pyLDAvis # Visualize the topics
import gensim.corpora as corpora# Create Dictionary
from pprint import pprint# number of topics
from gensim.models.coherencemodel import CoherenceModel

#OS Packages
import os
from pathlib import Path
import pickle 
from distutils.version import LooseVersion
from collections import Counter

#Suppressing warnings over out of date packages
import warnings
warnings.filterwarnings("ignore")

#Wordcloud
from wordcloud import WordCloud# Join the different processed titles together.

In [5]:
#Modules I had to download onto my machine for this project
#!pip install textblob
#nltk.download('punkt')
#nltk.download('stopwords')
#!pip install pyLDAvis



[nltk_data] Downloading package punkt to /Users/ryanshen/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanshen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting FuzzyTM>=0.4.0 (from gensim->pyLDAvis)
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.1/67.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Downloading simpful-2.11.0-py3-none-any.whl (32 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting miniful (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Downlo

<h1>House Representative Members: Data Scraping and Cleaning</h1>
<h3>Optionally, environmental scorecard data can be pulled from <a href="https://scorecard.lcv.org/members-of-congress">https://scorecard.lcv.org/members-of-congress</a></h3>

In [9]:
def get_house_representatives():
    """Scrape Wikipedia's table of current U.S. House members into a DataFrame.

    A Firefox WebDriver session is opened (geckodriver must be discoverable,
    e.g. on PATH), every row of the element with id ``votingmembers`` is read,
    and the browser is shut down again even if scraping fails.

    Returns:
        pandas.DataFrame: one row per seat with the eight scraped columns.
        The last column is renamed ``Birthdate`` and parsed to datetimes, an
        integer ``Age`` column is added, and ``Assumed office`` is parsed as a
        year. Values that cannot be parsed fall back to 1900-01-01 (dates) or
        0 (age). Vacant seats are moved to the end of the frame and filled
        with ``'No House Member'``.

    Raises:
        ValueError: if the scraped cells cannot be arranged into 8 columns.
        IndexError: if there are fewer district names than cells to fill.
    """
    n_cols = 8
    fallback_date = datetime(1900, 1, 1)

    def custom_date_parser(value): #This function is encapsulated because it's customized for this particular dataframe
        """Parse a 4-digit year; anything unparseable becomes 1900-01-01."""
        try:
            return pd.to_datetime(value, format='%Y')
        except (ValueError, TypeError, OverflowError):
            return pd.Timestamp(fallback_date)

    # --- Scrape the raw cell texts -------------------------------------
    driver = webdriver.Firefox() #Ensuring that we move the geckodriver to the user/bin prior to running
    try:
        driver.get('https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives')
        house_members = driver.find_element(By.XPATH, '//*[@id="votingmembers"]')
        member_info = []
        district_info = []
        for row in house_members.find_elements(By.TAG_NAME, "tr"):
            member_info.extend(cell.text for cell in row.find_elements(By.TAG_NAME, "td"))
            district_info.extend(cell.text for cell in row.find_elements(By.TAG_NAME, "th"))
    finally:
        # quit() ends the whole session, so no browser is leaked on errors.
        driver.quit()

    # District names share the <th> tag with the table header, so the first
    # eight entries are the column names and the rest are the districts.
    col_names, districts = district_info[0:n_cols], district_info[n_cols:]
    col_names[-1] = 'Birthdate'
    col_names[0:2] = col_names[0:2][::-1]

    # Each blank <td> is filled with the next district name. A 'VACANT' cell
    # stands for a whole seat: it is taken out of the stream and re-added at
    # the end as a full placeholder row. Building new lists avoids mutating
    # member_info while iterating over it.
    # NOTE(review): the district is placed under the 'District' header when
    # one exists; otherwise it goes first, as the original code did.
    district_col = col_names.index('District') if 'District' in col_names else 0
    filled_cells = []
    vacant_rows = []
    count = 0
    for cell_text in member_info:
        if cell_text == '':
            filled_cells.append(districts[count])
            count += 1
        elif cell_text == 'VACANT':
            placeholder_row = ['No House Member'] * n_cols
            placeholder_row[district_col] = districts[count]
            vacant_rows.extend(placeholder_row)
            count += 1
        else:
            filled_cells.append(cell_text)
    member_info = filled_cells + vacant_rows

    # One long list -> (n_rows, 8) table.
    house_array = np.array(member_info).reshape(-1, n_cols)
    house_df = pd.DataFrame(house_array, columns=col_names)

    # Split the birthdate cell into a date of birth and an age.
    date_format = '%B %d, %Y'
    birth = []
    ages = []
    for parts in house_df.Birthdate.apply(lambda x: str(x).split(' (')):
        try:
            born = datetime.strptime(parts[0], date_format)
            age = int(''.join(re.findall(r'\d+', parts[1])))
        except (ValueError, IndexError):
            # Parse both values before appending so the lists stay aligned.
            born, age = fallback_date, 0
        birth.append(born)
        ages.append(age)
    house_df['Birthdate'] = birth
    house_df['Age'] = ages

    # Flatten line breaks in the free-text columns.
    for column in ('Prior experience', 'Education'):
        house_df[column] = house_df[column].apply(lambda x: x.replace('\n', ' '))

    # (Converting Member/District/Party to pandas Categorical was left disabled in the original.)
    house_df['Assumed office'] = house_df['Assumed office'].apply(custom_date_parser)
    return house_df

In [None]:
#Optional link to League of Conservation Database and comparing indicators in their census tract

In [None]:
#Create a scraper that scans the headlines for EE News Articles, 
#Conduct LDA and topic modeling, and possibly connects the municipalities mentioned in the text to census facts or their congressmen’s action on energy justice issues
#Getting information regarding voting records

<h1> Scraping EE News for the past year </h1>

In [84]:
# Shared browser session; article_get() and get_text() below read this module-level `driver`.
# NOTE(review): the positional argument is a hard-coded, machine-specific path, and what that
# argument means depends on the installed Selenium version (get_house_representatives above
# calls webdriver.Firefox() with no arguments) — confirm before re-running.
driver = webdriver.Firefox('/Users/ryanshen/Desktop/opt/anaconda3/bin/')

In [90]:
def article_get(pages=range(1, 50)):
    """Scrape headline, URL, author and publication date from Energywire listing pages.

    Uses the module-level ``driver`` (an open Selenium WebDriver session).

    Args:
        pages: iterable of listing-page numbers to visit. Defaults to pages
            1 through 49, which is what the function always did.

    Returns:
        pandas.DataFrame with columns ``headings``, ``url``, ``author`` and
        ``pub_date`` (datetime), one row per article.

    Raises:
        ValueError: if a scraped date does not match ``%m/%d/%Y``.
    """
    heading_names = []
    url_list = [] #We will be collecting a list of urls
    author_dates = []
    for page in pages:
        driver.get('https://www.eenews.net/publication/energywire/page/'+ str(page) +'/')
        # First take all the headings on the landing page.
        heading_names.extend(heading.text for heading in driver.find_elements(By.TAG_NAME, "h4"))
        for link in driver.find_elements(By.TAG_NAME, "a"):
            # Every article has two hyperlinks; keeping only the 'Read More >>' one avoids duplicates.
            if link.text == 'Read More >>':
                url_list.append(link.get_attribute('href'))
        for entry in driver.find_elements(By.TAG_NAME, 'p'):
            entry_text = entry.text  # read once: each .text access is a WebDriver round trip
            # Author/date lines start with BY and separate author from date with '|'.
            if entry_text.startswith('BY') and '|' in entry_text:
                author_dates.append(entry_text)

    # Drop blank headings and duplicates while keeping page order. The previous
    # list(set(...)) shuffled the headings, so they no longer lined up with the
    # URLs, authors and dates collected in page order.
    heading_names = [name for name in dict.fromkeys(heading_names) if name.strip()]

    authors = [entry.split('|')[0][3:-1] for entry in author_dates]
    dates = [entry.split('|')[1][1:-4] for entry in author_dates]

    def str_split(string):
        """Parse the leading ``mm/dd/yyyy`` token of a date string."""
        article = string.split(' ')[0]
        date_format = '%m/%d/%Y'
        return datetime.strptime(article, date_format)

    pub_date = [str_split(date) for date in dates]

    sizes = (len(heading_names), len(url_list), len(authors), len(pub_date))
    if len(set(sizes)) > 1:
        # Printed rather than warned: this notebook silences warnings globally.
        print('article_get: scraped list lengths differ (headings, urls, authors, dates) =',
              sizes, '- truncating to the shortest; rows may be misaligned.')
    rows = list(zip(heading_names, url_list, authors, pub_date))
    articles = pd.DataFrame(rows, columns = ['headings', 'url', 'author', 'pub_date'])
    return articles
    #Create a function that takes the text from every URL

In [91]:
# Run the listing-page scraper; the result has columns headings, url, author and pub_date.
ee_news = article_get()

443 443 444 444
['Oil crackdown emerges as winning strategy for Colo. governor', 'Deep freeze and data concerns test Southeast power market', 'Senate climate deal: Boost or barrier for EVs?', 'EPA rule nods to public demand for EVs', '4 energy issues to watch with EPA’s power plant rule', 'People to watch at DOE, Interior, FERC', 'Miles apart: U.S. and Europe diverge on Chinese EVs', 'A blue state asks: Is carbon capture part of climate agenda?', 'Can N.M. build world’s largest coal CCS project?', 'D.C. Circuit rejects NEPA challenge to Va. pipeline expansion', 'States and clean energy: 3 issues to watch', 'FERC approves power plant rules to fight extreme weather', 'Biden admin defends solar probe, spurs industry outrage', 'White House issues ‘action plan’ to speed up energy reviews', 'Interior weighs economics of oil vs. climate in 5-year plan', 'Is Biden’s 2035 CO2 goal still achievable? What studies say', 'Can the Northeast slash carbon and keep the lights on?', 'Oil showdown: 3 way

In [92]:
# Sanity check: pub_date should have been parsed to a datetime dtype.
print(ee_news.dtypes)

headings            object
url                 object
author              object
pub_date    datetime64[ns]
dtype: object


In [93]:
def get_text(url):
    """Return the article body at ``url`` as a single string.

    Uses the module-level ``driver``. The text of every ``<p>`` element except
    the first is joined with single spaces, so the last word of one paragraph
    is not fused with the first word of the next.
    NOTE(review): the first <p> is skipped as in the original — presumably
    boilerplate; confirm against the page layout.
    """
    driver.get(url)
    paragraphs = driver.find_elements(By.TAG_NAME, 'p')[1:]
    return ' '.join(paragraph.text for paragraph in paragraphs)

# One page load per article: this is the slow step of the notebook.
ee_news['text'] = ee_news['url'].apply(get_text)

In [94]:
#Now we want to convert all cases to lowercase and remove punctuation prior to tokenization
# A raw string avoids the invalid "\." escape; inside a character class the dot needs no escaping.
ee_news['text'] = ee_news['text'].map(lambda x: re.sub(r'[,.!?]', '', x).lower())

# We've scraped all the text we need. quit() ends the whole WebDriver session
# (close() only closes the current window and can leave the driver process running).
driver.quit()

In [95]:
# Cache the scraped frame so we don't have to scrape again.
# The path is relative, so the file is written to the current working directory.
ee_news.to_pickle('ee_news.pkl')

<h1> Performing Topic Modeling on EE News Text </h1>

In [None]:
#If reading this from pickle, run this cell
# Read from the same relative path the scrape was cached to ('ee_news.pkl') instead of a
# machine-specific absolute path. Only unpickle files you created yourself.
ee_news = pd.read_pickle('ee_news.pkl')

In [None]:
# Only the last expression of a cell is rendered, so print the size first and end with the preview.
print(len(ee_news))
ee_news.head()

In [None]:
#Need to run a generator if trying to get author counts over time

<h4> We're now going to prime our dataframe for text analysis using <a href = https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0>LDA </a> </h4>

In [None]:
#Idea: Create heading clusters to see what the most popular topics are. Google topic modeling
# NLTK's English stop words followed by corpus-specific additions (edit the tail to taste).
stop_words = [
    *stopwords.words('english'),
    'from', 'by', 'subject', 'politico', 'photos', '/', 'said', 'would', 'could',
]

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Yields one list of tokens per input item. ``deacc=True`` strips accent
    marks; punctuation is discarded by the tokenizer itself.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )
        
def remove_stopwords(texts):
    """Tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: iterable of documents, each either a raw string or an
            already-tokenized list/tuple of words.

    Returns:
        list[list[str]]: the surviving tokens of each document.
    """
    # Set membership is O(1); snapshot per call so edits to stop_words are honoured.
    stop_set = set(stop_words)
    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Join the tokens instead of str(list): tokenizing the list's repr can
            # turn escape sequences (e.g. "\xa0") into spurious tokens.
            raw_text = ' '.join(map(str, doc))
        else:
            raw_text = str(doc)
        cleaned.append([word for word in simple_preprocess(raw_text) if word not in stop_set])
    return cleaned

# Article texts -> token lists -> token lists without stop words.
data = ee_news['text'].tolist()
data_words = remove_stopwords(list(sent_to_words(data)))

# Preview the first 30 tokens of the first article.
print(data_words[0][:30])

In [None]:
# Keep the stop-word-filtered token lists alongside each article
# (data_words must hold exactly one entry per row of ee_news).
ee_news['tokenized_text'] = data_words

In [None]:
# Create Dictionary: maps every distinct token to an integer id.
id2word = corpora.Dictionary(data_words)
# Create Corpus: the tokenized documents ...
texts = data_words
# ... converted to bag-of-words form, i.e. (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))
# View the first 30 pairs of the first document.
print(corpus[0][:30])

In [None]:
# Build a first LDA model with a provisional number of topics; the ideal number is searched for below.
num_topics = 10
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)  # fixed seed so re-runs give comparable topics
# Print the keywords of each topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

<h4>Now we're going to determine the optimal number of topics by maximizing topic coherence.</h4>

<p>Here's a <a href = 'https://radimrehurek.com/gensim/models/coherencemodel.html'> reference </a> to the gensim api, and here's a <a href = https://tedboy.github.io/nlps/_modules/gensim/models/coherencemodel.html> link </a> to the source code. <a href = https://towardsdatascience.com/understanding-topic-coherence-measures-4aa41339634c>Here's</a> a descriptive explanation of how topic modeling works.</p>

<p>If you're not familiar with how topic modeling works, after confirming the number of topics, we examine our tokens to see if any of them have a high probability of being identified as a group across the entire corpus. (While topic modeling rarely examines relationships between more than two words, it is possible for three or more words to be identified as a relevant group.)

The probability of two or more words being associated is also known as a confirmation measure. 
    
After taking all the confirmation measures between each of the most important words in the topic, the confirmation measures are aggregated. The aggregate confirmation measure is the coherence score for the topic.
</p>


In [None]:
# Try every topic count from 2 through 29 (range's upper bound is exclusive).
# A separate name is used so the integer `num_topics` is not rebound to a range.
candidate_topic_counts = range(2, 30)
coherence_scores = []

for n in candidate_topic_counts:
    lda_model_test = gensim.models.LdaMulticore(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=n,
                                                random_state=42)  # fixed seed: scores are comparable across re-runs
    cm = CoherenceModel(model=lda_model_test, corpus=corpus, coherence='u_mass') #u_mass is the fastest of the options to place within the coherence argument
    coherence = cm.get_coherence()  # get coherence value
    coherence_scores.append(coherence)

In [None]:
# Coherence against number of topics; the dashed red line marks the best-scoring count.
plt.figure(figsize=(10, 6))
plt.plot(range(2,30), coherence_scores, marker='o')
plt.axvline(
    x=coherence_scores.index(np.max(coherence_scores)) + 2,  # +2: scores start at 2 topics
    linestyle='--',
    color='red',
)
plt.gca().set(
    title='Coherence Scores for LDA',
    xlabel='Number of Topics',
    ylabel='Coherence Scores',
)
plt.grid(True)
plt.show()

In [None]:
# Position of the best score, shifted by 2 because the search started at 2 topics.
num_topics = 2 + coherence_scores.index(np.max(coherence_scores))
print(f'The ideal number of topics is {num_topics}')

In [None]:
# NOTE(review): `other_corpus` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError on Restart & Run All. Disabled until its definition is restored.
# other_corpus

In [None]:
pyLDAvis.enable_notebook()

# Absolute path (under the current working directory) of the file the prepared
# pyLDAvis data is pickled to and later read back from.
LDAvis_data_filepath = Path.cwd() / 'energy_news_scraper'
#LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))
# Make sure the directory that will contain the file exists.
LDAvis_data_filepath.parent.mkdir(parents=True, exist_ok=True)
print(type(str(LDAvis_data_filepath)))

In [None]:
dict_tokens = id2word.token2id  # token -> token id
# id2word.dfs is keyed by token id. NOTE(review): it appears to count how many articles
# contain a token, not how many times the token occurs overall — confirm.
token_freq = id2word.dfs

# token -> the frequency recorded for its id
merged = {token: token_freq[token_id] for token, token_id in dict_tokens.items()}
    

In [None]:
# Despite the name, this is a list of (token, value) pairs from `merged`, highest value first.
sorted_dict = sorted(merged.items(), key = lambda x:x[1], reverse = True)

In [None]:
# Show only the top of the ranking; displaying the whole vocabulary floods the notebook output.
sorted_dict[:25]

In [None]:
# Newer pyLDAvis releases expose the gensim helper as `pyLDAvis.gensim_models`;
# older ones call it `pyLDAvis.gensim`. Accept either.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Set to False to reuse the previously pickled visualization data instead of refitting the model.
REBUILD_LDAVIS = True
if REBUILD_LDAVIS:
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=42)  # fixed seed so the saved visualization is repeatable
    # Transform the fitted model and corpus into the structure pyLDAvis renders.
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(str(LDAvis_data_filepath), 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Load the prepared pyLDAvis data from disk. This file is written by this notebook;
# never unpickle a file from an untrusted source.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# save_html does not create missing directories, so make sure ./results exists first.
os.makedirs('./results', exist_ok=True)
pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

Unsurprisingly, each article has a high chance of including words like power, energy, and gas, along with other broad terms related to American domestic energy policy. More topic-specific words appear as you reduce lambda. Based on what I found by setting lambda to 0.2 and 0.08, the topics can be grouped as follows:
1. Riskier Renewable Projects that include Offshore Wind and Hydrogen 
2. Solar Power Generation
3. State Energy Policy concerning EVs

In [None]:
#Train an ensemble LDA model to see if there's better performance
#Source: https://radimrehurek.com/gensim/models/ensemblelda.html

In [None]:
#Identify topic areas of prime interest and develop a time series of certain topic areas
#Create a lexical dispersion plot
#Source: https://github.com/katreparitosh/Discourse-Analytics-of-Political-Speech-Transcripts

In [None]:
#Learn about the collections API https://docs.python.org/3/library/collections.html

In [None]:
#Let's see if we can generate a time series of how often hydrogen is mentioned in ee_news

In [None]:
def create_new_count(*argv):
    """Count keyword mentions per article and per publication date, then plot them.

    For every keyword, a column of that name is added to the module-level
    ``ee_news`` frame holding how often the keyword occurs in each article's
    ``tokenized_text``. A strip plot of those per-article counts is drawn.
    NOTE: a keyword that matches an existing column name overwrites that column.

    Args:
        *argv: keywords (tokens) to count.

    Returns:
        pandas.DataFrame: one row per distinct ``pub_date`` with one column per
        keyword holding that day's total mentions. (Previously this frame was
        built but discarded, and its counts were assigned by row position
        rather than by date.)
    """
    df_all_keywords = pd.DataFrame(ee_news.pub_date.unique(), columns = ['pub_date'])

    def col_counter(list_of_words, search_word):
        """Number of times ``search_word`` appears in ``list_of_words``."""
        return Counter(list_of_words)[search_word]

    for keyword in argv:
        # .map works on the values, so it does not rely on ee_news having a 0..n-1 index.
        ee_news[keyword] = ee_news['tokenized_text'].map(
            lambda tokens: col_counter(tokens, keyword))
        # Look the daily totals up by date: groupby sorts its keys, whereas
        # unique() keeps first-appearance order, so positional assignment would misalign.
        daily_totals = ee_news.groupby('pub_date')[keyword].sum()
        df_all_keywords[keyword] = df_all_keywords['pub_date'].map(daily_totals)

    def unlimited_stripplot(dataframe, *column_names, **kwargs):
        """Strip plot of the values in ``column_names``, one row per column."""
        # Work on just the selected columns
        selected_data = dataframe[list(column_names)]

        # Reshape to long form for plotting
        melted_df = selected_data.melt(var_name='Key Words', value_name='Frequency')

        # Create a strip plot using seaborn
        sns.stripplot(x='Frequency', y='Key Words', data=melted_df, jitter=True, **kwargs)

        plt.xlabel('Frequency')
        plt.ylabel('Key Words')
        plt.title('Articles that mention Topics')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    unlimited_stripplot(ee_news, *argv)
    return df_all_keywords

In [None]:
# Count and plot mentions of three energy keywords; this also adds one column per keyword to ee_news.
create_new_count('solar', 'hydrogen', 'wind') #A latency plot will probably be better here

Here we create a lexical dispersion plot, which shows where chosen words occur as we move from the beginning of the corpus to the end.

In [None]:
# Flatten the per-article token lists into one long token sequence and wrap it in an nltk.Text.
text = list(itertools.chain.from_iterable(ee_news['tokenized_text']))
all_articles_Text = nltk.Text(text)

In [None]:
# Peek at the first ten tokens of the combined corpus.
all_articles_Text[0:10]

In [None]:
# Ask NLTK for words used in contexts similar to 'energy' (the method prints its result).
all_articles_Text.similar('energy')

In [None]:
# NOTE(review): the query word is an empty string, which looks like a placeholder —
# fill in a real word or remove this cell.
all_articles_Text.similar('')

In [None]:
# Have NLTK generate sample text from the corpus.
# NOTE(review): presumably random, so the output will differ between runs — confirm.
all_articles_Text.generate()

In [None]:
# Share of all tokens (stop words already removed) that are the word 'energy', in percent.
100 * all_articles_Text.count('energy') / len(all_articles_Text) #Energy takes up 1.2% of total words used in the corpora

In [None]:
#Getting lexical richness: number of distinct tokens divided by total number of tokens
len(set(all_articles_Text))/len(all_articles_Text)

In [None]:
# Disabled experiment, kept as a string literal so it does not execute.
# NOTE(review): the article text was lower-cased before tokenization, so a capitalised target
# such as "America" can never match; the other targets may simply be rare here — confirm.
'''target_words = ['money','energy']

all_articles_Text.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
#Why is this lexical dispersion plot not graphing any results?
'''

In [None]:
# Show up to 25 collocations (word pairs that frequently occur together) found in the corpus.
all_articles_Text.collocations(num = 25)

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Consider word pairs that occur within a 7-token window of each other.
finder = BigramCollocationFinder.from_words(all_articles_Text, window_size=7)
# Ignore pairs seen fewer than 10 times; PMI otherwise favours very rare pairs.
# (The earlier unfiltered nbest(...) call was dropped: its result was never used.)
finder.apply_freq_filter(10)
# Top 100 remaining pairs ranked by pointwise mutual information.
results = finder.nbest(bigram_measures.pmi, 100)

In [None]:
# Display the 100 highest-PMI word pairs found above.
results #We're getting quite a few names 

In [None]:
#Option, turn every article into its own class
#Inspiration: https://www.geeksforgeeks.org/self-in-python-class/#