In [1]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementNotInteractableException, StaleElementReferenceException

In [70]:
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK data used by this notebook: the stop-word list, WordNet for
# lemmatisation, and the part-of-speech tagger model needed by nltk.pos_tag.
nltk.download('stopwords')
nltk.download('wordnet')
# The tagger resource id differs between NLTK releases (newer ones look for the
# "_eng" variant), so request both; an id the index does not know is reported
# by nltk.download without raising.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Shared lemmatizer instance used by process_word().
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\seant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\seant\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [60]:
def get_ngrams(article):
    """Return the 2- and 3-word n-grams of ``article`` with English stop words removed.

    The stop words are configured on the ``CountVectorizer`` itself rather than
    passed to the analyzer callable: the analyzer's documented signature takes
    only the document, so supplying ``stop_words`` at call time depends on an
    implementation detail of specific scikit-learn versions.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    list of str
        Space-joined bigrams and trigrams produced by the analyzer.
    """
    # NOTE(review): scikit-learn may warn that some NLTK stop words (e.g.
    # contractions) are inconsistent with its tokenizer; confirm whether the
    # warning is acceptable or should be filtered.
    analyzer = CountVectorizer(
        ngram_range=(2, 3),
        stop_words=stopwords.words('english'),
    ).build_analyzer()
    return analyzer(article)

In [62]:
def process_word(word, pos):
    """Lemmatize the lowercased ``word`` using its part-of-speech tag.

    The first letter of ``pos`` is mapped to a WordNet part of speech
    (J -> adjective, V -> verb, N -> noun, R -> adverb). Tags starting with any
    other letter fall back to the lemmatizer's default part of speech.
    """
    wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    tag_prefix = pos[0]
    lowered = word.lower()
    if tag_prefix not in wordnet_pos_by_prefix:
        return lemmatizer.lemmatize(lowered)
    return lemmatizer.lemmatize(lowered, wordnet_pos_by_prefix[tag_prefix])

In [63]:
def lemmatize(article):
    """Tokenize ``article``, drop English stop words and lemmatize the rest.

    Words are extracted with the ``\\w+`` pattern, tagged with ``nltk.pos_tag``
    and passed (word, tag) to ``process_word``. A token is skipped when its
    lowercase form is an English stop word.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    list
        One ``process_word`` result per retained token, in document order.
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    words = re.findall(r"\w+", article)
    pos_tokens = nltk.pos_tag(words)
    # Load the stop-word list once instead of once per token, and use a set so
    # each membership test is constant time.
    english_stopwords = frozenset(stopwords.words('english'))
    return [
        process_word(word, tag)
        for word, tag in pos_tokens
        if word.lower() not in english_stopwords
    ]

In [64]:
def lowercase(words):
    """Return a new list holding the lowercase form of each item in ``words``."""
    lowered = []
    for word in words:
        lowered.append(word.lower())
    return lowered

In [65]:
def remove_stopwords(words):
    """Return the items of ``words`` that are not English stop words.

    The comparison is case-sensitive, so callers should lowercase first.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The retained tokens, in their original order.
    """
    # Load the stop-word list once (not once per word) and use a set so each
    # membership test is constant time.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

In [66]:
def clean_articles(article):
    """Turn raw article text into a flat list of tokens.

    The result is the lemmatized, lowercased, stop-word-filtered single words
    followed by the n-grams returned by ``get_ngrams``.
    """
    phrase_tokens = get_ngrams(article)
    word_tokens = remove_stopwords(lowercase(lemmatize(article)))
    return word_tokens + phrase_tokens

In [50]:
# Grab the body text of the first URL as a sample article.
# NOTE: this cell needs `driver` and `url_lst`, which are only created in the
# scraping cell further down (and that cell quits the driver when it finishes),
# so it cannot run from a fresh kernel in top-to-bottom order.
driver.get(url_lst[0])
# find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name, which
# was removed from Selenium 4.
body = driver.find_elements(By.CLASS_NAME, 'body__content')
test_article = body[0].text

In [72]:
# Press releases to scrape.
url_lst = [
    'https://www.nasdaq.com/press-release/amazon-launches-kindle-vella-serialized-stories-in-a-mobile-first-interactive-reading',
    'https://www.nasdaq.com/press-release/amazon-teams-up-with-keke-palmer-to-publish-new-short-story-series-based-on-her',
    'https://www.nasdaq.com/press-release/shop-now-save-now-amazons-holiday-dash-event-starts-today-with-black-friday-worthy',
    'https://www.nasdaq.com/press-release/amazon-announces-first-fulfillment-center-and-second-delivery-station-in-little-rock',
    'https://www.nasdaq.com/press-release/these-three-stocks-will-be-among-top-esports-winners-2019-04-21'
]

# Maps each page's header text to its cleaned token list.
articles = {}

# NOTE: machine-specific chromedriver location.
driver = webdriver.Chrome('C:/Users/seant/OneDrive/Desktop/chromedriver_win32/chromedriver.exe')

try:
    for url in url_lst:
        driver.get(url)
        # find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
        # which was removed from Selenium 4.
        body = driver.find_elements(By.CLASS_NAME, 'body__content')
        header = driver.find_elements(By.CLASS_NAME, 'press-release-header__content')
        # Indexing [0] raises IndexError when a page has no matching element.
        cleaned_article = clean_articles(body[0].text)
        articles[header[0].text] = cleaned_article
finally:
    # Always close the browser, even if a page fails to load or parse.
    driver.quit()

In [74]:
# One column per article, keyed by the scraped header text. Wrapping each token
# list in a Series lets lists of different lengths share one frame; shorter
# columns are padded with NaN.
article_df = pd.DataFrame({title: pd.Series(tokens) for title, tokens in articles.items()})

In [75]:
# Persist the cleaned tokens to disk (the row index is written as the first column).
article_df.to_csv('C:/Users/seant/stock_analyzer/capstone_3/data/cleaned_articles.csv')

In [44]:
# Reload the saved tokens. The CSV was written with its row index as the first
# column, so read that column back as the index instead of as an extra
# "Unnamed: 0" data column.
article_df_copy = pd.read_csv('C:/Users/seant/stock_analyzer/capstone_3/data/cleaned_articles.csv', index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/seant/stock_analyzer/capstone_3/data/basic_article_csv.csv'

In [71]:
# Clean the sample article (`test_article` is set in the single-article scrape
# cell) and count how often each word / n-gram occurs.
cleaned_test = clean_articles(test_article)
# most_common() with no argument returns every (token, count) pair, ordered
# from most to least frequent.
counts = Counter(cleaned_test).most_common()
counts

[('kindle', 29),
 ('story', 22),
 ('amazon', 20),
 ('vella', 18),
 ('kindle vella', 18),
 ('reader', 15),
 ('com', 13),
 ('author', 12),
 ('episode', 10),
 ('amazon com', 10),
 ('new', 8),
 ('publish', 8),
 ('publishing', 8),
 ('thousand', 6),
 ('available', 6),
 ('first', 5),
 ('time', 5),
 ('direct', 5),
 ('kindle direct', 5),
 ('direct publishing', 5),
 ('kindle direct publishing', 5),
 ('include', 4),
 ('io', 4),
 ('app', 4),
 ('today', 4),
 ('share', 4),
 ('say', 4),
 ('work', 4),
 ('earth', 4),
 ('kindle ios', 4),
 ('ios app', 4),
 ('vella stories', 4),
 ('kindle ios app', 4),
 ('kindle vella stories', 4),
 ('audrey', 3),
 ('carlan', 3),
 ('free', 3),
 ('via', 3),
 ('mobile', 3),
 ('experience', 3),
 ('u', 3),
 ('every', 3),
 ('use', 3),
 ('follow', 3),
 ('read', 3),
 ('write', 3),
 ('www', 3),
 ('kdp', 3),
 ('customer', 3),
 ('audrey carlan', 3),
 ('app amazon', 3),
 ('mobile first', 3),
 ('com kindle', 3),
 ('publishing kindle', 3),
 ('ios app amazon', 3),
 ('app amazon com', 3

In [78]:
# Show the column at position 4 (the fifth article) of the token table.
article_df.iloc[:, 4]

0          houston
1               tx
2       accesswire
3         february
4               21
           ...    
6546           NaN
6547           NaN
6548           NaN
6549           NaN
6550           NaN
Name: These Three Stocks Will be Among the Top eSports Winners\nPUBLISHED\nFEB 21, 2019 9:20AM EST, Length: 6551, dtype: object