In [2]:
# web scraping imports
from CNN import *
from Reuters import *
from SeekingAlpha import *

# install nltk, selenium, genism and bs4
# install chromedriver and add to path

# ntlk imports
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# prettyprint
import pprint

# genism imports
from gensim import corpora,models
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS

stopwords = []  # user defined stopwords

def process_article(text):
# for single article (text string), tokenize and lemmatize data, return list of word stems
    sents = sent_tokenize(text)
    sents = '. '.join([s.strip().replace("\n", "") for s in sents])
    words = [word for word in sents.lower().split()
             if word not in STOPWORDS and word.isalnum() and word not in stopwords]
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(i) for i in words]
    p_stemmer = PorterStemmer()
    words = [p_stemmer.stem(i) for i in words]
    return words


def get_corpus(text):
# for single article (text string), tokenize data and return corpus (list of lists)
    a_list = [process_article(text)]
    dictionary = corpora.Dictionary(a_list)
    return [dictionary.doc2bow(a) for a in a_list]


def tf_idf(corpus):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return corpus_tfidf


def get_lda(news_list, num_topics, num_passes):
# for list of articles ([date,text string] list), generate lda model
    texts = [process_article(a[1]) for a in news_list]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(a) for a in texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = LdaModel(corpus_tfidf,  # list of lists containing tuples (word index, word freq)
                   id2word=dictionary,  # change nums back to words
                   num_topics=num_topics,  # need to set num topics
                   passes=num_passes)
    return lda,corpus_tfidf
 

def print_lda(lda, num_words=8):
# prints lda model coefficients, user can specify number of words to include for each topic
    pp = pprint.PrettyPrinter(indent=4)
    # create prettyprint obj, 8 words for each topic
    pp.pprint(lda.print_topics(num_words=8))


def get_topic(article_number):
# for a single article (text string), given the order of the article on the loaded news list, returns list of relevant topics ordered by likelihood
    from operator import itemgetter
    #single_corpus = get_corpus(article) 
#     lda.get_document_topics(new_a[0],minimum_probability=0.05,per_word_topics=False)
    return sorted(lda.get_document_topics(corpus_tfidf[article_number],minimum_probability=0, per_word_topics=False),
                  key=itemgetter(1), reverse=True)





In [3]:
import os
import datetime

def get_news(ticker,days):
# retreives news from websites, may take a few minutes
    get_cnn(ticker,days)
    get_reuters(ticker,days)
    get_seekingalpha(ticker,days)

def get_date(filename,current_dir):
# for given filename and relative directory, return date (datetime format)
    try:
        date = datetime.datetime.strptime(filename[:19],'%Y-%d-%m_%H.%M.%S')
        return date
    except:
        pass
    
def get_file(filename,current_dir):
# for given filename and relative directory, return file content (text string)
    date = get_date(filename,current_dir)
    if date:
        with open(current_dir+"/"+filename,"r") as f:
            text = f.read()
        return text

def load_news(ticker,days=3):
# for given ticker, loads news from database and returns a list of [date,articles]
# default parameter: load articles from recent three days
    l = []
    for news_source in os.listdir("news/"+ticker):
        current_dir = "news/"+ticker+"/"+news_source
        for doc in os.listdir(current_dir):
            date = get_date(doc,current_dir) 
            if date and date>=(datetime.datetime.now()-datetime.timedelta(days=days)):
                l.append([date,get_file(doc,current_dir)])
    return l

In [8]:
def load_dict():
    import os
    def get_dict_words(dict_dir):
        with open(dict_dir,'r') as f:
            words = []
            for line in f:
                words.append(line.replace('\n','').lower())
        return words
    
    neg = get_dict_words(os.getcwd() + '\\dictionaries\\negative.txt')
    pos = get_dict_words(os.getcwd() + '\\dictionaries\\positive.txt')
    return neg,pos

def emotion_analysis(text):
    neg,pos = load_dict()
    from nltk import word_tokenize
    results = dict()
    c1 = c2 = 0
    for word in word_tokenize(text):
        word = word.lower()
        if word in neg:
            c1 += 1
        if word in pos:
            c2 += 1
    n = len(word_tokenize(text))
    results['negative'] = c1/n
    results['positive'] = c2/n
    return results

def emotion_analyzer(text_list):
    import pandas as pd
    df = pd.DataFrame(columns=['negative','positive'])
    count = 1
    for text in text_list:
        a = emotion_analysis(text[1])
        df.loc[count] = [a['negative'],a['positive']]
        count += 1
    return df

In [6]:
get_news("fb",5)

In [5]:
news = load_news("fb")

# print lda model
lda,corpus_tfidf = get_lda(news, 2, 10)
print_lda(lda)

# get topic for single document
# a = "'A version of this article first appeared in the Reliable Sources newsletter. You can sign up for free right here.   This is an incredibly difficult time for Alex Trebek, his family members, and the extended Jeopardy! family that spans the globe. Trebek showed tremendous courage by recording a candid video message to fans about his stage 4 pancreatic cancer diagnosis. He even managed to work in a joke about being under contract for three more years. Trebek was diagnosed earlier this week, and his video was released on Wednesday afternoon.  In a time that is all about what is keeping us apart, we got tough news today about someone who has always brought America together, literally for decades, CNN\'s Chris Cuomo said Wednesday night. I don\'t care what your race, color, creed, gender, or bank account level, you\'ve watched Jeopardy. Since 1984 Alex Trebek has been the smartest guy in our living rooms, teaching us, but more importantly, bringing us together. Trebek\'s show puts facts first, Cuomo said, and we need him, now mo"
# print(get_topic(a))


[   (   0,
        '0.003*"data" + 0.003*"commiss" + 0.003*"european" + 0.003*"follow" + '
        '0.003*"9" + 0.003*"fb" + 0.003*"estim" + 0.003*"user"'),
    (   1,
        '0.003*"snapchat" + 0.002*"c" + 0.002*"qualcomm" + 0.002*"market" + '
        '0.002*"snap" + 0.002*"platform" + 0.002*"chip" + 0.002*"center"')]


In [9]:
emotion_analyzer(news)

Unnamed: 0,negative,positive
1,0.010417,0.019097
2,0.008392,0.008392
3,0.009636,0.014989
4,0.014286,0.008571
5,0.025,0.0
6,0.011905,0.0
