In [49]:
import requests
from stocks import *
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import string  
import csv
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
nltk.download('vader_lexicon')
from pprint import pprint

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rohansuresh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/rohansuresh/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [50]:
# description:
#   function to get company name from ticker symbol using the yahoo finance api. 
#   https://stackoverflow.com/questions/38967533/retrieve-company-name-with-ticker-symbol-input-yahoo-or-google-api
# args:
#   symbol - ticker symbol for company
# return:
#   company_name - company name associated with ticker
def get_company_name(symbol):
    url = "http://d.yimg.com/autoc.finance.yahoo.com/autoc?query={}&region=1&lang=en".format(symbol)

    result = requests.get(url).json()

    for x in result['ResultSet']['Result']:
        if x['symbol'] == symbol:
            company_name = x['name']
            return company_name

In [51]:
# description:
#   function to remove stop words and other general words in company name to have more concise reference
#   to company
#   e.g. "The Coca-Cola Company" could also be simply referred to as "Coca-Cola"
# args:
#   company_name - Name of company we want to get permutations of
# return:
#   company_name_perm - Different permutations of company name
def get_cleaned_name(symbol):

    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(symbol) 
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words] 
    
    # words similar to organisation, inc, etc. gathered by going through DOW 30
    remove_words = ["company", "co", "co.", "inc", "inc.", "corporation", "group", "incorporated", "alliance"]
    filtered_sentence = [w for w in filtered_sentence if not w.lower() in remove_words] 
    
    # if the company name was "<some_name> & <word_in_remove_words>" we should remove the '&' symbol
#     if filtered_sentence[-1] == "&":
#         filtered_sentence = filtered_sentence[:-1]
        
    # create string from tokenized string
    output = ' '.join(filtered_sentence)
    output2 = ""
    
    # clean up punctuation formatting errors
    punc = ["'", '"', ',', '.', "&"]
    for i in range(len(output)):
        if output[i] not in punc:
            output2 += output[i]
        else:
            if output[i] == "'" or output[i] == '"':
                try:
                    output2 = output2[:-1]
                    output2 += output[i]
                except:
                    output2 = ""
            elif output[i] == '.' or output[i] == ',':
                try:
                    output2 = output2[:-1]
                except:
                    output2 = ""
            elif output[i] == '&' and i==len(output)-1:
                try:
                    output2 = output2[:-1]
                except:
                    output2 = output2
                    
    # clean up multiple white spaces
    output2= ' '.join(output2.split())

    return output2
    
tickers = get_dow_tickers()
for ticker in tickers:
    print("Ticker: " + ticker, end=" - ")
    company = get_symbol(ticker)
    print(company)
    print(get_cleaned_name(company))
    print(" ")

Ticker: VZ - Verizon Communications Inc.
Verizon Communications
 
Ticker: CSCO - Cisco Systems, Inc.
Cisco Systems
 
Ticker: MCD - McDonald's Corporation
McDonald's
 
Ticker: AMGN - Amgen Inc.
Amgen
 
Ticker: TRV - The Travelers Companies, Inc.
Travelers Companies
 
Ticker: JNJ - Johnson & Johnson
Johnson Johnson
 
Ticker: PG - The Procter & Gamble Company
Procter Gamble
 
Ticker: NKE - NIKE, Inc.
NIKE
 
Ticker: HD - The Home Depot, Inc.
Home Depot
 
Ticker: MRK - Merck & Co., Inc.
Merck
 
Ticker: CRM - salesforce.com, inc.
salesforccom
 
Ticker: UNH - UnitedHealth Group Incorporated
UnitedHealth
 
Ticker: KO - The Coca-Cola Company
Coca-Cola
 
Ticker: GS - The Goldman Sachs Group, Inc.
Goldman Sachs
 
Ticker: CAT - Caterpillar Inc.
Caterpillar
 
Ticker: AXP - American Express Company
American Express
 
Ticker: INTC - Intel Corporation
Intel
 
Ticker: WMT - Walmart Inc.
Walmart
 
Ticker: IBM - International Business Machines Corporation
International Business Machines
 
Ticker: MSFT - 

In [52]:
# description:
#   function to remove stop words and other general words in company name to have more concise reference
#   to company
#   e.g. "The Coca-Cola Company" could also be simply referred to as "Coca-Cola"
# args:
#   company_name - Name of company we want to get permutations of
# return:
#   company_name_perm - Different permutations of company name
# https://www.kaggle.com/notlucasp/financial-news-headlines
def read_csvs(folder_name):

    # create dictionary of guardian articles where key is date and value is the article
    guardian_dict = {}
    
    # boolean to skip the header
    not_header = False
    
    # create dictionary of months to easily get month number
    months_arr = ["ignore", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    months_dict = {}
    for i in range(len(months_arr)):
        months_dict[months_arr[i]] = i
        
    # open file
    with open(folder_name+'guardian_headlines.csv', newline='') as f:
        reader = csv.reader(f)
        # iterate over file
        for row in reader:
            if not_header:
                # split date into array using '-' as delimter
                date_arr = row[0].split('-')
                # dates formatted in two different ways
                # if formatted like so "day(num)-month(three letters)-year(last 2 digits)"
                # use the below code chunk to format date
                try:
                    month = date_arr[1]
                    month_num = months_dict[month]
                    date = "20" + str(date_arr[2]) + "-" + str(month_num)
                # else if date formatted like so "month(three letters)-year(last 2 digits)"
                # use below code chunk to format date
                except:
                    month = date_arr[0]
                    month_num = months_dict[month]
                    date = "20" + str(date_arr[1]) + "-" + str(month_num)
                    
                # put all articles onto one line
                article = row[1].replace("\n", " ")
                # remove excess whitespace
                article = re.sub(' +', ' ', article)
                
                try:
                    article_arr = guardian_dict[date]
                    article_arr.append(article)
                    guardian_dict[date] = article_arr
                except:
                    guardian_dict[date] = [article]
                
            else:
                not_header = True
               
    ################################################################################################
    
    #TODO: FIND HOW TO STANDARDIZE DATES - DATES IN CNBC CSV NOT STANDARDIZED
    
#     # create dictionary of cnbc articles where key is date and value is the article
#     cnbc_dict = {}
    
#     # boolean to skip the header
#     not_header = False
    
#     # open file
#     with open(folder_name+'cnbc_headlines.csv', newline='') as f:
#         reader = csv.reader(f)
#         # iterate over file
#         for row in reader:
#             if not_header:
#                 date = row[1].split(",")
#                 if date != ['']:
                    
#                     temp = date[1].split(" ")
# #                     print(temp)
#                     date = temp[3]+"-"+temp[2]
#                     cnbc_dict[date] = row[0]
#             else:
#                 not_header = True
                
    
#     return guardian_dict, cnbc_dict
    return guardian_dict

guardian_dict = read_csvs('786286_1351005_bundle_archive/')
get_time_intervals("2017-01", "2017-05", 1)

[('2017-1-1', '2017-2-1'),
 ('2017-2-1', '2017-3-1'),
 ('2017-3-1', '2017-4-1'),
 ('2017-4-1', '2017-5-1')]

In [67]:
def semantic_analysis_guardian(headlines_dict, start, end, interval, ticker):
    
    # get time intervals formatted correctly
    time_intervals = get_time_intervals(start, end, interval)
    for i in range(len(time_intervals)): 
        time_intervals[i] = time_intervals[i][0][:-2]

    # get company name
    company_name = get_company_name(ticker)
    company_name = get_cleaned_name(company_name)
    print(company_name)
    
    # get articles relating to company
    relevant_headlines = {}
    for val in time_intervals:
        headlines = guardian_dict[val]
        relevant = []
        for headline in headlines:
            if company_name in headline.split(" "):
                relevant.append(headline)
        relevant_headlines[val] = relevant
        print(relevant)
        print(" ")
        
    # find seniment of new headlines for each time interval
    sentiment_dict = {}
    for val in time_intervals:
        headlines = relevant_headlines[val]
        sentiment = 0
        # Sentiment Intensity Analyzer
        sia = SIA()
        results = []
        for line in headlines:
            pol_score = sia.polarity_scores(line)
            pol_score['headline'] = line
            results.append(pol_score)
        for result in results:
            # using this heuristic to calculate score:
            # sentiment = (pos*1 + neg*-1)*neu
            sentiment += (result['pos']*1 + result['neg']*-1)*result['neu']
        print("Score is: " + str(sentiment))
        sentiment_dict[val] = sentiment
        
            
    
guardian_dict = read_csvs('786286_1351005_bundle_archive/')
semantic_analysis_guardian(guardian_dict, "2018-01", "2018-05", 1, "AAPL")

Apple
['Qualcomm fined €997m by EU for paying Apple to exclusively use its chips', 'Apple says it will pay $38bn in foreign cash taxes and create 20,000 US jobs', "Apple leads race to become world's first $1tn company"]
 
["Apple to launch 'technology enabled' healthcare service", 'Tax Amazon, Facebook and Apple more for UK universal pay – study']
 
['Apple to buy ‘Netflix for magazines’ Texture']
 
['Ireland expects Apple EU tax appeal to be heard in autumn', "Apple poaches Google's AI chief in push to save Siri", 'Chips are down: Apple to stop using Intel processors in Macs, reports say']
 
Score is: 0.033096
Score is: 0.017856000000000004
Score is: 0.0
Score is: 0.06238099999999999


In [47]:
headlines = guardian_dict["2018-1"]
# Sentiment Intensity Analyzer
sia = SIA()
results = []

for line in headlines:
    pol_score = sia.polarity_scores(line)
    pol_score['headline'] = line
    results.append(pol_score)

print(results[0])

{'neg': 0.0, 'neu': 0.735, 'pos': 0.265, 'compound': 0.5574, 'headline': "Inclusion on Forbes rich list 'now seen as toxic' by Russians"}
