In [32]:
import requests
from stocks import *
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import string  
import csv
import re

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rohansuresh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# description:
#   function to get company name from ticker symbol using the yahoo finance api. 
#   https://stackoverflow.com/questions/38967533/retrieve-company-name-with-ticker-symbol-input-yahoo-or-google-api
# args:
#   symbol - ticker symbol for company
# return:
#   company_name - company name associated with ticker
def get_symbol(symbol):
    url = "http://d.yimg.com/autoc.finance.yahoo.com/autoc?query={}&region=1&lang=en".format(symbol)

    result = requests.get(url).json()

    for x in result['ResultSet']['Result']:
        if x['symbol'] == symbol:
            company_name = x['name']
            return company_name

In [3]:
# description:
#   function to remove stop words and other general words in company name to have more concise reference
#   to company
#   e.g. "The Coca-Cola Company" could also be simply referred to as "Coca-Cola"
# args:
#   company_name - Name of company we want to get permutations of
# return:
#   company_name_perm - Different permutations of company name
def get_cleaned_name(symbol):

    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(symbol) 
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words] 
    
    # words similar to organisation, inc, etc. gathered by going through DOW 30
    remove_words = ["company", "co", "co.", "inc", "inc.", "corporation", "group", "incorporated", "alliance"]
    filtered_sentence = [w for w in filtered_sentence if not w.lower() in remove_words] 
    
    # if the company name was "<some_name> & <word_in_remove_words>" we should remove the '&' symbol
#     if filtered_sentence[-1] == "&":
#         filtered_sentence = filtered_sentence[:-1]
        
    # create string from tokenized string
    output = ' '.join(filtered_sentence)
    output2 = ""
    
    # clean up punctuation formatting errors
    punc = ["'", '"', ',', '.', "&"]
    for i in range(len(output)):
        if output[i] not in punc:
            output2 += output[i]
        else:
            if output[i] == "'" or output[i] == '"':
                try:
                    output2 = output2[:-1]
                    output2 += output[i]
                except:
                    output2 = ""
            elif output[i] == '.' or output[i] == ',':
                try:
                    output2 = output2[:-1]
                except:
                    output2 = ""
            elif output[i] == '&' and i==len(output)-1:
                try:
                    output2 = output2[:-1]
                except:
                    output2 = output2
                    
    # clean up multiple white spaces
    output2= ' '.join(output2.split())

    return output2
    
tickers = get_dow_tickers()
for ticker in tickers:
    print("Ticker: " + ticker, end=" - ")
    company = get_symbol(ticker)
    print(company)
    print(get_cleaned_name(company))
    print(" ")

Ticker: VZ - Verizon Communications Inc.
Verizon Communications
 
Ticker: AXP - American Express Company
American Express
 
Ticker: CSCO - Cisco Systems, Inc.
Cisco Systems
 
Ticker: TRV - The Travelers Companies, Inc.
Travelers Companies
 
Ticker: MCD - McDonald's Corporation
McDonald's
 
Ticker: GS - The Goldman Sachs Group, Inc.
Goldman Sachs
 
Ticker: JNJ - Johnson & Johnson
Johnson Johnson
 
Ticker: MRK - Merck & Co., Inc.
Merck
 
Ticker: KO - The Coca-Cola Company
Coca-Cola
 
Ticker: DOW - Dow Inc.
Dow
 
Ticker: AMGN - Amgen Inc.
Amgen
 
Ticker: PG - The Procter & Gamble Company
Procter Gamble
 
Ticker: CAT - Caterpillar Inc.
Caterpillar
 
Ticker: HD - The Home Depot, Inc.
Home Depot
 
Ticker: INTC - Intel Corporation
Intel
 
Ticker: NKE - NIKE, Inc.
NIKE
 
Ticker: IBM - International Business Machines Corporation
International Business Machines
 
Ticker: CRM - salesforce.com, inc.
salesforccom
 
Ticker: CVX - Chevron Corporation
Chevron
 
Ticker: V - Visa Inc.
Visa
 
Ticker: HON

In [45]:
def read_csvs(folder_name):

    # create dictionary of guardian articles where key is date and value is the article
    guardian_dict = {}
    
    # boolean to skip the header
    not_header = False
    
    # create dictionary of months to easily get month number
    months_arr = ["ignore", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    months_dict = {}
    for i in range(len(months_arr)):
        months_dict[months_arr[i]] = i
        
    # open file
    with open(folder_name+'guardian_headlines.csv', newline='') as f:
        reader = csv.reader(f)
        # iterate over file
        for row in reader:
            if not_header:
                # split date into array using '-' as delimter
                date_arr = row[0].split('-')
                # dates formatted in two different ways
                # if formatted like so "day(num)-month(three letters)-year(last 2 digits)"
                # use the below code chunk to format date
                try:
                    month = date_arr[1]
                    month_num = months_dict[month]
                    date = "20" + str(date_arr[2]) + "-" + str(month_num)
                # else if date formatted like so "month(three letters)-year(last 2 digits)"
                # use below code chunk to format date
                except:
                    month = date_arr[0]
                    month_num = months_dict[month]
                    date = "20" + str(date_arr[1]) + "-" + str(month_num)
                    
                # put all articles onto one line
                article = row[1].replace("\n", " ")
                # remove excess whitespace
                article = re.sub(' +', ' ', article)
                
                try:
                    article_arr = guardian_dict[date]
                    article_arr.append(article)
                    guardian_dict[date] = article_arr
                
            else:
                not_header = True
               
    ################################################################################################
    
    #TODO: FIND HOW TO STANDARDIZE DATES - DATES IN CNBC CSV NOT STANDARDIZED
    
#     # create dictionary of cnbc articles where key is date and value is the article
#     cnbc_dict = {}
    
#     # boolean to skip the header
#     not_header = False
    
#     # open file
#     with open(folder_name+'cnbc_headlines.csv', newline='') as f:
#         reader = csv.reader(f)
#         # iterate over file
#         for row in reader:
#             if not_header:
#                 date = row[1].split(",")
#                 if date != ['']:
                    
#                     temp = date[1].split(" ")
# #                     print(temp)
#                     date = temp[3]+"-"+temp[2]
#                     cnbc_dict[date] = row[0]
#             else:
#                 not_header = True
                
    
#     return guardian_dict, cnbc_dict
    return guardian_dict

guardian_dict, cnbc_dict = read_csvs('786286_1351005_bundle_archive/')
for key in guardian_dict:
    print(key, end=" + ")
    print(guardian_dict[key])


2020-July + Cramer's earnings watch: 'If the banks get hammered, things could get ugly'
July-7 + Stay invested in US markets for the next two years: Strategist
July-6 + Cramer's lightning round: Buy Merck on its superior anti-cancer portfolio
July-2 + S&P hits record high after US–China trade truce—5 experts weigh in on what's next for markets
July-1 + Jim Cramer on how skeptical investors keep carrying the bull market higher
2020-June + Polaris has a 'once-in-a-generation opportunity,' Jim Cramer says
June-9 + Markets could go even higher in this liquidity-driven rally: Professor
June-8 + Cramer's game plan: Worry doesn't pay off in this market
June-4 + Cramer: It's not just FANG—here are the other tech stocks that pushed the Nasdaq to new highs
June-3 + Cramer: Trump's going after business, and that's bad for stocks
June-2 + Cramer's lightning round: 'If I really want solar, I want Tesla'
June-1 + Cramer's game plan: Take on more risk as the jobs boom boosts stocks
2020-May + Cramer'