## Fetches stock data from Yahoo Finance and builds dataset from the augmented tweet dataset

In [1]:
import json
import yfinance as yf
import pandas as pd

In [14]:
def getStockPriceDataframe(tickerSymbol, startDate, endDate):
  '''
  Gets the stock data from Yahoo Finance for given tickerSymbol, startDate, endDate

  tickerSymbol, startDate and endDate are passed straight to
  yf.Ticker(tickerSymbol).history(start=..., end=...).

  Returns the price DataFrame re-indexed 0..n-1, with the original index
  values kept in a 'Timestamp' column and four derived columns:
    Delta         - Close minus Open of the same row
    Volatility    - (previous Close - Close) / previous Close; 0.0 for row 0
    AvgVolatility - mean Volatility of the 5 preceding rows; 0.0 for rows 0..5
    Momentum      - 1.0 if Close is above the previous Close, else 0.0
  '''
  # fetch data
  data = yf.Ticker(tickerSymbol).history(start=startDate, end=endDate)

  # drop unnecessary cols; errors='ignore' so a response that does not carry
  # them (presumably e.g. an empty result for an unknown ticker - confirm)
  # does not raise KeyError
  data = data.drop(columns=['Dividends', 'Stock Splits'], errors='ignore')

  # reindex
  data['Timestamp'] = data.index.array
  data.index = range(0, len(data))

  # calc derived attributes (column-wise instead of cell-by-cell writes)
  close = data['Close']
  prevClose = close.shift(1)

  data['Delta'] = close - data['Open']

  volatility = (prevClose - close) / prevClose
  # row 0 has no previous close; it keeps the 0.0 placeholder
  volatility.iloc[:1] = 0.0
  data['Volatility'] = volatility

  # The window covers rows idx-5 .. idx-1 and only starts at idx 6, so the
  # placeholder Volatility of row 0 is never part of an average.
  data['AvgVolatility'] = [
    volatility.iloc[idx - 5:idx].mean() if idx > 5 else 0.0
    for idx in range(len(data))
  ]

  # comparing against the missing previous close of row 0 is False -> 0.0
  data['Momentum'] = (close > prevClose).astype(float)

  return data

In [11]:
'''
Get the ticker symbols we're interested in and save each one's
price history as stock_prices_<ticker>.csv
'''

# The context manager closes the file even if reading fails.
with open('../data/stock-tickers.txt', 'r') as stock_tickers:
    tickers = stock_tickers.readlines()

for ticker in tickers:
    # Strip the line ending / surrounding whitespace and the '$' prefix
    ticker = ticker.strip().replace('$', '')
    if not ticker:
        # blank line in the ticker file - nothing to fetch
        continue
    df_prices = getStockPriceDataframe(tickerSymbol=ticker, startDate='2020-04-09', endDate='2020-07-16')
    ticker = ticker.lower()
    # NOTE(review): the files land in the working directory, while the
    # augmentation cell reads them from
    # ../data/regression-task/stock-price-data/ - confirm they get moved there.
    # `lineterminator` is the current keyword name; pandas 2.0 removed the
    # old `line_terminator` spelling.
    df_prices.to_csv('stock_prices_' + ticker + '.csv', sep=';', lineterminator='\n')



MSFT
AAPL
AMZN
META
BRK-B
GOOG
JNJ
JPM
V
PG
MA
INTC
UNH
BAC
T
HD
XOM
DIS
VZ
KO
MRK
CMCSA
CVX
PEP
PFE


In [27]:
'''
Calculate sentiment scores as p / (p + n)

Builds data_by_ticker = {ticker: {key: score}} from the per-ticker
<ticker>_sentiments.json files and dumps it to new_sentiment_features.json.
'''

# The context manager closes the file even if reading fails.
with open('../data/stock-tickers.txt', 'r') as stock_tickers:
    tickers = stock_tickers.readlines()

data_by_ticker = {}

for ticker in tickers:
    ticker = ticker.strip().replace('$', '').lower()
    if not ticker:
        # blank line in the ticker file
        continue
    with open("../data/regression-task/sentiments-by-date/" + ticker + "_sentiments.json") as sentf:
        sent_dict = json.load(sentf)

    new_feature_dict = {}
    for key, item in sent_dict.items():
        total = item['p'] + item['n']
        if total == 0:
            # No positive or negative counts: the ratio is undefined, so the
            # key is left out instead of raising ZeroDivisionError. The
            # augmentation step fills keys without a score with the ticker's
            # average.
            continue
        new_feature_dict[key] = item['p'] / total
    data_by_ticker[ticker] = new_feature_dict

with open("../data/regression-task/new_sentiment_features.json", "w") as masterf:
    json.dump(data_by_ticker, masterf)

In [36]:
'''
Augments the dataset by grouping tweets by ticker symbol and date

For every ticker the price CSV gets a 'positive_sentiment' column: the score
from data_by_ticker where the row's Timestamp has one, otherwise the average
of the scores that did match for that ticker.
'''

# The context manager closes the file even if reading fails.
with open('../data/stock-tickers.txt', 'r') as stock_tickers:
    tickers = stock_tickers.readlines()

for ticker in tickers:
    ticker = ticker.strip().replace('$', '').lower()
    if not ticker:
        # blank line in the ticker file
        continue
    file_name = 'stock_prices_' + ticker + '.csv'
    df_prices = pd.read_csv("../data/regression-task/stock-price-data/" + file_name, sep=';', lineterminator='\n')

    sentiment_by_date = data_by_ticker[ticker]
    # None marks rows whose Timestamp has no sentiment score
    new_features = [sentiment_by_date.get(date) for date in df_prices['Timestamp']]

    # Running total under its own name: the builtin `sum` must not be rebound
    # at notebook scope, or later cells calling sum(...) break.
    num_matches = 0
    score_total = 0
    for score in new_features:
        if score is not None:
            num_matches += 1
            score_total = score_total + score

    # With no matching date there is no average to fall back on; use NaN
    # rather than dividing by zero.
    avg_of_feature = score_total / num_matches if num_matches else float('nan')
    new_features = [avg_of_feature if score is None else score for score in new_features]

    df_prices['positive_sentiment'] = new_features
    # `lineterminator` is the current keyword name; pandas 2.0 removed the
    # old `line_terminator` spelling.
    df_prices.to_csv('../data/regression-task/stock-price-data-augmented/stock_prices_' + ticker + '_augmented.csv', sep=';', lineterminator='\n')

In [41]:
'''
Finally write the augmented dataset

Concatenates every per-ticker augmented CSV into one master CSV.
'''

import glob
import os

# glob returns matches in filesystem-dependent order; sorting keeps the row
# order of the master dataset reproducible between runs and machines.
all_files = sorted(glob.glob(os.path.join("../data/regression-task/stock-price-data-augmented/", "*.csv")))
df = pd.concat((pd.read_csv(f, sep=';', lineterminator='\n') for f in all_files), ignore_index=True)
# `lineterminator` is the current keyword name; pandas 2.0 removed the old
# `line_terminator` spelling.
df.to_csv('../data/regression-task/master-regression-dataset.csv', sep=';', lineterminator='\n')