In [1]:
import pandas as pd
import numpy as np

import yfinance as yf
import finnhub

import time
from datetime import datetime as dt, timedelta
from dateutil.relativedelta import relativedelta

import json
import requests

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [3]:
from pycaret.regression import *

# Read in the Tickers

In [4]:
stocks = pd.read_excel('Data/universe.xlsx')['symbol']

# Fetching the Data

### Stock Prices

In [5]:
# Convert pd Series of strings into one long string (that the format yf wants)
string_format_stocks = stocks.str.cat(sep=' ')

# Fetch the price data
prices = yf.download(string_format_stocks, start='2021-01-01', end=dt.today().strftime('%Y-%m-%d'))

# Extract only the adj close data
adj_close = prices['Adj Close']
volume = prices['Volume']

[*********************100%***********************]  110 of 110 completed


In [96]:
### TEMPORARYLY USE ONLY GOOG DATA ###

GOOG_price = adj_close['GOOG'].to_frame()
GOOG_price['volume'] = volume['GOOG']
GOOG_price.reset_index(inplace=True)
GOOG_price['Date'] = GOOG_price['Date'].astype(str)

GOOG_price.rename(columns={'GOOG': 'adj_close'}, inplace=True)
GOOG_price.head()

Unnamed: 0,Date,adj_close,volume
0,2021-01-01,,
1,2021-01-02,,
2,2021-01-03,,
3,2021-01-04,86.412003,38038000.0
4,2021-01-05,87.045998,22906000.0


### Industry Breakdown

In [15]:
# Initialize finhub client
finnhub_client = finnhub.Client(api_key="ccn3d6iad3i1nkrepma0ccn3d6iad3i1nkrepmag")

In [16]:
# Fetch the company data from finhub
industries = []

for count, ticker in enumerate(stocks):

    try:
        tickerdata = finnhub_client.company_profile2(symbol=ticker)
        industries.append(tickerdata['finnhubIndustry'])
    except Exception:
        industries.append(np.nan)

In [17]:
# Count the number of occurences of each industry
industries = np.array(industries)
unique, counts = np.unique(industries, return_counts=True)

# Calculate the precentage of industries
percentages = np.round(counts/np.sum(counts) * 100, 2)
print('Breakdown by Indsutry:')
print()
industries_dict = dict(zip(unique, percentages))
display(pd.DataFrame.from_dict(industries_dict, orient='index', columns=['Percentage of Total']))

Breakdown by Indsutry:



Unnamed: 0,Percentage of Total
Banking,0.91
Beverages,0.91
Biotechnology,0.91
Building,1.82
Chemicals,0.91
Commercial Services & Supplies,1.82
Consumer products,0.91
Distributors,0.91
Electrical Equipment,0.91
Energy,1.82


### Read in Previous Week Data

In [18]:
prev_general_market_news = pd.read_csv('Data/general_market_news.csv', index_col=0)
prev_financial_news = pd.read_csv('Data/financial_news.csv', index_col=0)
prev_ticker_news = pd.read_csv('Data/ticker_news.csv', index_col=0)

In [19]:
# Get the latest date from each file to start fetching new news articles from that date
general_market_news_latest_date = pd.to_datetime(prev_general_market_news['Date']).max().date()
financial_news_latest_date = pd.to_datetime(prev_financial_news['Date']).max().date()
ticker_news_latest_date = pd.to_datetime(prev_ticker_news['Date']).max().date()

# Convert datetime.date to datetime.datetime
general_market_news_latest_date = dt.combine(general_market_news_latest_date, dt.min.time())
financial_news_latest_date = dt.combine(financial_news_latest_date, dt.min.time())
ticker_news_latest_date = dt.combine(ticker_news_latest_date, dt.min.time())

# Fetch the data until this day
END_DATE_general = general_market_news_latest_date
END_DATE_financial = financial_news_latest_date
END_DATE_ticker = ticker_news_latest_date

### Market News

In [20]:
market_news = finnhub_client.general_news('general', min_id=0)

In [21]:
# Parse the news articles
parsed_market_news = []

for article in market_news:
    headline = article['headline']
    summary = article['summary']
    date = dt.fromtimestamp(article['datetime'])
    
    parsed_market_news.append([date, headline, summary])

In [74]:
# Convert to DataFrame
columns = ['Date', 'Headline', 'Summary']
general_market_news = pd.DataFrame(parsed_market_news, columns=columns)

## Alpha Vantage

In [23]:
ALPHA_VANTAGE_KEY = '4GIL4K9ZN1NWE26G'

In [24]:
# # Federal Funds Interest Rate

# url = 'https://www.alphavantage.co/query?function=FEDERAL_FUNDS_RATE&interval=daily&apikey=' + ALPHA_VANTAGE_KEY
# r = requests.get(url)
# data = r.json()

# # Maybe will use later

In [25]:
# Create a list of dates with a weekly frequency
dates_timestamps = pd.date_range(start=END_DATE_financial, end=dt.today(), freq='W').tolist()
dates = list(map(dt.date, dates_timestamps))

In [26]:
# Helper function that extracts needed info from json object more efficiently
def extract_info_vantage(article):
    time_published = dt.strptime(article['time_published'][:8], '%Y%m%d')
    sentiment_score = article['overall_sentiment_score']
    sentiment_label = article['overall_sentiment_label']
    return [time_published, sentiment_score, sentiment_label]

# This is the response from the API when no articles are found
empty_response = {'Information': 'No articles found. Please adjust the time range or refer to the API documentation https://www.alphavantage.co/documentation#newsapi and try again.'}

In [27]:
# Fetch the following news

financial_market_news = []
economy_fiscal_news = []
economy_monetary_news = []
economy_macro_news = []

time_from = dates[0].strftime('%Y%m%d') + 'T0000' # start with the firsst date in the list

for date in dates[1:]:
    
    time_to = date.strftime('%Y%m%d') + 'T0000' # update the time_to

    # financial market news

    topic = 'financial_markets'

    url = 'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&topics=' + topic \
            + '&time_from=' + time_from + '&time_to=' + time_to + '&limit=200&apikey=' + ALPHA_VANTAGE_KEY 
    result = requests.get(url)
    data = result.json()

    if data != empty_response:
        financial_market_news.extend(list(map(extract_info_vantage, data['feed']))) # add data to the list

    # Economy - Fiscal Policy news economy_fiscal

    topic = 'economy_fiscal'

    url = 'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&topics=' + topic \
            + '&time_from=' + time_from + '&time_to=' + time_to + '&limit=200&apikey=' + ALPHA_VANTAGE_KEY 
    result = requests.get(url)
    data = result.json()

    if data != empty_response:
        economy_fiscal_news.extend(list(map(extract_info_vantage, data['feed']))) # add data to the list

    # Economy - Monetary Policy

    topic = 'economy_monetary'

    url = 'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&topics=' + topic \
            + '&time_from=' + time_from + '&time_to=' + time_to + '&limit=200&apikey=' + ALPHA_VANTAGE_KEY 
    result = requests.get(url)
    data = result.json()

    if data != empty_response:
        economy_monetary_news.extend(list(map(extract_info_vantage, data['feed']))) # add data to the list

    # Economy - Macro/Overall

    topic = 'economy_macro'

    url = 'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&topics=' + topic \
            + '&time_from=' + time_from + '&time_to=' + time_to + '&limit=200&apikey=' + ALPHA_VANTAGE_KEY 
    result = requests.get(url)
    data = result.json()

    if data != empty_response:
        economy_macro_news.extend(list(map(extract_info_vantage, data['feed']))) # add data to the list
    
    # now the time_from is the previous time to, so looking at the following week articles
    time_from = time_to 
    
    # unpaid subscription allows 5 calls per minute
    time.sleep(60)

In [75]:
# Convert lists of data to DataFrame objects
columns = ['Date', 'Sentiment Score', 'Sentiment Label']

financial_market_news_df = pd.DataFrame(financial_market_news, columns=columns)
financial_market_news_df['Topic'] = 'financial_market_news'                       # specify the topic of the df

economy_fiscal_news_df = pd.DataFrame(economy_fiscal_news, columns=columns)
economy_fiscal_news_df['Topic'] = 'economy_fiscal_news'                           # specify the topic of the df

economy_monetary_news_df = pd.DataFrame(economy_monetary_news, columns=columns)
economy_monetary_news_df['Topic'] = 'economy_monetary_news'                       # specify the topic of the df

economy_macro_news_df = pd.DataFrame(economy_macro_news, columns=columns)
economy_macro_news_df['Topic'] = 'economy_macro_news'                             # specify the topic of the df

# Concatenate these dfs into one long df
financial_news = pd.concat([financial_market_news_df, economy_fiscal_news_df, economy_monetary_news_df, economy_macro_news_df])

# Reset index (otherwise each index is repeated 4 times)
financial_news.reset_index(drop=True, inplace=True)

## Polygon.io

In [29]:
POLYGON_KEY = 'Chz4dhUuzmumD5YcQeSpI7M_JgItlGJc'

In [30]:
# Helper function that gets the dates of fetched news articles
def get_dates(dic):
    return dic['published_utc'][:10]

# Helper function that extracts needed info from json object more efficiently
def extract_info_polygon(article):
    time_published = article['published_utc'][:10]
    title = article['title']
    try:
        description = article['description']
    except:
        description = np.nan        
    return [time_published, title, description]

In [32]:
# Fetch news per ticker
request_counter = 0
news = {}

for num, ticker in enumerate(stocks):
    
    print(ticker, num)
    
    TICKER = ticker
    DATE = dt.today().strftime('%Y-%m-%d')
    
    news[TICKER] = []
    
    while dt.strptime(DATE, '%Y-%m-%d') > END_DATE_ticker:
        
        print('running for date:', DATE)
                        
        # Unpaid subscription allows 5 calls per minute
        if request_counter == 5:
            time.sleep(60)
            request_counter = 0
        
        # Fetch the news articles
        result = requests.get('https://api.polygon.io/v2/reference/news?order=desc&ticker=' + TICKER + 
                              '&published_utc.lte=' + DATE + '&limit=1000&apiKey=' + POLYGON_KEY)
        all_articles = json.loads(result.text)['results']
        
        request_counter += 1
        
        # Append needed info from the json object to a list
        news[ticker].extend(list(map(extract_info_polygon, all_articles)))
            
        # Get the latest date in the dictionary of articles
        dates = np.array(list(map(get_dates, all_articles)))
        try:
            new_date = np.unique(dates)[0]
        except IndexError:
            new_date = END_DATE_ticker.strftime('%Y-%m-%d')# if there is no more historical data for this stock, just set the 
                                                    # new_date to END_DATE so that the loop goes on to the next stock
        
        # If new_date == DATE, manually adjust the new_date to be the previous date        
        if new_date == DATE:
            new_date = (dt.strptime(new_date, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
                
        DATE = new_date


ABBV 0
running for date: 2022-10-16
ACN 1
running for date: 2022-10-16
AEP 2
running for date: 2022-10-16
AIZ 3
running for date: 2022-10-16
ALLE 4
running for date: 2022-10-16
AMAT 5
running for date: 2022-10-16
AMP 6
running for date: 2022-10-16
AMZN 7
running for date: 2022-10-16
AVB 8
running for date: 2022-10-16
AVY 9
running for date: 2022-10-16
AXP 10
running for date: 2022-10-16
BDX 11
running for date: 2022-10-16
BF-B 12
running for date: 2022-10-16
BMY 13
running for date: 2022-10-16
BR 14
running for date: 2022-10-16
CARR 15
running for date: 2022-10-16
CDW 16
running for date: 2022-10-16
CE 17
running for date: 2022-10-16
CHTR 18
running for date: 2022-10-16
CNC 19
running for date: 2022-10-16
CNP 20
running for date: 2022-10-16
COP 21
running for date: 2022-10-16
CTAS 22
running for date: 2022-10-16
CZR 23
running for date: 2022-10-16
DG 24
running for date: 2022-10-16
DPZ 25
running for date: 2022-10-16
DRE 26
running for date: 2022-10-16
DXC 27
running for date: 2022-10-

In [76]:
# Parse the articles into df format
parsed_news = []

for ticker in news.keys():
    for article in news[ticker]:
        date, title, summary = article
        
        # some summaries are missing, then a float is returned, we want to skip those
        if type(summary) != float:
            parsed_news.append([ticker, date, title, summary])
        
columns = ['Ticker', 'Date', 'Title', 'Summary']        
news_df = pd.DataFrame(parsed_news, columns=columns)
news_df['Date'] = pd.to_datetime(news_df['Date'])

In [77]:
# Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()

# Ticker news
scores = news_df['Summary'].apply(analyzer.polarity_scores).tolist()
df_scores = pd.DataFrame(scores)
news_df = news_df.join(df_scores)

# General market news
scores = general_market_news['Summary'].apply(analyzer.polarity_scores).tolist()
df_scores = pd.DataFrame(scores)
general_market_news = general_market_news.join(df_scores)

In [78]:
# Convert Date columns to strings
general_market_news['Date'] = general_market_news['Date'].dt.date.astype(str)
financial_news['Date'] = financial_news['Date'].dt.date.astype(str)
news_df['Date'] = news_df['Date'].dt.date.astype(str)

# Concat the new articles with the previous ones
ticker_news = pd.concat([prev_ticker_news, news_df])
general_market_news = pd.concat([prev_general_market_news, general_market_news])
financial_news = pd.concat([prev_financial_news, financial_news])

# Drop duplicates (some stocks might not have many articles and so fetching new articles might also bring some already existing ones)
general_market_news.drop_duplicates(subset=['Date', 'Headline', 'Summary'], inplace=True)
financial_news.drop_duplicates(subset=['Date', 'Sentiment Score', 'Sentiment Label', 'Topic'], inplace=True)
ticker_news.drop_duplicates(subset=['Ticker', 'Date', 'Title', 'Summary'], inplace=True)

# Sort articles by Ticker, Date
general_market_news.sort_values(by=['Date'], ascending=False, inplace=True)
financial_news.sort_values(by=['Topic', 'Date'], ascending=False, inplace=True)
ticker_news.sort_values(by=['Ticker', 'Date'], ascending=False, inplace=True)

# Reset index
general_market_news.reset_index(drop=True, inplace=True)
financial_news.reset_index(drop=True, inplace=True)
ticker_news.reset_index(drop=True, inplace=True)

### DataFrames:

<li> general_market_news </li>
<li> financial_news </li>
<li> ticker_news </li>

In [79]:
display(general_market_news.head())
display(general_market_news.shape)
display(financial_news.head())
display(financial_news.shape)
display(ticker_news.head())
display(ticker_news.shape)

Unnamed: 0,Date,Headline,Summary,neg,neu,pos,compound
0,2022-10-16,Peter Thiel Backs German Startup Delivering Dr...,German drone developer Quantum-Systems GmbH ha...,0.0,1.0,0.0,0.0
1,2022-10-16,Goldman Downgrades UK Growth Outlook After Gov...,Analysts at Goldman Sachs Group Inc. have down...,0.0,0.883,0.117,0.5994
2,2022-10-16,"China Traders See Tech Focus, More Covid Gloom...",President Xi Jinping’s two-hour address to the...,0.0,0.922,0.078,0.4019
3,2022-10-16,Your Sunday UK Briefing: Trussonomics Is Over ...,Hello again.Here are a few snippets to ease yo...,0.0,0.704,0.296,0.7906
4,2022-10-16,Here’s What China Experts Are Saying About Xi’...,President Xi Jinping just delivered China’s mo...,0.0,0.909,0.091,0.4019


(199, 7)

Unnamed: 0,Date,Sentiment Score,Sentiment Label,Topic
0,2022-10-16,0.280795,Somewhat-Bullish,financial_market_news
1,2022-10-16,0.012175,Neutral,financial_market_news
2,2022-10-16,0.267708,Somewhat-Bullish,financial_market_news
3,2022-10-16,0.245427,Somewhat-Bullish,financial_market_news
4,2022-10-16,0.023954,Neutral,financial_market_news


(25341, 4)

Unnamed: 0,Ticker,Date,Title,Summary,neg,neu,pos,compound
0,XOM,2022-10-15,Answering Investors' Questions About Diversifi...,Investors have lots of questions. The Motley F...,0.244,0.756,0.0,-0.4404
1,XOM,2022-10-14,"Why Cash Rich Companies, ETFs are Beating the ...",We discuss Cash Cows & Trendpilot ETFs.,0.0,1.0,0.0,0.0
2,XOM,2022-10-14,EOG Resources (EOG) Jumps 42.1% in a Year: Mor...,Higher oil price is aiding EOG Resources (EOG).,0.0,1.0,0.0,0.0
3,XOM,2022-10-14,Schlumberger (SLB) Looking to Divest US Valves...,Schlumberger (SLB) is working to explore a pot...,0.0,0.898,0.102,0.1779
4,XOM,2022-10-14,Equinor (EQNR) Achieves First Oil From Peregri...,Peregrino Phase 2 enables Equinor (EQNR) to br...,0.0,0.853,0.147,0.4767


(40810, 8)

### Write the DataFrames to csv

In [80]:
general_market_news.to_csv('Data/general_market_news.csv')
financial_news.to_csv('Data/financial_news.csv')
ticker_news.to_csv('Data/ticker_news.csv')

### Add Technical Indicators

#### MACD

In [97]:
# create a temporary df for the calculations
temp_df = GOOG_price.copy()

# use the adjusted close price to calculate short-term (12-day) exponential moving average
temp_df['st_ema'] = temp_df['adj_close'].ewm(span = 12.0).mean() 

# use the adjusted close price to calculate long-term (26-day) exponential moving average
temp_df['lt_ema'] = temp_df['adj_close'].ewm(span = 26.0).mean()  

# calculate the MACD with the formula: MACD = st_ema - lt_ema 
temp_df['MACD'] = temp_df['st_ema'] - temp_df['lt_ema']

# calculate MACD's 9-day exponential moving average as its signal line
temp_df['MACD_signal'] = temp_df['MACD'].ewm(span = 9.0).mean()

# add MACD_signal column to the GOOG_price 
GOOG_price['MACD_signal'] = temp_df['MACD_signal']

#### OBV

In [98]:
# create a temporary df for the calculations
temp_df = GOOG_price.copy()

# get the price direction
temp_df['change'] = temp_df['adj_close'].pct_change()
temp_df['price_direction'] = temp_df['change'] / temp_df['change'].abs()

# calculate OBV exponential moving average with center of mass = 5
temp_df['OBV'] = (temp_df['volume'] * temp_df['price_direction']).fillna(0).cumsum().astype(int)
temp_df['OBV ema'] = temp_df['OBV'].ewm(com=5).mean()

# add OBV ema column to the GOOG_price
GOOG_price['OBV ema'] = temp_df['OBV ema']

#### RSI

In [99]:
# create a temporary df for the calculations
temp_df = GOOG_price.copy()

# take difference of adj_close
temp_df['diff'] = temp_df['adj_close'].diff()

# add a new column named gain as the positive part of daily price change
temp_df['gain'] = np.where(temp_df['diff'] >= 0, temp_df['diff'], 0.0)
temp_df['gain'].iloc[0] = np.nan #because the first values gets assigned 0

# add a new column named loss as the negative part of daily price change
temp_df['loss'] = np.where(temp_df['diff'] < 0, temp_df['diff'].abs(), 0.0)
temp_df['loss'].iloc[0] = np.nan #because the first values gets assigned 0

# add a new column named avg_gain as the average gain described before
avg_gain = temp_df['gain'].copy() #just get the gain values
avg_gain.iloc[14] = temp_df.gain.iloc[0:15].mean() #find the mean of the first 14 days
avg_gain.iloc[0:14] = np.repeat(np.nan, 14) #fill values before day 14 with nans
temp_df['avg_gain'] = avg_gain.ewm(alpha = 1/14, adjust = False).mean() #use ewm with 13/14 weight to the previous mean value and 1/14 weight to the new gain value (exactly like in the formula provided to calculate the avg gain)

# add a new column named avg_loss as the avegrge loss described before
avg_loss = temp_df['loss'].copy() #same approach here
avg_loss.iloc[14] = temp_df['loss'].iloc[0:15].mean()
avg_loss.iloc[0:14] = np.repeat(np.nan, 14) 
temp_df['avg_loss'] = avg_loss.ewm(alpha = 1/14, adjust = False).mean()

# add a new column named RSI as the relative strength index described before
temp_df['RSI'] = 100 - 100/(1+(temp_df['avg_gain'] / temp_df['avg_loss'])) #using the given formula

# add RSI column to the GOOG_price
GOOG_price['RSI'] = temp_df['RSI']

# Data Preprocessing

In [100]:
# Group Sentiment Score per day
general_market_news_grouped = general_market_news.groupby(by=['Date'])['compound'].mean().to_frame()
financial_news_score = financial_news.groupby(by=['Topic', 'Date'])['Sentiment Score'].mean().to_frame()
financial_news_label = financial_news.groupby(by=['Topic', 'Date'])['Sentiment Label'].agg(pd.Series.mode).to_frame()
financial_news_grouped = financial_news_score.merge(financial_news_label, on=['Topic', 'Date'], how='left')
ticker_news_grouped = ticker_news.groupby(by=['Ticker', 'Date'])['compound'].mean().to_frame()

# Reset indeces
general_market_news_grouped.reset_index(inplace=True)
financial_news_grouped.reset_index(inplace=True)
ticker_news_grouped.reset_index(inplace=True)

In [101]:
# Create a list of dates with a daily frequency from 2021-01-01 to today
date_range = pd.date_range(dt(2021,1,1), dt.today(), freq='D').to_list()
date_range = list(map(dt.date, date_range))
data_index = list(map(str, date_range))

# Create an empty DataFrame which will contain all our feature data
data = pd.DataFrame({'Date': data_index})

#### Merge all dataframes together

In [102]:
# General Market News
general_market_news_grouped.columns = ['Date', 'general_market_news_compound']
data = data.merge(general_market_news_grouped, on=['Date'], how='left')

In [103]:
# Financial News
unique_topics = financial_news_grouped['Topic'].unique()

for topic in unique_topics:
    
    # Get a df for one topic at a time
    topic_news = financial_news_grouped.loc[financial_news_grouped['Topic'] == topic]
    
    # Rename the columns
    topic_news = topic_news.rename(columns={'Sentiment Score': f'{topic}_sentiment_score', 'Sentiment Label': f'{topic}_sentiment_label'})
    topic_news.drop(columns=['Topic'], inplace=True)

    # Join with the data DataFrame
    data = data.merge(topic_news, on=['Date'], how='left')

In [104]:
# Ticker News (ONLY FOR GOOGLE) TEMPORARY
GOOG_data = ticker_news_grouped.loc[ticker_news_grouped['Ticker'] == 'GOOG']
GOOG_data = GOOG_data.drop(columns=['Ticker'])
GOOG_data.columns = ['Date', 'ticker_news_compound']
data = data.merge(GOOG_data, on=['Date'], how='left')

In [None]:
### TEMPORARY ###
# data = pd.read_csv('data.csv', index_col=0)
# data.tail()

In [106]:
# Forward fill missing values
data = data.ffill(axis=0)

# Add quarters
data['quarter'] = pd.PeriodIndex(data['Date'], freq='Q')

# Add price data
data = data.merge(GOOG_price, on=['Date'], how='left')

# Keep only trading days
data = data.loc[data['adj_close'].notna()]

# Convert price to return
data['adj_close'] = data['adj_close'].pct_change()

# Drop general_market_news_compound column b/c it's almost all NaN
data.drop(columns=['general_market_news_compound'], inplace=True)

# Drop NaNs (drops many columns)
data.dropna(inplace=True)

In [132]:
# Some dates have arrays of sentiment, extract only the first sentiment value from them
data['economy_fiscal_news_sentiment_label'] = data['economy_fiscal_news_sentiment_label'].apply(lambda row_val: row_val[0] if type(row_val) != str else row_val)
data['economy_macro_news_sentiment_label'] = data['economy_macro_news_sentiment_label'].apply(lambda row_val: row_val[0] if type(row_val) != str else row_val)
data['economy_monetary_news_sentiment_label'] = data['economy_monetary_news_sentiment_label'].apply(lambda row_val: row_val[0] if type(row_val) != str else row_val)
data['financial_market_news_sentiment_label'] = data['financial_market_news_sentiment_label'].apply(lambda row_val: row_val[0] if type(row_val) != str else row_val)

In [133]:
# Encode categorical variables
encoder = LabelEncoder()
data['economy_fiscal_news_sentiment_label'] = encoder.fit_transform(data['economy_fiscal_news_sentiment_label'])
data['economy_macro_news_sentiment_label'] = encoder.fit_transform(data['economy_macro_news_sentiment_label'])
data['economy_monetary_news_sentiment_label'] = encoder.fit_transform(data['economy_monetary_news_sentiment_label'])
data['financial_market_news_sentiment_label'] = encoder.fit_transform(data['financial_market_news_sentiment_label'])

In [134]:
data.tail()

Unnamed: 0,economy_fiscal_news_sentiment_score,economy_fiscal_news_sentiment_label,economy_macro_news_sentiment_score,economy_macro_news_sentiment_label,economy_monetary_news_sentiment_score,economy_monetary_news_sentiment_label,financial_market_news_sentiment_score,financial_market_news_sentiment_label,ticker_news_compound,adj_close,volume,MACD_signal,OBV ema,RSI
647,0.050678,1,-0.025563,1,0.04492,0,0.142222,2,0.351444,-0.008637,16529900.0,-1.817585,337735000.0,34.668922
648,0.050678,1,0.046312,1,0.04492,0,0.142222,2,0.216308,-0.006686,21617700.0,-1.767101,330509600.0,32.734767
649,0.055676,1,0.023954,1,0.04492,0,0.142222,2,0.338056,0.00255,17343400.0,-1.723846,327379100.0,34.231526
650,0.074989,1,0.046494,1,0.04492,0,0.142222,2,0.160657,0.014344,32812200.0,-1.652987,330239000.0,42.061992
651,0.070695,1,0.037788,1,0.04492,0,0.142222,2,0.343429,-0.025374,22612900.0,-1.611316,328853400.0,34.194841


In [112]:
# drop date and quarted columns
data = data.iloc[:, 1:]
data.drop(columns='quarter', inplace=True)

### PyCaret

In [135]:
reg = setup(data=data, target='adj_close')

Unnamed: 0,Description,Value
0,session_id,4059
1,Target,adj_close
2,Original Data,"(156, 14)"
3,Missing Values,0
4,Numeric Features,9
5,Categorical Features,4
6,Ordinal Features,0
7,High Cardinality Features,0
8,High Cardinality Method,
9,Transformed Train Set,"(109, 21)"


In [136]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
omp,Orthogonal Matching Pursuit,0.0168,0.0004,0.02,0.2301,0.0168,3.0352,0.005
br,Bayesian Ridge,0.0173,0.0005,0.0208,0.1724,0.0176,2.9045,0.005
ridge,Ridge Regression,0.0176,0.0005,0.021,0.145,0.0173,3.4549,0.006
ada,AdaBoost Regressor,0.0177,0.0005,0.0215,0.1233,0.0163,2.419,0.022
rf,Random Forest Regressor,0.0184,0.0006,0.0226,0.0267,0.0168,1.9943,0.058
lightgbm,Light Gradient Boosting Machine,0.0186,0.0006,0.023,-0.0013,0.0169,2.7225,0.119
dummy,Dummy Regressor,0.0189,0.0006,0.0236,-0.0686,0.0211,1.9311,0.005
llar,Lasso Least Angle Regression,0.0189,0.0006,0.0236,-0.0686,0.0211,1.9311,0.005
lr,Linear Regression,0.0191,0.0006,0.0237,-0.0711,0.0202,1.6779,0.645
lasso,Lasso Regression,0.0191,0.0006,0.0237,-0.0711,0.0202,1.6779,0.007


OrthogonalMatchingPursuit(fit_intercept=True, n_nonzero_coefs=None,
                          normalize=True, precompute='auto', tol=None)

  model = cd_fast.enet_coordinate_descent(
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  model = cd_fast.enet_coordinate_descent(
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,






In [14]:
# Divide dataset into X and y
features = ['economy_fiscal_news_sentiment_score',
       'economy_fiscal_news_sentiment_label',
       'economy_macro_news_sentiment_score',
       'economy_macro_news_sentiment_label',
       'economy_monetary_news_sentiment_score',
       'economy_monetary_news_sentiment_label',
       'financial_market_news_sentiment_score',
       'financial_market_news_sentiment_label', 'ticker_news_compound']

y = data['GOOG']
X = data[features]

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [15]:
# Divide X and y into train and test sets
y_train = y[:120]
X_train = X[:120]

y_test = y[120:]
X_test = X[120:]

# Finding the Best Model

In [None]:

params = {'n_estimators':[10,50,100,250], 'max_depth':[5,10,20], 'class_weight':[None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]}

clf = RandomForestClassifier(random_state=42)
gs = GridSearchCV(clf, params, scoring='roc_auc', n_jobs=-1)
gs.fit(X_train, y_train)

print("Best set of Parameters",gs.best_params_)
print("Best Score",gs.best_score_)

In [None]:
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'LogisticRegression': LogisticRegression(),
    'SVR': SVR(),
    'RandomForestRegressor': RandomForestRegressor(),
    'ExtraTreesRegressor': ExtraTreesRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'MLPRegressor': MLPRegressor()
}

In [92]:
np.linspace(0.1, 10, 20)

array([ 0.1       ,  0.62105263,  1.14210526,  1.66315789,  2.18421053,
        2.70526316,  3.22631579,  3.74736842,  4.26842105,  4.78947368,
        5.31052632,  5.83157895,  6.35263158,  6.87368421,  7.39473684,
        7.91578947,  8.43684211,  8.95789474,  9.47894737, 10.        ])

In [93]:
params = {'alpha': np.linspace(0.1, 10, 20)}

In [16]:
model = Ridge(alpha=0)
reg = model.fit(X_train, y_train)
display(reg.score(X_train, y_train))

y_pred = reg.predict(X_test)

display(mean_squared_error(y_test, y_pred))

0.04334282307425952

0.0004586633263684654

In [17]:
params = {'alpha': np.linspace(0.01, 10, 200)}
model = Ridge()

grid_search = GridSearchCV(model, param_grid=params, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [18]:
grid_search.best_score_

-0.18257697473978424

In [19]:
grid_search.best_params_

{'alpha': 10.0}