In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
!pip install yfinance
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
import torch
import re
import yfinance as yf
import holidays
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load the tokenizer and the sequence-classification model from the
# "ProsusAI/finbert" checkpoint; SentimentAnalyzer reads both as globals.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch.nn.functional as F
def SentimentAnalyzer(doc):
    """Score text with the FinBERT model and return class probabilities.

    Args:
        doc: Text (or list of texts) handed unchanged to the module-level
            ``tokenizer``; inputs are padded and truncated to 512 tokens.

    Returns:
        A NumPy array holding the softmax of the model's logits over the
        last axis. Column order follows the model's logits
        (presumably ``model.config.id2label`` -- verify before relabelling).
    """
    pt_batch = tokenizer(doc, padding=True, truncation=True, max_length=512, return_tensors="pt")
    # Inference only: disable autograd so no computation graph (and the
    # activations it would keep alive) is built for a backward pass that
    # never happens.
    with torch.no_grad():
        outputs = model(**pt_batch)
        pt_predictions = F.softmax(outputs.logits, dim=-1)
    return pt_predictions.cpu().numpy()

In [None]:
# First news source: drop incomplete rows, rename 'stock' to 'ticker', and keep
# only the first whitespace-separated token of each 'date' value.
data1 = (
    pd.read_csv('/kaggle/input/massive-stock-news-analysis-db-for-nlpbacktests/analyst_ratings_processed.csv', index_col=0)
    .dropna()
    .rename(columns={'stock': 'ticker'})
    .assign(date=lambda frame: frame['date'].apply(lambda stamp: stamp.split()[0]))
)

# Second news source: drop incomplete rows first (while every column is still
# present), renumber the index, rename 'release_date' to 'date', then discard
# the columns that are not used downstream.
data2 = (
    pd.read_csv('/kaggle/input/us-equities-news-data/us_equities_news_dataset.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
    .rename(columns={'release_date': 'date'})
    .drop(columns=['category', 'content', 'provider', 'url', 'article_id'])
)

In [None]:
# Stack both news sources and keep only the first occurrence of each title.
data = pd.concat([data1, data2]).drop_duplicates(subset='title', keep='first')

In [None]:
# Ticker used both to filter the news frame and to fetch price history.
tickerSymbol = "MSFT"

In [None]:
# Map each date to the list of headline titles published for the chosen ticker.
tickerNews = data[data['ticker'] == tickerSymbol]
# A single groupby pass replaces re-filtering the whole frame once per date;
# sort=False keeps the dates in order of first appearance, as before.
tmpData = {
    day: titles.tolist()
    for day, titles in tickerNews.groupby('date', sort=False)['title']
}

In [None]:
# Step size of one calendar day.
ONE_DAY = datetime.timedelta(days=1)
# US holiday calendar provided by the `holidays` package.
HOLIDAYS_US = holidays.US()
def next_business_day(dateString, holiday_calendar=None):
    """Return the first weekday after ``dateString`` that is not a holiday.

    Args:
        dateString: Date formatted as 'YYYY-MM-DD'.
        holiday_calendar: Optional container of ``datetime.date`` values to
            skip (anything supporting ``in``). Defaults to the module-level
            ``HOLIDAYS_US`` calendar.

    Returns:
        A ``datetime.datetime`` (midnight) for the next non-weekend,
        non-holiday day strictly after ``dateString``.

    Raises:
        ValueError: If ``dateString`` does not match '%Y-%m-%d'.
    """
    if holiday_calendar is None:
        holiday_calendar = HOLIDAYS_US
    one_day = datetime.timedelta(days=1)
    candidate = datetime.datetime.strptime(dateString, '%Y-%m-%d') + one_day
    # weekday() is 5 for Saturday and 6 for Sunday. Comparing directly avoids
    # depending on holidays.WEEKEND, which is not exported by every release of
    # the holidays package.
    while candidate.weekday() >= 5 or candidate.date() in holiday_calendar:
        candidate += one_day
    return candidate

In [None]:
def findPercentageBySentences(sentenceList):
    """Average the sentiment scores of a batch of sentences.

    Args:
        sentenceList: Sentences passed to ``SentimentAnalyzer`` in one call.

    Returns:
        Dict with 'numArticles' (number of sentences) and the column-wise
        means stored as 'pos' (column 0), 'neg' (column 1) and 'neu'
        (column 2). Assumes that column order matches the model's label
        order -- TODO confirm.
    """
    meanScores = np.mean(SentimentAnalyzer(sentenceList), axis=0)
    return {
        'numArticles': len(sentenceList),
        'pos': meanScores[0],
        'neg': meanScores[1],
        'neu': meanScores[2],
    }

In [None]:
# Score every date's headlines; results accumulate in the dict as the loop
# advances, keyed by the same dates as tmpData.
dateSentimentGroups = {}
for day, headlines in tqdm(tmpData.items()):
    dateSentimentGroups[day] = findPercentageBySentences(headlines)

In [None]:
# Pair each date's sentiment scores with that date's Open/Close prices.
data = []
ticker = yf.Ticker(tickerSymbol)
hist = ticker.history(period="max")
skippedDays = 0
for i in tqdm(dateSentimentGroups):
    followingDay = next_business_day(i).strftime("%Y-%m-%d")
    try:
        sameDay = hist.loc[i]
        # The following business day's row is not used as a feature; the
        # lookup only keeps the existing rule that a date is dropped unless
        # both it and its following business day have price data.
        hist.loc[followingDay]
    except KeyError:
        # Only a missing price row is an expected reason to skip a date;
        # any other error now surfaces instead of being silently swallowed.
        skippedDays += 1
        continue
    sentiment = dateSentimentGroups[i]
    data.append([
        sentiment['numArticles'],
        sentiment['neu'],
        sentiment['pos'],
        sentiment['neg'],
        sameDay['Open'],
        sameDay['Close'],
    ])
print(f"Skipped {skippedDays} of {len(dateSentimentGroups)} dates without matching price rows")

In [None]:
# Tabulate the collected rows; the column names follow the order in which each
# row's values were appended.
df = pd.DataFrame(
    data,
    columns=['numArticles', 'neutral', 'positive', 'negative', 'Open', 'Close'],
)

In [None]:
# Display the assembled frame for a visual check.
df

In [None]:
# Model inputs are the three sentiment averages plus the opening price;
# the target is the closing price.
feature_columns = ['neutral', 'positive', 'negative', 'Open']
X = df[feature_columns]
y = df['Close']

In [None]:
# Hold out 10% of the rows for testing. The fixed seed makes the split (and
# every metric computed from it) repeatable across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Learn the normalization statistics from the training rows only, so the
# held-out test rows cannot influence the preprocessing.
normalizer = preprocessing.Normalization()
normalizer.adapt(np.array(X_train))

In [None]:
def build_and_compile_model(norm):
    """Assemble and compile the regression network.

    Args:
        norm: Layer placed first in the stack (the notebook passes its
            adapted Normalization layer).

    Returns:
        A compiled Sequential model:
        norm -> Dense(256) -> LeakyReLU -> Dense(256) -> LeakyReLU -> Dense(1),
        trained with mean squared error and the Adam optimizer.
    """
    model = Sequential()
    model.add(norm)
    # No input_dim on this layer: the previous hard-coded 3 disagreed with the
    # four feature columns selected for X, and the layer is not first in the
    # stack, so the width is left to be inferred from the data.
    model.add(Dense(256))
    # LeakyReLU as a layer rather than a lambda activation: an anonymous
    # function cannot be looked up by name when a saved model is reloaded.
    model.add(tf.keras.layers.LeakyReLU(alpha=0.01))
    model.add(Dense(256))
    model.add(tf.keras.layers.LeakyReLU(alpha=0.01))
    model.add(Dense(1, activation="linear"))
    # `learning_rate` is the supported name for the deprecated `lr` alias.
    model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=1e-3, decay=1e-3 / 200))
    return model

In [None]:
# Build the network with the adapted normalization layer as its first layer.
dnn_model = build_and_compile_model(normalizer)

In [None]:
# Print the layer-by-layer summary of the network.
dnn_model.summary()

In [None]:
# Stop training once val_loss has gone 200 epochs without improving;
# 20% of the training rows are set aside for validation.
es = EarlyStopping(
    patience=200,
    monitor='val_loss',
    mode='min',
    verbose=1,
)
history = dnn_model.fit(
    X_train,
    Y_train,
    epochs=1000,
    batch_size=100,
    validation_split=0.2,
    callbacks=[es],
    verbose=1,
)

In [None]:
def plot_loss(history, ylim=(0, 10)):
    """Plot training and validation loss per epoch on the current figure.

    Args:
        history: Object whose ``history`` dict holds 'loss' and 'val_loss'
            sequences (as returned by ``model.fit``).
        ylim: (low, high) limits for the y axis. Defaults to (0, 10), the
            previously hard-coded range; pass ``None`` to let matplotlib
            autoscale when the loss lies outside that window.
    """
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    if ylim is not None:
        plt.ylim(ylim)
    plt.xlabel('Epoch')
    plt.ylabel('Error [Closing Price]')
    plt.legend()
    plt.grid(True)

In [None]:
# Plot the training and validation loss recorded during fit.
plot_loss(history)

In [None]:
# Predict closing prices for the held-out test features.
preds = dnn_model.predict(X_test)

In [None]:
# Compute the compiled loss (mean squared error) on the held-out split.
dnn_model.evaluate(X_test, Y_test, verbose=1)

In [None]:
# Scatter predicted against actual closing prices for the held-out rows.
# plt.rcParams is the same settings object pylab exposes, so no extra import
# is needed in the middle of the notebook.
plt.rcParams['figure.figsize'] = 20, 10

# Dedicated names are used here on purpose: rebinding X / y would overwrite
# the feature matrix and target, breaking any re-run of the earlier cells.
predicted_close = np.asarray(preds).ravel()  # flatten predict()'s output to 1-D
actual_close = np.asarray(Y_test)
sample_index = np.arange(len(predicted_close))

# Plot both point clouds on the same axes
plt.scatter(sample_index, predicted_close, color='r', label='Predicted Close')
plt.scatter(sample_index, actual_close, color='g', label='Actual Close')

# Name the axes and the whole graph
plt.xlabel("Test sample")
plt.ylabel("Closing Price")
plt.title("Predicted vs Actual Close")

# The legend tells the two series apart by their color
plt.legend()

# Render the figure
plt.show()

In [None]:
# Save the trained model to finbert_msft.h5 in the current working directory.
dnn_model.save("finbert_msft.h5")