In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import tensorflow as tf
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import yfinance as yf
import holidays
import pickle

# NOTE(review): max_features is not referenced by any other visible cell —
# presumably the vocabulary size the tokenizer was built with; confirm or remove.
max_features = 20000
# Sequence length handed to pad_sequences(..., maxlen=maxlen) before prediction.
maxlen = 100
# Restore the pickled tokenizer; it is read as a global by findPercentageBySentences.
# SECURITY: pickle.load can execute arbitrary code while unpickling — only load
# tokenizer files that come from a trusted source.
with open('../input/lstmmodel/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install holidays

In [None]:
# Load the saved Keras model from its HDF5 file. It is bound to the global name
# `model`, which findPercentageBySentences calls via model.predict.
model = tf.keras.models.load_model('../input/lstmmodel/Model1-LSTM.h5')

In [None]:
# Source 1: analyst-ratings headlines. Drop incomplete rows, rename `stock` to
# `ticker`, and keep only the first whitespace-separated token of each `date`
# string (presumably the calendar-date part — TODO confirm against the CSV).
data1 = (
    pd.read_csv('/kaggle/input/massive-stock-news-analysis-db-for-nlpbacktests/analyst_ratings_processed.csv', index_col=0)
    .dropna()
    .rename(columns={'stock': 'ticker'})
    .assign(date=lambda frame: frame['date'].map(lambda stamp: stamp.split()[0]))
)

# Source 2: US equities news. Drop incomplete rows, renumber the index, rename
# `release_date` to `date`, and discard the columns that are not used later.
data2 = (
    pd.read_csv('/kaggle/input/us-equities-news-data/us_equities_news_dataset.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
    .rename(columns={'release_date': 'date'})
    .drop(columns=['category', 'content', 'provider', 'url', 'article_id'])
)

In [None]:
# Stack both news sources into one frame with a fresh, unique integer index.
# De-duplicate per (ticker, title): de-duplicating on `title` alone would keep a
# shared headline only for the first ticker it appears under and silently drop it
# for every other ticker, which skews the per-ticker filtering done later.
data = (
    pd.concat([data1, data2], ignore_index=True)
    .drop_duplicates(subset=['ticker', 'title'], keep='first')
)

In [None]:
# Ticker analysed by the rest of the notebook: it filters the news rows and is
# the symbol passed to yf.Ticker for price history.
tickerSymbol = "MSFT"

In [None]:
# Map each date to the list of headlines published for `tickerSymbol` on that date.
# The ticker filter is applied once and the rows are grouped by date in a single
# pass (instead of rescanning the whole frame for every date). sort=False keeps
# the dates in order of first appearance, matching iteration over .unique().
tickerNews = data.loc[data['ticker'] == tickerSymbol, ['date', 'title']]
tmpData = {
    date: titles.tolist()
    for date, titles in tqdm(tickerNews.groupby('date', sort=False)['title'])
}

In [None]:
# Step size used by next_business_day when scanning forward through the calendar.
ONE_DAY = datetime.timedelta(days=1)
# Holiday calendar consulted by next_business_day via `date in HOLIDAYS_US`.
# NOTE(review): holidays.US() is a public-holiday calendar; stock exchanges may
# observe a different set of closures — confirm this is the intended calendar.
HOLIDAYS_US = holidays.US()
def next_business_day(dateString):
    """Return the first day after ``dateString`` that is neither a weekend nor a holiday.

    Parameters
    ----------
    dateString : str
        Calendar date formatted as ``'%Y-%m-%d'``.

    Returns
    -------
    datetime.datetime
        The first later date (time component 00:00) that falls on Monday-Friday
        and is not contained in the module-level ``HOLIDAYS_US`` calendar.

    Raises
    ------
    ValueError
        If ``dateString`` does not match the ``'%Y-%m-%d'`` format.
    """
    candidate = datetime.datetime.strptime(dateString, '%Y-%m-%d') + ONE_DAY
    # datetime.weekday(): Monday == 0 ... Sunday == 6, so 5 and 6 are the weekend.
    # Using the stdlib convention avoids depending on a library-provided constant.
    while candidate.weekday() >= 5 or candidate in HOLIDAYS_US:
        candidate += ONE_DAY
    return candidate

In [None]:
def findPercentageBySentences(sentenceList):
    """Average the model's three class scores over a list of sentences.

    All sentences are tokenized, padded to ``maxlen`` and scored with a single
    ``model.predict`` call (rather than one call per sentence).

    Parameters
    ----------
    sentenceList : list of str
        Sentences (headlines) to score.

    Returns
    -------
    dict
        ``{'numArticles', 'pos', 'neg', 'neu'}`` where ``numArticles`` is the
        number of sentences and the other values are mean scores multiplied by
        100. For an empty ``sentenceList`` the three scores are NaN, because an
        average over zero sentences is undefined.

    Notes
    -----
    Output column 0 is read as negative, 1 as neutral and 2 as positive.
    NOTE(review): this order is assumed from the indexing used here — confirm it
    matches how the model was trained.
    """
    numArticles = len(sentenceList)
    if numArticles == 0:
        undefined = float('nan')
        return {'numArticles': 0, 'pos': undefined, 'neg': undefined, 'neu': undefined}

    sequences = tokenizer.texts_to_sequences(list(sentenceList))
    X = pad_sequences(sequences, maxlen=maxlen)
    # float64 keeps the averaged values close to summing Python floats one by one.
    scores = np.asarray(model.predict(X, verbose=0), dtype=np.float64)
    negAvg, neuAvg, posAvg = (scores[:, :3].mean(axis=0) * 100).tolist()
    return {'numArticles': numArticles, 'pos': posAvg, 'neg': negAvg, 'neu': neuAvg}

In [None]:
# Score every date's headlines: date -> dict returned by findPercentageBySentences.
dateSentimentGroups = {
    day: findPercentageBySentences(headlines)
    for day, headlines in tqdm(tmpData.items())
}

In [None]:
# Build one row per news date: sentiment scores plus the percentage move from that
# day's Open to the next business day's Close.
# NOTE: `data` is rebound here from the news DataFrame to a plain list of rows;
# the name is kept because the DataFrame construction in the next cell reads it.
data = []
skippedDates = []  # dates with no price row for the day itself or the next business day
ticker = yf.Ticker(tickerSymbol)
hist = ticker.history(period="max")
for i in tqdm(dateSentimentGroups):
    nextDayKey = next_business_day(i).strftime("%Y-%m-%d")
    try:
        prevDay = hist.loc[i]
        nextDay = hist.loc[nextDayKey]
    except KeyError:
        # Only a missing price row is an expected reason to skip a date; any
        # other error is allowed to surface instead of being silently swallowed.
        skippedDates.append(i)
        continue
    percentageChange = ((nextDay['Close'] - prevDay['Open']) / prevDay['Open']) * 100
    scores = dateSentimentGroups[i]
    data.append([i, scores['numArticles'], scores['neu'], scores['pos'], scores['neg'], percentageChange])

In [None]:
# Tabulate the rows collected in `data`; the column order matches the order in
# which each row list was appended (date, article count, neu, pos, neg, change).
df = pd.DataFrame(columns =['date', 'numArticles', 'neutral', 'positive','negative','percentageChange'], data=data)


In [None]:
df

In [None]:
from sklearn import linear_model


# Explanatory variables (article count and the three sentiment percentages) and
# the target (percentage price change).
X = df.loc[:, ['numArticles', 'neutral', 'positive', 'negative']]
y = df.loc[:, 'percentageChange']

# Fit a plain linear regression; the fitted estimator is the cell's displayed value.
regr = linear_model.LinearRegression()
regr.fit(X, y)

In [None]:
# Predict for one hand-entered observation. The row is wrapped in a DataFrame with
# the same column names the regression was fitted on, so scikit-learn can match
# features by name instead of warning about missing feature names.
sample = pd.DataFrame(
    [[10, 8.078922e-04, 69.504652, 30.494541]],
    columns=['numArticles', 'neutral', 'positive', 'negative'],
)
y_pred = regr.predict(sample)

print(y_pred)

In [None]:
X = df[['numArticles','neutral','positive','negative']]
y = df['percentageChange']


from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the four inputs into all polynomial terms up to degree 3.
poly = PolynomialFeatures(degree = 3)
X_poly = poly.fit_transform(X)

# `poly` is deliberately NOT re-fitted on X_poly: doing so would make the
# transformer expect the expanded feature count instead of the four raw inputs.
# LinearRegression is imported above; the notebook previously only had the
# `linear_model` module in scope, so the bare name raised NameError.
lin2 = LinearRegression()
lin2.fit(X_poly, y)

In [None]:
# Predict the percentage change for one hand-entered row with the polynomial model.
# The four values follow the column order used to build X:
# numArticles, neutral, positive, negative.
# NOTE(review): fit_transform re-fits `poly` on this single row before expanding
# it; presumably harmless because the expansion depends only on the number of
# input features — confirm, or switch to poly.transform once `poly` is known to
# be fitted on the four raw inputs.
lin2.predict(poly.fit_transform([[1,9.756931e-08,11.408260,88.591737]]))


In [None]:
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm


# Four explanatory variables for the multiple regression. To run a simple
# regression on a single variable, select just that column instead
# (e.g. X = df[['positive']]); more variables can be added inside the brackets.
X = df[['numArticles','neutral','positive','negative']]
Y = df['percentageChange']

# with sklearn
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# prediction with sklearn
# The sample is passed as a DataFrame carrying the fitted column names so that
# scikit-learn can validate features by name instead of emitting a warning.
sample = pd.DataFrame([[10, 8.078922e-04, 69.504652,30.494541]], columns=X.columns)
print ('Predicted Percent Change: \n', regr.predict(sample))

# with statsmodels
# statsmodels does not add an intercept on its own, so a constant column is added.
# A separate name is used so X keeps holding only the four feature columns.
X_const = sm.add_constant(X) # adding a constant

# Bound to `ols_model` rather than `model`, so the Keras model that
# findPercentageBySentences reads from the global `model` is not overwritten.
ols_model = sm.OLS(Y, X_const).fit()
predictions = ols_model.predict(X_const)

print_model = ols_model.summary()
print(print_model)