In [1]:
import pandas as pd
from transformers import pipeline, PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration
from all_companies_news import *
from tqdm import tqdm
from datetime import datetime, timedelta
import yfinance as yf
import math
from math import pi

In [2]:
# Text-classification pipeline used to label each news summary; the tokenizer
# is loaded from the same checkpoint as the model, and inputs are truncated
# at max_length=600.
model_path = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
model = pipeline(
    "text-classification",
    model=model_path,
    tokenizer=model_path,
    max_length=600,
    truncation=True,
)

In [3]:
# Load the Pegasus tokenizer and conditional-generation model from one checkpoint.
# NOTE(review): neither `tokenizer` nor `summarizer` is referenced in the cells
# visible here - confirm they are used elsewhere before keeping this download.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
summarizer = PegasusForConditionalGeneration.from_pretrained(model_name) # for the TensorFlow model, presumably swap in the imported TFPegasusForConditionalGeneration - TODO confirm

In [4]:
# Ticker symbols to process, in the order that later defines their numeric ids.
companies = [
    'AAPL', 'ABBV', 'ABT', 'ACN', 'ADBE', 'AIG', 'AMD', 'AMGN', 'AMT', 'AMZN',
    'AVGO', 'AXP', 'BA', 'BAC', 'BK', 'BKNG', 'BLK', 'BMY', 'BRK.B', 'C',
    'CAT', 'CHTR', 'CL', 'CMCSA', 'COF', 'COP', 'COST', 'CRM', 'CSCO', 'CVS',
    'CVX', 'DHR', 'DIS', 'DOW', 'DUK', 'EMR', 'EXC', 'F', 'FDX', 'GD',
    'GE', 'GILD', 'GM', 'GOOG', 'GOOGL', 'GS', 'HD', 'HON', 'IBM', 'INTC',
    'JNJ', 'JPM', 'KHC', 'KO', 'LIN', 'LLY', 'LMT', 'LOW', 'MA', 'MCD',
]

In [5]:
# Parallel accumulators: the labelling loop appends one entry per news article
# to each of these lists.
day_of_week, day, month, year = [], [], [], []
sentiment, output_class, symbol = [], [], []

In [None]:
# Yahoo Finance spells share-class tickers with a hyphen, so the dotted symbol
# used as a key in `comp`/`companies` has to be translated before downloading;
# otherwise every download for it comes back empty.
YAHOO_TICKER_OVERRIDES = {'BRK.B': 'BRK-B'}

# Upper bound on how many one-day adjustments a price window may receive before
# the article is given up on. The unbounded loops this replaces never ended
# when no data could ever arrive (unknown ticker, article newer than the last
# trading session).
MAX_WINDOW_SHIFTS = 10

DATE_FMT = "%Y-%m-%d"
ONE_DAY = timedelta(days=1)


def _download_prices(ticker, window_start, window_end):
    """Download price rows for `ticker` for the window given as two datetimes."""
    return yf.download(
        ticker,
        start=window_start.strftime(DATE_FMT),
        end=window_end.strftime(DATE_FMT),
    )


def _price_direction(ticker, act_date):
    """Label the price move associated with an article published at `act_date`.

    Weekend articles (weekday 5 or 6) compare the first 'Close' in a window
    starting on the preceding Friday with the second 'Close' in that window
    (normally Friday vs. Monday). All other articles compare 'Open' with
    'Close' of the first row found at or after the article date.

    Returns:
        1 if the later price is strictly higher, 0 otherwise, or None when no
        usable price rows were found within MAX_WINDOW_SHIFTS adjustments.
    """
    weekday = act_date.weekday()
    if weekday in (5, 6):
        # Saturday is one day after Friday, Sunday two; both windows end on
        # the date three days after the Saturday of that weekend.
        days_since_friday = weekday - 4
        window_start = act_date - days_since_friday * ONE_DAY
        window_end = act_date + (4 - days_since_friday) * ONE_DAY
        data = _download_prices(ticker, window_start, window_end)
        # Reach further back until at least one row is available.
        for _ in range(MAX_WINDOW_SHIFTS):
            if len(data) != 0:
                break
            window_start -= ONE_DAY
            data = _download_prices(ticker, window_start, window_end)
        if len(data) == 0:
            return None
        start_price = data['Close'].values[0]
        # Reach further forward until a second row is available.
        for _ in range(MAX_WINDOW_SHIFTS):
            if len(data) != 1:
                break
            window_end += ONE_DAY
            data = _download_prices(ticker, window_start, window_end)
        if len(data) < 2:
            return None
        end_price = data['Close'].values[1]
    else:
        window_start = act_date
        window_end = act_date + ONE_DAY
        data = _download_prices(ticker, window_start, window_end)
        # No rows for this date: slide the one-day window forward.
        for _ in range(MAX_WINDOW_SHIFTS):
            if len(data) != 0:
                break
            window_start += ONE_DAY
            window_end += ONE_DAY
            data = _download_prices(ticker, window_start, window_end)
        if len(data) == 0:
            return None
        start_price = data['Open'].values[0]
        end_price = data['Close'].values[0]
    return 1 if end_price - start_price > 0 else 0


# Build one row per news article: calendar features, sentiment label, symbol
# and the up/down class. `comp` is not defined in the visible cells; presumably
# it is provided by the star import from all_companies_news - TODO confirm.
skipped_articles = 0
for c in tqdm(companies):
    yahoo_ticker = YAHOO_TICKER_OVERRIDES.get(c, c)
    for news in comp[c]:
        # NOTE(review): fromtimestamp() interprets the epoch value in the local
        # timezone of the machine running the notebook - confirm that is intended.
        act_date = datetime.fromtimestamp(news['datetime'])
        label = _price_direction(yahoo_ticker, act_date)
        if label is None:
            # No price data: drop the article instead of looping forever.
            skipped_articles += 1
            continue
        score = model(news['summary'])
        # Append to every list in one place, only once the label is known, so
        # the parallel lists can never end up with different lengths.
        day_of_week.append(act_date.weekday())
        day.append(act_date.day)
        month.append(act_date.month)
        year.append(act_date.year)
        sentiment.append(score[0]['label'])
        symbol.append(c)
        output_class.append(label)

if skipped_articles:
    print(f"Skipped {skipped_articles} article(s) with no usable price data")

In [29]:
# Start from an empty frame; the following cell fills it one column at a time.
temp = pd.DataFrame()

In [30]:
# Copy the per-article lists into `temp`, one column each, in a fixed order.
raw_feature_columns = {
    'day_of_week': day_of_week,
    'day': day,
    'month': month,
    'year': year,
    'sentiment': sentiment,
    'symbol': symbol,
    'output_class': output_class,
}
for column_name, column_values in raw_feature_columns.items():
    temp[column_name] = column_values

In [32]:
def one_hot_sentiment(sentiment, neg:list, neu:list, pos:list):
    """Append a one-hot encoding of `sentiment` to the three given lists.

    Exactly one of `neg`, `neu`, `pos` receives 1 and the other two receive 0.
    'negative' selects `neg`, 'positive' selects `pos`, and every other value
    selects `neu`. The lists are modified in place; nothing is returned.
    """
    is_negative = sentiment == 'negative'
    is_positive = (not is_negative) and sentiment == 'positive'
    neg.append(1 if is_negative else 0)
    neu.append(0 if (is_negative or is_positive) else 1)
    pos.append(1 if is_positive else 0)


In [34]:
def transform_month(column):
  """Encode month numbers as sine/cosine values on a 12-step cycle.

  Returns a tuple (sin_values, cos_values) of lists with one float per
  element of `column`, each computed from the angle 2*pi*x/12.
  """
  months_in_year = 12

  def angle_of(month_number):
    return (2*pi*month_number)/months_in_year

  sines = list(map(math.sin, map(angle_of, list(column))))
  cosines = list(map(math.cos, map(angle_of, list(column))))
  return sines, cosines

In [35]:
def transformation(column, period=None):
  """Encode an integer-valued cyclic column as sine/cosine values.

  Args:
    column: pandas Series (or any iterable offering .max() and .min()) of
      integer positions on a cycle, e.g. day of month or weekday number.
    period: Length of the cycle. When omitted it is derived from the data as
      column.max() - column.min() + 1, i.e. the number of integer slots
      spanned by the column. Pass it explicitly (7 for weekdays, 31 for days
      of the month) to make the encoding independent of which values happen
      to be present.

  Returns:
    Tuple (sin_values, cos_values) of lists, one float per element, computed
    from the angle 2*pi*x/period.

  Raises:
    ValueError: if `period` is zero or negative.
  """
  if period is None:
    # Using the maximum alone as the period puts 0 and the maximum on the same
    # angle (weekday 0 and weekday 6 became indistinguishable) and divides by
    # zero for an all-zero column. Counting the slots fixes both and leaves
    # columns that start at 1 unchanged.
    period = column.max() - column.min() + 1
  if period <= 0:
    raise ValueError("period must be positive, got {!r}".format(period))
  angles = [(2*pi*x)/period for x in list(column)]
  sin_values = [math.sin(angle) for angle in angles]
  cos_values = [math.cos(angle) for angle in angles]
  return sin_values, cos_values

In [51]:
# One-hot sentiment columns; one_hot_sentiment() appends to all three per row.
neg, neu, pos = [], [], []

In [52]:
# Turn each calendar column of `temp` into a pair of sine/cosine lists.
# Day and weekday go through transformation(); month uses transform_month(),
# which has a fixed cycle length of 12.
day_sin, day_cos = transformation(temp['day'])
month_sin, month_cos = transform_month(temp['month'])
day_of_week_sin, day_of_week_cos = transformation(temp['day_of_week'])

In [53]:
# Map every ticker to its 1-based position in `companies`.
symbol_dict = {
    ticker: position
    for position, ticker in enumerate(companies, start=1)
}

In [54]:
# Walk the rows of `temp` once: extend the one-hot sentiment lists and look up
# the numeric id of each row's ticker.
symbol_numerical = []
for sentiment_label, ticker in zip(temp['sentiment'], temp['symbol']):
    one_hot_sentiment(sentiment_label, neg, neu, pos)
    symbol_numerical.append(symbol_dict[ticker])

In [55]:
# Assemble the final feature table; columns are added in this exact order.
df_out = pd.DataFrame()
output_columns = [
    ('day_sin', day_sin),
    ('day_cos', day_cos),
    ('month_sin', month_sin),
    ('month_cos', month_cos),
    ('year', temp['year']),
    ('day_of_week_sin', day_of_week_sin),
    ('day_of_week_cos', day_of_week_cos),
    ('symbol', symbol_numerical),
    ('negative', neg),
    ('neutral', neu),
    ('positive', pos),
    ('class', output_class),
]
for column_name, column_values in output_columns:
    df_out[column_name] = column_values

In [58]:
# Write the feature table to 'large.csv' (relative path) without the index column.
df_out.to_csv('large.csv', index=False)