## Load the data

In [None]:
# %pip install pandas
# %pip install datasets
# %pip install yfinance
# %pip install tqdm
# %pip install matplotlib

In [None]:
import yfinance as yf
import pandas as pd
import datetime
from datasets import load_dataset

In [None]:
# Load the "edarchimbaud/news-stocks" dataset and switch its output format
# to pandas.
dataset = load_dataset("edarchimbaud/news-stocks")
dataset.set_format(type='pandas')

# Slice the whole 'train' split; with the pandas format set above this
# presumably yields a single DataFrame (later cells treat `df` as one).
df = dataset['train'][:]
df

In [None]:
from tqdm import tqdm

# Distinct ticker symbols, in order of first appearance in the news frame.
tickers = df['symbol'].drop_duplicates().tolist()

def returns(date, price_table=None):
    """Return the percentage move from the previous close to the next open.

    The return is measured from the close of the last trading day before the
    news day to the open of the first trading day after it, so that the move
    includes the moment the news was made public, whether that happened
    during or outside trading hours.

    Args:
        date: Publication timestamp of the article. It is compared directly
            with the price index, so it presumably has to be tz-aware (UTC)
            like that index -- TODO confirm against the dataset.
        price_table: Daily price frame with 'Open' and 'Close' columns and a
            DatetimeIndex sorted in ascending order. Defaults to the
            module-level ``prices`` frame that the per-ticker download loop
            rebinds before calling this function.

    Returns:
        The return in percent, or NaN when ``date`` is missing or the price
        data does not reach far enough before or after it.
    """
    table = prices if price_table is None else price_table
    not_available = float('nan')
    if pd.isna(date):
        return not_available

    trading_days = table.index
    row_count = len(trading_days)
    news_day = date.date()

    # Last row strictly before the timestamp. When that row belongs to the
    # news day itself (article published after the row's timestamp), step
    # back one more row to reach the previous trading day.
    prev_pos = trading_days.searchsorted(date, side='left') - 1
    if prev_pos >= 0 and trading_days[prev_pos].date() == news_day:
        prev_pos -= 1

    # First row strictly after the timestamp, skipping a row that still
    # falls on the news day.
    next_pos = trading_days.searchsorted(date, side='right')
    if next_pos < row_count and trading_days[next_pos].date() == news_day:
        next_pos += 1

    # Without this guard a position of -1 would silently wrap around to the
    # last row and an overshoot would raise IndexError.
    if prev_pos < 0 or next_pos >= row_count:
        return not_available

    prev_close = table['Close'].iloc[prev_pos]
    next_open = table['Open'].iloc[next_pos]
    return (next_open - prev_close) / prev_close * 100


# Attach a 'returns' value to every article, downloading prices one ticker
# at a time. Tickers without any price data are collected and their
# articles removed in a single pass after the loop, so `df` is never
# replaced by a filtered slice while rows are still being assigned.
unpriced_tickers = []
for ticker in tqdm(tickers):
    # Build the row mask once per ticker instead of re-filtering the whole
    # frame for every use.
    ticker_rows = df['symbol'] == ticker
    publish_times = df.loc[ticker_rows, 'publish_time']

    # Pad the download window by a week on both sides, presumably so that
    # a trading day exists before the first and after the last article.
    start_date = publish_times.min() - datetime.timedelta(days=7)
    end_date = publish_times.max() + datetime.timedelta(days=7)

    # `returns` reads the module-level name `prices`, so it must be rebound
    # here before the function is applied.
    prices = yf.download(ticker, start_date, end_date, progress=False)
    if prices.empty:
        unpriced_tickers.append(ticker)
        continue

    # NOTE(review): newer yfinance releases can return two-level columns
    # (field, ticker) even for a single ticker; keep only the field level
    # so 'Open'/'Close' lookups yield scalars. No-op for flat columns.
    if isinstance(prices.columns, pd.MultiIndex):
        prices.columns = prices.columns.get_level_values(0)

    # Make the index tz-aware (UTC) so it can be compared with the
    # publication timestamps.
    prices.index = pd.to_datetime(prices.index, format='%Y-%m-%d', utc=True)
    df.loc[ticker_rows, 'returns'] = publish_times.apply(returns)

if unpriced_tickers:
    df = df[~df['symbol'].isin(unpriced_tickers)]

df

In [None]:
# Report the smallest and largest per-article return (in percent).
article_returns = df['returns']
print(article_returns.min())
print(article_returns.max())

In [None]:
import matplotlib.pyplot as plt

# Histogram of per-article returns: full range on the left, the same
# histogram restricted to the -25..25 range on the right.
fig, (full_range_ax, zoomed_ax) = plt.subplots(1, 2, figsize=(10, 5))
article_returns = df['returns']
article_returns.plot.hist(ax=full_range_ax, bins=1000)
article_returns.plot.hist(ax=zoomed_ax, bins=1000, xlim=(-25, 25))
plt.show()

In [None]:
# Count articles whose return is below -3, above 3, and everything else
# (rows without a return end up in the last figure).
article_returns = df['returns']
n_below = int((article_returns < -3).sum())
n_above = int((article_returns > 3).sum())
print(n_below)
print(n_above)
print(len(df) - n_below - n_above)

In [None]:
# Keep only the columns used from here on.
dataset = df.drop(columns=['publisher', 'url', 'uuid'])

# Reduce every publication timestamp to its calendar date (in UTC).
publish_timestamps = pd.to_datetime(dataset['publish_time'], format='%Y-%m-%d %H:%M:%S', utc=True)
dataset['publish_time'] = publish_timestamps.dt.date

# One row per (symbol, day): titles and bodies are concatenated with a
# single space and the returns are averaged.
# (Aggregating 'title' and 'body' with `list` instead would keep the
# individual articles separate.)
daily_aggregations = {'title': ' '.join, 'body': ' '.join, 'returns': 'mean'}
dataset = dataset.groupby(['symbol', 'publish_time']).agg(daily_aggregations).reset_index()

dataset

In [None]:
# Histogram of the per-day mean returns: full range on the left, the same
# histogram restricted to the -25..25 range on the right.
fig, (full_range_ax, zoomed_ax) = plt.subplots(1, 2, figsize=(10, 5))
daily_returns = dataset['returns']
daily_returns.plot.hist(ax=full_range_ax, bins=1000)
daily_returns.plot.hist(ax=zoomed_ax, bins=1000, xlim=(-25, 25))
plt.show()

In [None]:
# Count (symbol, day) rows whose mean return is below -3, above 3, and
# everything else (rows without a return end up in the last figure).
daily_returns = dataset['returns']
n_below = int((daily_returns < -3).sum())
n_above = int((daily_returns > 3).sum())
print(n_below)
print(n_above)
print(len(dataset) - n_below - n_above)

In [None]:
# Label each (symbol, day) row by the direction of its mean return:
# (-inf, -3] -> 'decrease', (-3, 3] -> 'stable', (3, inf) -> 'increase'.
# The outer edges are open-ended so that extreme moves (for example a
# return above 100 %) still receive a label instead of NaN.
dataset['trend'] = pd.cut(
    dataset['returns'],
    bins=[-float('inf'), -3, 3, float('inf')],
    labels=['decrease', 'stable', 'increase'],
)

dataset

In [None]:
# Down-sample the 'stable' class by dropping a random 90 % of its rows.
# A fixed seed keeps the exported dataset reproducible between runs.
# NOTE: re-running this cell removes a further 90 % of the 'stable' rows
# that are still left.
SAMPLING_SEED = 42
stable_rows = dataset[dataset['trend'] == 'stable']
rows_to_drop = stable_rows.sample(frac=0.9, random_state=SAMPLING_SEED).index
dataset = dataset.drop(rows_to_drop).reset_index(drop=True)

dataset

In [None]:
# Spot-check the rows of a single ticker.
dataset.loc[dataset['symbol'] == 'ABEO']

In [None]:
# Bar chart of how many rows carry each trend label.
trend_counts = dataset['trend'].value_counts()
trend_counts.plot.bar()

In [None]:
# Write the final table to disk without the positional index column.
output_path = 'dataset.csv'
dataset.to_csv(output_path, index=False)