1) Извлекаем тикеры из новости

2) Смотрим цену открытия и закрытия на дату новости

3) Предсказываем, упадёт или вырастет цена акции в день выхода новости


## Извлекаем тикеры из новостей

In [0]:
# Unpack data.zip into the data/ directory (-q keeps unzip quiet).
!unzip -q data.zip -d data

In [0]:
# Install the iexfinance client; it is used further down to fetch historical prices.
!pip install iexfinance

In [0]:
from tqdm import tqdm
import re
import os
import pandas as pd
from datetime import datetime

import iexfinance as fn

In [0]:
# Read the news articles from disk.
#
# Every `<name>.csv` file in `directory` carries the metadata of one article
# and is expected to have a sibling `<name>.txt` file with the article text.
# An article is kept only when both files are present, so `articles` and
# `timestamps` always stay aligned index-by-index.

directory = 'data/data'
files = os.listdir(directory)

articles = []
timestamps = []  # Of all the metadata, only the timestamps are of interest
token_filenames = []

for file in tqdm(files):

    if not file.endswith(".csv"):
        continue

    # Only the 'timestamp' value of the first row is used.
    first_timestamp = pd.read_csv(os.path.join(directory, file))['timestamp'][0]

    try:
        with open(os.path.join(directory, file.replace('.csv', '.txt')), 'r') as f:
            text = f.read().replace('\n', ' ')  # flatten the article to one line
    except FileNotFoundError:
        # No text for this metadata file: skip the pair entirely instead of
        # appending the timestamp and then copying the list to drop it again.
        continue

    articles.append(text)
    timestamps.append(first_timestamp)

assert len(articles) == len(timestamps)

In [0]:
# Extract candidate tickers from each article and filter them.
#
# A candidate is any text found inside parentheses. It is kept when it is
# short enough, contains ':' (so only exchange-qualified tickers are taken,
# not e.g. indices) and does not contain the stopword (expressions such as
# "08:00 GMT" would otherwise slip through).

max_ticker_length = 15
stopword = 'GMT'

parenthesized = re.compile(r"\(([^\)]+)\)")

tickers = []
for article in articles:
    possible_tickers = [
        candidate
        for candidate in parenthesized.findall(article)
        if len(candidate) <= max_ticker_length
        and ':' in candidate
        and stopword not in candidate
    ]
    tickers.append(possible_tickers)

assert len(tickers) == len(articles)

In [0]:
# Drop the news items for which no tickers were found

def remove_empty_tickers(articles, timestamps, tickers, targets=None):
    """Keep only the entries whose ticker list is non-empty.

    The inputs are parallel sequences: position ``i`` of each describes the
    same news item.

    Args:
        articles: article texts.
        timestamps: one timestamp per article.
        tickers: one list of tickers per article; an empty list marks the
            article for removal.
        targets: optional per-article targets, filtered the same way when
            given and non-empty.

    Returns:
        A tuple ``(articles, timestamps, tickers, targets)`` of new lists
        restricted to the kept entries. The targets list is empty when
        ``targets`` was not supplied.
    """
    # First decide which positions survive, then slice every sequence by them.
    kept = [
        (idx, text)
        for idx, text in enumerate(articles)
        if len(tickers[idx]) > 0
    ]

    filtered_articles = [text for _, text in kept]
    filtered_timestamps = [timestamps[idx] for idx, _ in kept]
    filtered_tickers = [tickers[idx] for idx, _ in kept]

    filtered_targets = []
    if kept and targets:
        filtered_targets = [targets[idx] for idx, _ in kept]

    assert len(filtered_articles) == len(filtered_timestamps) == len(filtered_tickers)
    return filtered_articles, filtered_timestamps, filtered_tickers, filtered_targets

In [0]:
# Keep only the news items that mention at least one candidate ticker
# (no targets exist yet, so the fourth return value is discarded).
articles, timestamps, tickers, _ = remove_empty_tickers(
    articles=articles,
    timestamps=timestamps,
    tickers=tickers,
)
print(len(articles))

## Собираем датасет с динамикой стоимости акций

https://iextrading.com/developer/docs/

https://addisonlynch.github.io/iexfinance/stable/stocks.html

Target:

<SELL\> - отрицательная динамика

<BUY\> - положительная динамика

<KEEP\> - нейтральная динамика (изменение цены от открытия до закрытия меньше 0.1% от цены закрытия)

In [0]:
# Label every (article, ticker) pair with the price move on the news day:
# '<BUY>' when close > open, '<SELL>' when close < open, and '<KEEP>' when the
# move is smaller than `neutral_threshold` relative to the closing price.
neutral_threshold = 0.001

targets = []
filtered_tickers = []
# tqdm wraps the list itself (not the enumerate object) so that it knows the
# total and can show a percentage and an ETA.
for i, timestamp in enumerate(tqdm(timestamps)):

    # The date is the part of the timestamp before the first space.
    day_string = timestamp.split(' ')[0]
    day = datetime.strptime(day_string, '%Y-%m-%d')

    target = []
    current_filtered_tickers = []
    for ticker in tickers[i]:
        short_ticker = ticker.split(':')[-1]  # E.g. NASDAQ:AAPL -> AAPL

        try:
            price = fn.get_historical_data(symbols=short_ticker,
                                           start=day, end=day)[short_ticker][day_string]
            change = price['close'] - price['open']
        except (fn.utils.exceptions.IEXSymbolError, KeyError):
            # Either the ticker is unknown to the data source, or the
            # response has no entry for this symbol/day (an empty dict may
            # come back). Such tickers are simply left out.
            continue
        else:
            if abs(change / price['close']) < neutral_threshold:
                target.append('<KEEP>')
            else:
                target.append('<BUY>' if change > 0 else '<SELL>')
            current_filtered_tickers.append(ticker)

    # One list of labels and one list of resolved tickers per article,
    # aligned element-by-element.
    targets.append(target)
    filtered_tickers.append(current_filtered_tickers)


# Drop the articles for which no ticker could be priced.
articles, timestamps, tickers, targets = remove_empty_tickers(
    articles, timestamps, filtered_tickers, targets)
print(len(articles))

In [0]:
import pickle

# Persist the assembled dataset; the four lists are aligned index-by-index.
data = {
    'articles': articles,
    # Store the full `timestamps` list. The similarly named `timestamp` is
    # just the leftover loop variable and holds a single value.
    'timestamp': timestamps,
    'tickers': tickers,
    'targets': targets
}

# The context manager guarantees the file is flushed and closed before it is
# read or downloaded by anything else.
with open('data.pkl', 'wb') as f:
    pickle.dump(data, f)

In [0]:
# Send the pickled dataset from the Colab runtime to the user's browser.
# NOTE: this import rebinds the name `files`, which the article-loading cell
# uses for the directory listing.
from google.colab import files

files.download('data.pkl')