## Извлекаем тикеры из новостей

In [0]:
# Unpack the data archive quietly (-q) into the data_fr/ directory (-d).
!unzip -q data_fresh.zip -d data_fr

In [55]:
# Install the iexfinance client library that this notebook imports.
# NOTE(review): the version is not pinned — consider pinning it so re-runs stay reproducible.
!pip install iexfinance



In [0]:
from tqdm import tqdm
import re
import os
import pandas as pd
from datetime import datetime, timedelta

import iexfinance as fn

In [106]:
# Read the news articles from disk.
# Each article is stored as a pair of files: <name>.csv (metadata) and
# <name>.txt (article text). A pair is used only when both parts exist.

directory = 'data_fr/data_fresh'
files = os.listdir(directory)

articles = []
timestamps = []  # Of all the metadata, only the timestamps are of interest
token_filenames = []

for file in tqdm(files[:800]):

    if not file.endswith(".csv"):
        continue

    # Swap only the extension; str.replace would also rewrite any other
    # '.csv' occurrence inside the file name.
    text_path = os.path.join(directory, os.path.splitext(file)[0] + '.txt')

    # Read the text first, so nothing is appended (and nothing has to be
    # rolled back) when the article text is missing.
    try:
        with open(text_path, 'r') as f:
            text = f.read().replace('\n', ' ')
    except FileNotFoundError:
        continue  # metadata without article text — skip the pair

    timestamp = pd.read_csv(os.path.join(directory, file))['timestamp'][0]

    articles.append(text)
    timestamps.append(timestamp)

# The two lists are parallel: timestamps[i] belongs to articles[i]
assert len(articles) == len(timestamps)




  0%|          | 0/800 [00:00<?, ?it/s][A[A[A


 26%|██▌       | 206/800 [00:00<00:00, 2049.68it/s][A[A[A


 59%|█████▉    | 474/800 [00:00<00:00, 2198.82it/s][A[A[A


 92%|█████████▏| 736/800 [00:00<00:00, 2296.02it/s][A[A[A


100%|██████████| 800/800 [00:00<00:00, 2313.15it/s][A[A[A

In [0]:
# Extract candidate tickers from the articles and filter them

max_ticker_length = 15
stopword = 'GMT'

tickers = []
for article in articles:

    # Every parenthesised fragment is a candidate. A candidate is kept when it
    # is short enough, contains ':' (exchange-listed tickers only, not e.g.
    # indices) and does not contain the stopword (expressions such as
    # "08:00 GMT" show up in the texts).
    possible_tickers = [
        candidate
        for candidate in re.findall(r"\(([^\)]+)\)", article)
        if len(candidate) <= max_ticker_length
        and ':' in candidate
        and stopword not in candidate
    ]

    tickers.append(possible_tickers)

# One (possibly empty) ticker list per article
assert len(tickers) == len(articles)

In [0]:
# Drop the news items for which no tickers were found

def remove_empty_tickers(articles, timestamps, tickers, targets=None):
    """Keep only the entries whose ticker collection is non-empty.

    The arguments are parallel sequences: position ``i`` in each of them
    describes the same news item.

    Args:
        articles: sequence of news articles.
        timestamps: sequence with one timestamp per article.
        tickers: sequence with one ticker collection per article; an item is
            kept only when ``len(tickers[i]) > 0``.
        targets: optional parallel sequence (list, NumPy array, ...) that is
            filtered the same way. ``None`` or an empty sequence means that
            no targets were supplied.

    Returns:
        A tuple ``(articles, timestamps, tickers, targets)`` of filtered
        lists. The targets list is empty when no targets were supplied.
    """
    # Decide once, by length rather than by truthiness: `if targets:` raises
    # ValueError for NumPy arrays with more than one element.
    has_targets = targets is not None and len(targets) > 0

    # Positions of the items that have at least one ticker
    kept = [i for i in range(len(articles)) if len(tickers[i]) > 0]

    filtered_articles = [articles[i] for i in kept]
    filtered_timestamps = [timestamps[i] for i in kept]
    filtered_tickers = [tickers[i] for i in kept]
    filtered_targets = [targets[i] for i in kept] if has_targets else []

    assert len(filtered_articles) == len(filtered_timestamps) == len(filtered_tickers)
    assert not has_targets or len(filtered_targets) == len(filtered_articles)
    return filtered_articles, filtered_timestamps, filtered_tickers, filtered_targets

In [109]:
# Keep only the articles with at least one extracted ticker. No targets are
# passed here, so the fourth return value is discarded.
articles, timestamps, tickers, _ = remove_empty_tickers(articles, timestamps, tickers)
print(len(articles))

167


## Собираем датасет с динамикой стоимости акций

In [110]:
# Sanity check: look up one intraday price for a hand-picked symbol and moment.
time = datetime.strptime("15:22", '%H:%M')
day = datetime.strptime("2018-11-28", '%Y-%m-%d')

# Both the timestamp and the IEX data are in the ET timezone

market_open = datetime.strptime("09:30", '%H:%M')
market_close = datetime.strptime("16:00", '%H:%M')

if time < market_open:
    # Before the open: use the opening minute of the SAME day
    time = market_open
elif time >= market_close:
    # At or after the close: use the opening minute of the next day
    day += timedelta(days=1)
    time = market_open

chart = fn.stocks.get_historical_intraday("AAPL", day)

# Chart points carry their minute as an 'HH:MM' string
minute_label = datetime.strftime(time, '%H:%M')

price = None  # stays None when the chart has no point for that minute
for point in chart:
    if point['minute'] == minute_label:
        price = point['average']
        break

price

179.719

In [112]:
neutral_threshold = 0.001
delay_mins = 1  # Delay, so that the trade is not made at the very moment the news comes out

# Trading-session bounds used to snap news timestamps onto the session.
# Both the timestamps and the IEX data are in the ET timezone.
market_open = datetime.strptime("09:30", '%H:%M')
market_close = datetime.strptime("16:00", '%H:%M')

# Charts already downloaded, keyed by (symbol, day), so that the same chart is
# not requested again for every article that mentions the same ticker.
chart_cache = {}

filtered_tickers = []
prices = []
# tqdm wraps the list itself (not enumerate), so it knows the total and can
# show a real progress bar instead of a bare iteration counter.
for i, timestamp in enumerate(tqdm(timestamps)):

    # Timestamps look like "<YYYY-MM-DD> <HH:MM>"
    day_string = timestamp.split(' ')[0]
    day = datetime.strptime(day_string, '%Y-%m-%d')

    time = datetime.strptime(timestamp.split(' ')[1], '%H:%M')
    if time < market_open:
        # Published before the open: use the opening minute of the same day
        time = market_open
    elif time >= market_close:
        # Published at or after the close: use the next day's opening minute
        day += timedelta(days=1)
        time = market_open

    time += timedelta(minutes=delay_mins)
    time = datetime.strftime(time, '%H:%M')

    current_filtered_tickers = []
    article_prices = []
    for ticker in tickers[i]:
        short_ticker = ticker.split(':')[-1]  # E.g. NASDAQ:AAPL -> AAPL

        try:
            cache_key = (short_ticker, day)
            if cache_key not in chart_cache:
                chart_cache[cache_key] = fn.stocks.get_historical_intraday(
                    symbol=short_ticker, date=day)
            chart = chart_cache[cache_key]

            # price stays None when the chart has no point for that minute;
            # the ticker is still kept in that case.
            price = None
            for point in chart:
                if point['minute'] == time:
                    price = point['average']
                    break

            article_prices.append(price)

            current_filtered_tickers.append(ticker)

        except fn.utils.exceptions.IEXQueryError:
            pass  # The ticker was not found

        except KeyError:
            pass  # The query may come back without the expected keys

    # Both lists stay parallel to `timestamps`: one entry per article
    filtered_tickers.append(current_filtered_tickers)
    prices.append(article_prices)




0it [00:00, ?it/s][A[A[A


1it [00:02,  2.73s/it][A[A[A


2it [00:23,  8.00s/it][A[A[A


3it [00:25,  6.33s/it][A[A[A


4it [00:42,  9.40s/it][A[A[A


5it [00:50,  9.22s/it][A[A[A


6it [00:51,  6.74s/it][A[A[A


7it [00:52,  4.80s/it][A[A[A


8it [00:52,  3.43s/it][A[A[A


9it [00:53,  2.67s/it][A[A[A


10it [00:55,  2.41s/it][A[A[A


11it [01:07,  5.53s/it][A[A[A


12it [01:28,  9.97s/it][A[A[A


13it [01:30,  7.83s/it][A[A[A


14it [01:42,  9.06s/it][A[A[A


15it [01:48,  8.00s/it][A[A[A


16it [02:03, 10.24s/it][A[A[A


17it [02:08,  8.64s/it][A[A[A


18it [02:21,  9.80s/it][A[A[A


19it [02:26,  8.42s/it][A[A[A


20it [02:28,  6.57s/it][A[A[A


21it [02:29,  4.76s/it][A[A[A


22it [02:29,  3.39s/it][A[A[A


23it [02:29,  2.52s/it][A[A[A


24it [03:00, 11.03s/it][A[A[A


25it [03:18, 13.06s/it][A[A[A


26it [03:18,  9.22s/it][A[A[A


27it [03:23,  7.89s/it][A[A[A


28it [03:23,  5.60s/it][A[A[

In [113]:
# Keep only the articles for which at least one ticker survived the price
# lookup; the per-article price lists are filtered alongside as the targets.
# NOTE(review): the slices trim articles/timestamps to len(filtered_tickers) —
# presumably so that an interrupted price loop still lines up; confirm.
articles, timestamps, tickers, prices = remove_empty_tickers(
    articles[:len(filtered_tickers)], timestamps[:len(filtered_tickers)], filtered_tickers, prices)
print(len(articles))

120


In [0]:
import pickle

# Bundle the four parallel lists into one dict and serialize it to data.pkl
data = {
    'articles': articles,
    'timestamps': timestamps,
    'tickers': tickers,
    'prices': prices
}

# The context manager flushes and closes the file as soon as the dump is done,
# instead of leaving the handle to be closed by garbage collection.
with open('data.pkl', 'wb') as f:
    pickle.dump(data, f)

In [0]:
# Use the Google Colab file helper to download the pickle written to data.pkl.
from google.colab import files

files.download('data.pkl')