In [1]:
#Импорт библиотек
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [7]:
# Load the raw candle tables; the candle start timestamp `begin` is parsed to datetime.
train_df = pd.read_csv('../data/raw/participants/candles.csv').assign(
    begin=lambda frame: pd.to_datetime(frame['begin'])
)

test_df = pd.read_csv('../data/raw/participants/candles_2.csv').assign(
    begin=lambda frame: pd.to_datetime(frame['begin'])
)

# Load the news table; `date` is reduced to a plain calendar date and
# unparseable values are coerced to NaT instead of raising.
df_long = pd.read_csv("../data/raw/participants/news_sort_train.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"], errors="coerce").dt.date
)

In [None]:
# Sort candles by ticker and timestamp (and de-duplicate the training candles).
# sort_values / drop_duplicates return NEW frames, so the result has to be
# assigned back; calling them without assignment leaves the data untouched.
train_df = (
    train_df.drop_duplicates()
            .sort_values(['ticker', 'begin'])
            .reset_index(drop=True)
)
test_df = test_df.sort_values(['ticker', 'begin']).reset_index(drop=True)

# Normalise news tickers to upper case, order the news by ticker and date,
# then aggregate to one row per (date, ticker):
#   sent_mean - mean sentiment of that day's news
#   sent_last - sentiment of the last news row of the day (in sorted order)
#   news_n    - number of news rows that day
df_long["ticker"] = df_long["ticker"].astype(str).str.upper()
df_long = df_long.sort_values(["ticker", "date"])
news_daily = (
    df_long.groupby(["date", "ticker"], as_index=False)
           .agg(
               sent_mean=("sentiment", "mean"),
               sent_last=("sentiment", "last"),
               news_n   =("sentiment", "size"),
           )
)

In [None]:
# Engineered columns, in the order they are appended to the frame.
_FEATURE_COLUMNS = (
    'momentum', 'volatility', 'ma', 'distance_from_ma', 'profit_in_day',
    'money_range', 'high/close', 'low/close', 'volume/mean_volume', 'money_flow',
    'RSI', 'MACD', 'MACD_hist', 'ATR', 'BB_upper', 'BB_lower', 'BB_pos',
    'close_lag1', 'close_lag2', 'return_1d', 'return_lag1', 'return_lag2',
)


def _ticker_features(candles, window_size, rsi_window=14, atr_window=14, bb_window=20):
    """Compute the engineered features for the rows of a single ticker.

    `candles` must hold the `open`, `high`, `low`, `close` and `volume`
    columns of one ticker, in the row order the rolling windows should follow.
    Returns a float DataFrame with the `_FEATURE_COLUMNS` columns and the same
    index as `candles`; warm-up rows of each window are NaN.
    """
    open_, high, low = candles['open'], candles['high'], candles['low']
    close, volume = candles['close'], candles['volume']
    prev_close = close.shift(1)

    ma = close.rolling(window_size).mean()
    return_1d = close.pct_change(1)

    # RSI: average gain vs. average loss over the window. A zero average loss
    # makes `rs` infinite, which pandas evaluates without raising and which
    # yields RSI == 100; 0/0 (flat window) stays NaN.
    delta = close.diff()
    avg_gain = delta.clip(lower=0).rolling(rsi_window).mean()
    avg_loss = (-delta.clip(upper=0)).rolling(rsi_window).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    # MACD: fast EMA(12) minus slow EMA(26); histogram = MACD minus its EMA(9).
    macd = close.ewm(span=12, adjust=False).mean() - close.ewm(span=26, adjust=False).mean()
    macd_hist = macd - macd.ewm(span=9, adjust=False).mean()

    # ATR: rolling mean of the true range (largest of the three spreads).
    true_range = pd.DataFrame({
        'H-L': high - low,
        'H-PC': (high - prev_close).abs(),
        'L-PC': (low - prev_close).abs(),
    }).max(axis=1)

    # Bollinger bands: rolling mean +/- 2 rolling standard deviations.
    bb_mid = close.rolling(bb_window).mean()
    bb_std = close.rolling(bb_window).std()
    bb_upper = bb_mid + 2 * bb_std
    bb_lower = bb_mid - 2 * bb_std

    features = pd.DataFrame({
        # Percentage price change over `window_size` rows.
        'momentum': close.pct_change(window_size),
        # Std of one-row returns over `window_size` rows.
        'volatility': return_1d.rolling(window_size).std(),
        'ma': ma,
        'distance_from_ma': (close - ma) / ma,
        # Intraday return: relative move from open to close.
        'profit_in_day': (close - open_) / open_,
        'money_range': (high - low) / low,
        'high/close': (high - close) / close,
        'low/close': (close - low) / close,
        'volume/mean_volume': volume / volume.rolling(window_size).mean(),
        'money_flow': (2 * close - low - high) / (high - low) * volume,
        'RSI': rsi,
        'MACD': macd,
        'MACD_hist': macd_hist,
        'ATR': true_range.rolling(window=atr_window).mean(),
        'BB_upper': bb_upper,
        'BB_lower': bb_lower,
        'BB_pos': (close - bb_lower) / (bb_upper - bb_lower),
        'close_lag1': prev_close,
        'close_lag2': close.shift(2),
        'return_1d': return_1d,
        'return_lag1': return_1d.shift(1),
        'return_lag2': return_1d.shift(2),
    })
    return features[list(_FEATURE_COLUMNS)]


def add_col(df, window_size, rsi_window=14, atr_window=14, bb_window=20):
    """Add per-ticker technical-indicator features to a candle frame.

    Features are computed separately for every value of `df['ticker']`, so no
    rolling window, lag or return crosses a ticker boundary. Rows are used in
    their existing order, so each ticker's rows should already be in
    chronological order.

    Parameters
    ----------
    df : DataFrame with `ticker`, `open`, `high`, `low`, `close`, `volume`.
        It is modified in place (pass a copy to keep the original).
    window_size : int
        Window (in rows) for momentum, volatility, moving average and the
        volume ratio.
    rsi_window, atr_window, bb_window : int
        Windows for RSI, ATR and the Bollinger bands.

    Returns
    -------
    The same `df` object with the `_FEATURE_COLUMNS` columns added (NaN
    warm-up values replaced by 0) plus `ticker_code`, the integer category
    code of `ticker`.
    """
    feature_cols = list(_FEATURE_COLUMNS)

    # Create the columns up front so that empty frames and rows with a missing
    # ticker still end up with every feature column.
    for col in feature_cols:
        df[col] = np.nan

    for ticker in df['ticker'].dropna().unique():
        mask = df['ticker'] == ticker
        ticker_features = _ticker_features(
            df.loc[mask], window_size,
            rsi_window=rsi_window, atr_window=atr_window, bb_window=bb_window,
        )
        # Positional write-back: row i of the ticker slice -> i-th masked row.
        df.loc[mask, feature_cols] = ticker_features.to_numpy()

    # Assign the filled block back instead of `df[col].fillna(0, inplace=True)`:
    # the chained in-place form is deprecated and does nothing under
    # pandas Copy-on-Write.
    df[feature_cols] = df[feature_cols].fillna(0)
    df['ticker_code'] = df['ticker'].astype('category').cat.codes

    return df

In [None]:
# Enrich both candle sets with the engineered columns produced by add_col,
# using a 5-row window. Copies are passed so the source frames stay untouched.
train_df_dob = add_col(train_df.copy(), window_size=5)
test_df_dob = add_col(test_df.copy(), window_size=5)

In [None]:
# Join daily news aggregates onto a candle frame
def join_df(df_candles, news_daily):
    """Left-join daily news aggregates onto candle rows by (date, ticker).

    Parameters
    ----------
    df_candles : DataFrame with at least `begin` and `ticker`; not modified.
    news_daily : DataFrame keyed by `date` and `ticker` with the columns
        `sent_mean`, `sent_last` and `news_n`.

    Returns
    -------
    A new frame sorted by (`ticker`, `begin`) with a fresh index. It contains
    the candle columns, a derived `date` column and the three news columns.
    Candles without news get `sent_mean` / `sent_last` = 0.5 and `news_n` = 0.
    The share of candle rows without news is printed.
    """
    # Work on a copy of the frame that was passed in. It must not be replaced
    # by a global frame, otherwise every call joins the same (training) data.
    df_candles = df_candles.copy()
    df_candles["begin"] = pd.to_datetime(df_candles["begin"], errors="coerce")
    df_candles["date"]  = df_candles["begin"].dt.date
    df_candles["ticker"] = df_candles["ticker"].astype(str).str.upper()

    # Merge on date and ticker; `indicator` adds `_merge` so unmatched candles can be counted.
    out = df_candles.merge(news_daily, on=["date","ticker"], how="left", indicator=True)

    # Report the share of candle rows that found no news.
    miss_share = (out["_merge"] == "left_only").mean()
    print(f"Доля строк свечей без новостей после джойна: {miss_share:.2%}")

    # Drop the helper column and fill the gaps left by unmatched candles.
    out = (
        out.drop(columns=["_merge"])
           .assign(
               sent_mean=lambda d: d["sent_mean"].fillna(0.5),
               sent_last=lambda d: d["sent_last"].fillna(0.5),
               news_n   =lambda d: d["news_n"].fillna(0).astype(int),
           )
           .sort_values(["ticker", "begin"])
           .reset_index(drop=True)
    )
    return out

In [None]:
# Join the daily news onto the enriched candles, then drop the raw timestamp
# and the join keys from the JOINED frames (they are not numeric features).
# The columns are dropped from `*_df_end`; `train_df_dob` has no `date`
# column, so dropping from it would raise KeyError.
# NOTE(review): `news_daily` is the only news aggregate this notebook builds,
# so it is used for both joins. If a separate test-period news table exists,
# build its aggregate and pass it to the second call — confirm.
train_df_end = join_df(train_df_dob, news_daily).drop(columns=['begin', 'date', 'ticker'])
test_df_end = join_df(test_df_dob, news_daily).drop(columns=['begin', 'date', 'ticker'])

In [None]:
# Build a separate frame holding the prediction targets
def to_target_df(df):
    """Derive forward-looking targets and return them as their own frame.

    Targets:
      target_return_1d     - `return_1d` of the next row
      target_return_20d    - relative change of `close` 20 rows ahead
      target_direction_1d  - 1 if target_return_1d > 0 else 0
      target_direction_20d - 1 if target_return_20d > 0 else 0

    When `df` has a `ticker_code` column the look-ahead is done inside each
    ticker, so the last rows of one ticker never receive values from the next
    ticker; those rows get NaN returns. Without that column the whole frame is
    treated as a single series.

    Rows whose target return is NaN get direction 0, because the direction
    columns are plain integers. Filter on the return columns' `notna()` before
    training on directions.

    The four target columns are also written onto `df` itself.
    """
    if 'ticker_code' in df.columns:
        by_ticker = df.groupby('ticker_code', sort=False)
        next_return = by_ticker['return_1d'].shift(-1)
        future_close = by_ticker['close'].shift(-20)
    else:
        next_return = df['return_1d'].shift(-1)
        future_close = df['close'].shift(-20)

    df['target_return_1d'] = next_return
    df['target_return_20d'] = (future_close - df['close']) / df['close']
    df['target_direction_1d'] = (df['target_return_1d'] > 0).astype('int')
    df['target_direction_20d'] = (df['target_return_20d'] > 0).astype('int')
    y_df = df[['target_return_1d', 'target_return_20d', 'target_direction_1d', 'target_direction_20d']]
    return y_df
    
# Features are copies of the joined frames. Targets are derived from separate
# copies, so the target_* columns written by to_target_df stay out of the features.
x_train_data = train_df_end.copy()
y_train_data = to_target_df(train_df_end.copy())

x_test_df_test = test_df_end.copy()
y_test_df_test = to_target_df(test_df_end.copy())

In [None]:
#Масштабирование данных
from sklearn.preprocessing import StandardScaler

def drop_targets(df):
    """Return `df` restricted to the columns whose names do not start with 'target_'."""
    non_target = list(filter(lambda name: not name.startswith('target_'), df.columns))
    return df[non_target]

# Standardise every column except `ticker_code`, which is a categorical id
# and is re-attached unscaled afterwards.
# NOTE(review): the scaler is fitted on the whole training frame before it is
# split into train/valid/test, so validation and test rows influence the
# scaling statistics. Consider fitting on the training split only.
feature_cols = x_train_data.columns.drop('ticker_code')

scaler = StandardScaler()
scaler.fit(x_train_data[feature_cols])

# Keep the source index so the scaled rows stay aligned with their targets.
x_train_data_scaled = pd.DataFrame(
    scaler.transform(x_train_data[feature_cols]),
    columns=feature_cols,
    index=x_train_data.index,
)
x_train_data_scaled['ticker_code'] = x_train_data['ticker_code'].values

# Select the test columns by the fitted column list so the test matrix has
# exactly the columns, in the same order, that the scaler was fitted on.
x_test_df_test_scaled = pd.DataFrame(
    scaler.transform(x_test_df_test[feature_cols]),
    columns=feature_cols,
    index=x_test_df_test.index,
)
x_test_df_test_scaled['ticker_code'] = x_test_df_test['ticker_code'].values

In [None]:
# Split data into train, validation and test sets
def tt_split(X, y, test_size=0.2, valid_size=0.25, random_state=42, shuffle=True):
    """Split `X` / `y` into train, validation and test parts.

    The test part is carved out first (`test_size` of all rows); the
    validation part is then taken from the remainder (`valid_size` of what is
    left). The defaults 0.2 / 0.25 therefore give a 60 / 20 / 20 split.

    Parameters
    ----------
    X, y : array-likes of equal length, split row-wise in parallel.
    test_size : share of all rows used for the test part.
    valid_size : share of the non-test rows used for the validation part.
    random_state : seed forwarded to both `train_test_split` calls.
    shuffle : forwarded to `train_test_split`; the default shuffles rows.
        NOTE(review): with shuffling, train rows can post-date validation and
        test rows. For time-ordered data consider `shuffle=False` on input
        sorted by time.

    Returns
    -------
    X_train, X_valid, X_test, y_train, y_valid, y_test
    """
    X_train_valid, X_test, y_train_valid, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle
    )
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_valid, y_train_valid,
        test_size=valid_size, random_state=random_state, shuffle=shuffle
    )

    return X_train, X_valid, X_test, y_train, y_valid, y_test
    
# Split the unscaled features together with their targets.
(X_train_df, X_valid_df, X_test_df,
 y_train_df, y_valid_df, y_test_df) = tt_split(x_train_data, y_train_data)

# Split the scaled features against the same targets. The y_*_scaled names
# hold the same unscaled targets; only the X frames are scaled.
(X_train_df_scaled, X_valid_df_scaled, X_test_df_scaled,
 y_train_df_scaled, y_valid_df_scaled, y_test_df_scaled) = tt_split(x_train_data_scaled, y_train_data)