In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
train_df = pd.read_csv('/mnt/c/Users/user/finam-x-hse-trade-ai-hack-forecast/data/raw/participants/candles.csv')
train_df['begin'] = pd.to_datetime(train_df['begin'])

test_df = pd.read_csv('/mnt/c/Users/user/finam-x-hse-trade-ai-hack-forecast/data/raw/participants/candles_2.csv')
test_df['begin'] = pd.to_datetime(test_df['begin'])

In [3]:
train_df.sort_values(['ticker', 'begin']).reset_index(drop=True, inplace=True)
train_df.drop_duplicates().reset_index(drop=True)

test_df.sort_values(['ticker', 'begin']).reset_index(drop=True, inplace=True)

In [4]:
def add_col(df, window_size):
    for ticker in df['ticker'].unique():
        mask = df['ticker'] == ticker
        ticker_data = df[mask].copy()
    
        # 1. Моментум = процентное изменение цены за window_size дней
        ticker_data['momentum'] = (
            ticker_data['close'].pct_change(window_size)
        )
    
        # 2. Волатильность = std доходностей за window_size дней
        ticker_data['volatility'] = (
            ticker_data['close'].pct_change().rolling(window_size).std()
        )
    
        # 3. Средняя цена за последние window_size дней
        ticker_data['ma'] = ticker_data['close'].rolling(window_size).mean()
    
        # 4. Расстояние от MA (нормализованное)
        ticker_data['distance_from_ma'] = (
            (ticker_data['close'] - ticker_data['ma']) / ticker_data['ma']
        )
    
        #5. Доходность за день
        ticker_data['profit_in_day'] = (ticker_data['close'] - ticker_data['open']) - ticker_data['open']
        
        #6. Диапазон волатильности за день
        ticker_data['money_range'] = (ticker_data['high'] - ticker_data['low']) / ticker_data['low']
        
        #7. Отношение high цены к close
        ticker_data['high/close'] = (ticker_data['high'] - ticker_data['close']) / ticker_data['close']
        
        #8. Отношение low цены к close
        ticker_data['low/close'] = (ticker_data['close'] - ticker_data['low']) / ticker_data['close']
        
        #9. Отношение volume к среднему volume за последние window_size дней
        ticker_data['volume/mean_volume'] = ticker_data['volume'] / ticker_data['volume'].rolling(window_size).mean()
        
        #10. Денежный поток
        ticker_data['money_flow'] = (2 * ticker_data['close'] - ticker_data['low'] - ticker_data['high']) / (
            ticker_data['high'] - ticker_data['low']) * ticker_data['volume']
        
        #11. RSI - измеряет силу восходящих и нисходящих движений за окно n
        delta = ticker_data['close'].diff()
        gain = delta.clip(lower=0)
        loss = -delta.clip(upper=0)
        avg_gain = gain.rolling(14).mean()
        avg_loss = loss.rolling(14).mean()
        rs = avg_gain / avg_loss
        try:
            ticker_data['RSI'] = 100 - (100 / (1 + rs))
        except:
            ticker_data['RSI'] = 100
        
        #12. MACD - индикатор, который показывает разницу между короткой и длинной скользящей экспоненциальной средней (EMA).
        ticker_data['MACD'] = ticker_data['close'].ewm(span=12, adjust=False).mean() - ticker_data['close'].ewm(span=26, adjust=False).mean()
        ticker_data['MACD_hist'] = ticker_data['MACD'] - ticker_data['MACD'].ewm(span=9, adjust=False).mean()
        
        #13. ATR измеряет волатильность инструмента — насколько сильно меняется цена за день.
        ticker_data['H-L'] = ticker_data['high'] - ticker_data['low']
        ticker_data['H-PC'] = (ticker_data['high'] - ticker_data['close'].shift(1)).abs()
        ticker_data['L-PC'] = (ticker_data['low'] - ticker_data['close'].shift(1)).abs()
        ticker_data['TR'] = ticker_data[['H-L', 'H-PC', 'L-PC']].max(axis=1)
        ticker_data['ATR'] = ticker_data['TR'].rolling(window=14).mean()
        ticker_data.drop(['H-L', 'H-PC', 'L-PC', 'TR'], axis=1, inplace=True)
        
        #14. Bollinger Bands показывают, насколько цена далеко от своего среднего значения.
        mean_20 = ticker_data['close'].rolling(20).mean()
        std_20 = ticker_data['close'].rolling(20).std()
        ticker_data['BB_upper'] = mean_20 + 2 * std_20
        ticker_data['BB_lower'] = mean_20 - 2 * std_20
        ticker_data['BB_pos'] = (
            (ticker_data['close'] - ticker_data['BB_lower']) /
            (ticker_data['BB_upper'] - ticker_data['BB_lower'])
        )
        
        #15. Лаги по цене закрытия
        ticker_data['close_lag1'] = ticker_data['close'].shift(1)
        ticker_data['close_lag2'] = ticker_data['close'].shift(2)
        
        #16. Лаги по доходности
        ticker_data['return_1d'] = ticker_data['close'].pct_change(1)
        ticker_data['return_lag1'] = ticker_data['return_1d'].shift(1)
        ticker_data['return_lag2'] = ticker_data['return_1d'].shift(2)
    
        df.loc[mask, 'momentum'] = ticker_data['momentum'].values
        df.loc[mask, 'volatility'] = ticker_data['volatility'].values
        df.loc[mask, 'ma'] = ticker_data['ma'].values
        df.loc[mask, 'distance_from_ma'] = ticker_data['distance_from_ma'].values
        df.loc[mask, 'profit_in_day'] = ticker_data['profit_in_day'].values
        df.loc[mask, 'money_range'] = ticker_data['money_range'].values
        df.loc[mask, 'high/close'] = ticker_data['high/close'].values
        df.loc[mask, 'low/close'] = ticker_data['low/close'].values
        df.loc[mask, 'volume/mean_volume'] = ticker_data['volume/mean_volume'].values
        df.loc[mask, 'money_flow'] = ticker_data['money_flow'].values
        df.loc[mask, 'RSI'] = ticker_data['RSI'].values
        df.loc[mask, 'MACD'] = ticker_data['MACD'].values
        df.loc[mask, 'MACD_hist'] = ticker_data['MACD_hist'].values
        df.loc[mask, 'ATR'] = ticker_data['ATR'].values
        df.loc[mask, 'BB_upper'] = ticker_data['BB_upper'].values
        df.loc[mask, 'BB_lower'] = ticker_data['BB_lower'].values
        df.loc[mask, 'BB_pos'] = ticker_data['BB_pos'].values
        df.loc[mask, 'close_lag1'] = ticker_data['close_lag1'].values
        df.loc[mask, 'close_lag2'] = ticker_data['close_lag2'].values
        df.loc[mask, 'return_1d'] = ticker_data['return_1d'].values
        df.loc[mask, 'return_lag1'] = ticker_data['return_lag1'].values
        df.loc[mask, 'return_lag2'] = ticker_data['return_lag2'].values
        
    df['momentum'].fillna(0, inplace=True)
    df['volatility'].fillna(0, inplace=True)
    df['ma'].fillna(0, inplace=True)
    df['distance_from_ma'].fillna(0, inplace=True)
    df['profit_in_day'].fillna(0, inplace=True)
    df['money_range'].fillna(0, inplace=True)
    df['high/close'].fillna(0, inplace=True)
    df['low/close'].fillna(0, inplace=True)
    df['volume/mean_volume'].fillna(0, inplace=True)
    df['money_flow'].fillna(0, inplace=True)
    df['RSI'].fillna(0, inplace=True)
    df['MACD'].fillna(0, inplace=True)
    df['MACD_hist'].fillna(0, inplace=True)
    df['ATR'].fillna(0, inplace=True)
    df['BB_upper'].fillna(0, inplace=True)
    df['BB_lower'].fillna(0, inplace=True)
    df['BB_pos'].fillna(0, inplace=True)
    df['close_lag1'].fillna(0, inplace=True)
    df['close_lag2'].fillna(0, inplace=True)
    df['return_1d'].fillna(0, inplace=True)
    df['return_lag1'].fillna(0, inplace=True)
    df['return_lag2'].fillna(0, inplace=True)
    df['ticker_code'] = df['ticker'].astype('category').cat.codes

    return df

In [5]:
train_df_dob = add_col(train_df.copy(), 5)
test_df_dob = add_col(test_df.copy(), 5)
train_df_end = train_df_dob.drop(['begin', 'ticker'], axis=1)
test_df_end = test_df_dob.drop(['begin', 'ticker'], axis=1)

In [20]:
train_df_end['target_return_1d'] = train_df_end['return_1d'].shift(-1)
train_df_end['target_return_20d'] = (train_df_end['close'].shift(-20) - train_df_end['close']) / train_df_end['close']
train_df_end['target_direction_1d'] = (train_df_end['target_return_1d'] > 0).astype('int')
train_df_end['target_direction_20d'] = (train_df_end['target_return_20d'] > 0).astype('int')
train_df_end.head(25)

Unnamed: 0,open,close,high,low,volume,momentum,volatility,ma,distance_from_ma,profit_in_day,...,close_lag1,close_lag2,return_1d,return_lag1,return_lag2,ticker_code,target_return_1d,target_return_20d,target_direction_1d,target_direction_20d
0,81.5,81.7,83.2,81.16,29755530,0.0,0.0,0.0,0.0,-81.3,...,0.0,0.0,0.0,0.0,0.0,0,0.004896,0.071726,1,1
1,81.72,82.1,83.98,80.26,18502950,0.0,0.0,0.0,0.0,-81.34,...,81.7,0.0,0.004896,0.0,0.0,0,-0.010962,0.048477,0,1
2,82.04,81.2,82.48,80.4,16848930,0.0,0.0,0.0,0.0,-82.88,...,82.1,81.7,-0.010962,0.004896,0.0,0,-0.007635,0.048522,0,1
3,79.78,80.58,80.8,78.22,21559860,0.0,0.0,0.0,0.0,-78.98,...,81.2,82.1,-0.007635,-0.010962,0.004896,0,-0.014892,0.09655,0,1
4,80.5,79.38,81.44,78.76,14677280,0.0,0.0,80.992,-0.019903,-81.62,...,80.58,81.2,-0.014892,-0.007635,-0.010962,0,0.044344,0.104056,1,1
5,79.0,82.9,83.0,77.44,37790740,0.014688,0.024192,81.232,0.020534,-75.1,...,79.38,80.58,0.044344,-0.014892,-0.007635,0,-0.006755,0.010133,0,1
6,82.82,82.34,83.62,80.98,23644570,0.002923,0.02454,81.28,0.013041,-83.3,...,82.9,79.38,-0.006755,0.044344,-0.014892,0,-0.008744,0.014088,0,1
7,82.36,81.62,84.22,81.4,18886540,0.005172,0.024293,81.364,0.003146,-83.1,...,82.34,82.9,-0.008744,-0.006755,0.044344,0,-0.007841,0.011272,0,1
8,80.9,80.98,81.38,79.56,19891100,0.004964,0.024312,81.444,-0.005697,-80.82,...,81.62,82.34,-0.007841,-0.008744,-0.006755,0,0.003211,0.027414,1,1
9,81.36,81.24,82.64,80.5,18364020,0.023432,0.0226,81.816,-0.00704,-81.48,...,80.98,81.62,0.003211,-0.007841,-0.008744,0,-0.011817,0.03225,0,1


In [6]:
scaler = StandardScaler()
scaler.fit(

SyntaxError: unexpected EOF while parsing (2395914310.py, line 2)