In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys, warnings
from feature_engineer import *

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [4]:
# Directory holding the competition files. The default is the path this
# notebook was originally run with; set COMPETITION_DATA_DIR to run it on
# another machine without editing the cell.
DATA_DIR = os.environ.get("COMPETITION_DATA_DIR", "/home/lishi/projects/Competition/data")

df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
# Rows without a target can be used neither for fitting nor for scoring.
df = df.dropna(subset=["target"])

print(df.shape)
print(f"Trading days: {df['date_id'].nunique()}")
print(f"Stocks: {df['stock_id'].nunique()}")

df.head()

(5237892, 17)
Trading days: 481
Stocks: 200


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [5]:
# Last five rows: confirms the final date_id / seconds_in_bucket that was loaded.
df.tail(5)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,1.000434,319862.4,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.4,1.0009,93393.07,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.0,0,0.995789,12725436.1,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1000898.84,1,0.99921,94773271.05,0.99921,0.99921,0.99897,125631.72,0.99921,669893.0,0.999008,-1.540184,26454,480_540_198
5237979,199,480,540,1884285.71,-1,1.002129,24073677.32,1.000859,1.001494,1.002129,250081.44,1.002447,300167.56,1.002274,-6.530285,26454,480_540_199


In [None]:
# Chronological hold-out: every date_id up to and including split_day is used
# for training, all later days form the validation split.
split_day = 435
df_train = df.loc[df["date_id"].le(split_day)]
df_valid = df.loc[df["date_id"].gt(split_day)]
print(f"train : {df_train.shape}, valid : {df_valid.shape}")

In [None]:
# Per-stock summary statistics, computed on the training split only so that no
# validation information flows into the features.
by_stock = df_train.groupby("stock_id")
bid_size, ask_size = by_stock["bid_size"], by_stock["ask_size"]
bid_price, ask_price = by_stock["bid_price"], by_stock["ask_price"]

global_stock_id_feats = {
    "median_size": bid_size.median() + ask_size.median(),
    "std_size": bid_size.std() + ask_size.std(),
    "ptp_size": bid_size.max() - bid_size.min(),
    "median_price": bid_price.median() + ask_price.median(),
    "std_price": bid_price.std() + ask_price.std(),
    # NOTE(review): unlike ptp_size this mixes bid (max) and ask (min) — confirm it is intended.
    "ptp_price": bid_price.max() - ask_price.min(),
}


# The same train-derived statistics are passed when building both splits.
df_train_feats = generate_all_features(df_train, global_stock_id_feats)
print("Build Train Feats Finished.")
df_valid_feats = generate_all_features(df_valid, global_stock_id_feats)
print("Build Valid Feats Finished.")

df_valid_feats = reduce_mem_usage(df_valid_feats)
df_train_feats = reduce_mem_usage(df_train_feats)

In [None]:
# Persist both feature frames (without the index column) so later sessions can
# reload them instead of rebuilding the features.
df_train_feats.to_csv(path_or_buf="train_feats.csv", index=False)
df_valid_feats.to_csv(path_or_buf="valid_feats.csv", index=False)

In [None]:
# First five rows of the engineered training features.
df_train_feats.head(5)

In [None]:
# Fully connected regression network with a single hidden layer:
#   - hidden layer: linear + ReLU with `hidden_dim` units (60 in this notebook)
#   - output layer: linear with 1 unit (the predicted target)



class StockDataset(Dataset):
    """Map-style dataset pairing one feature row with its regression target.

    Args:
        df: Frame the features were derived from. It is stored on the instance
            for reference only and is not used to size or index the dataset.
        feats: Positionally indexable feature rows (``feats[i]`` is sample i),
            e.g. a 2-D numpy array.
        target: Positionally indexable targets aligned with ``feats``,
            e.g. a 1-D numpy array.

    Raises:
        ValueError: If ``feats`` and ``target`` differ in length.
    """

    def __init__(self, df, feats, target):
        if len(feats) != len(target):
            raise ValueError(
                f"feats and target must have the same length, got {len(feats)} and {len(target)}"
            )
        self.feats = feats
        self.target = target
        self.df = df

    def __len__(self):
        # Size by the array __getitem__ actually indexes; sizing by `df` breaks
        # as soon as the frame and the arrays differ in length.
        return len(self.feats)

    def __getitem__(self, idx):
        """Return ``(feats, target)`` for sample ``idx`` as float32 tensors."""
        feats = torch.tensor(self.feats[idx], dtype=torch.float)
        target = torch.tensor(self.target[idx], dtype=torch.float)
        return feats, target
    
class StockModel(nn.Module):
    """Feed-forward regressor: Linear(input_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, 1)."""

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Map a batch of feature rows to one prediction per row."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)
    
def train(model, train_loader, optimizer, criterion):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module to train; it is switched to training mode.
        train_loader: Sized iterable yielding ``(feats, target)`` tensor batches.
        optimizer: Optimizer that steps ``model``'s parameters.
        criterion: Loss callable taking ``(prediction, target)``.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    # Send batches to wherever the model lives instead of depending on a
    # notebook-level `device` global that is only defined in a later cell.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    train_loss = 0
    for feats, target in train_loader:
        if device is not None:
            feats = feats.to(device)
            target = target.to(device)
        optimizer.zero_grad()
        output = model(feats)
        # StockModel emits (batch, 1) while the collated targets are (batch,).
        # Without aligning the shapes the loss broadcasts to (batch, batch)
        # and compares every prediction with every target.
        loss = criterion(output.reshape(target.shape), target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    return train_loss / len(train_loader)

def valid(model, valid_loader, criterion):
    """Evaluate ``model`` on ``valid_loader`` without tracking gradients.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        valid_loader: Sized iterable yielding ``(feats, target)`` tensor batches.
        criterion: Loss callable taking ``(prediction, target)``.

    Returns:
        A tuple ``(mean_batch_loss, preds, targets)`` where ``preds`` and
        ``targets`` are numpy arrays concatenated over all batches. ``preds``
        keeps the raw model output shape.
    """
    model.eval()
    # Send batches to wherever the model lives instead of depending on a
    # notebook-level `device` global that is only defined in a later cell.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    valid_loss = 0
    preds = []
    targets = []
    with torch.no_grad():
        for feats, target in valid_loader:
            if device is not None:
                feats = feats.to(device)
                target = target.to(device)
            output = model(feats)
            # Align (batch, 1) predictions with (batch,) targets; otherwise the
            # loss broadcasts to (batch, batch) and reports a wrong value.
            loss = criterion(output.reshape(target.shape), target)
            valid_loss += loss.item()
            preds.append(output.cpu().numpy())
            targets.append(target.cpu().numpy())
    preds = np.concatenate(preds)
    targets = np.concatenate(targets)
    return valid_loss / len(valid_loader), preds, targets

def train_loop(model, train_loader, valid_loader, optimizer, criterion, epochs, save_path="model.pt"):
    """Train for ``epochs`` epochs and checkpoint the best validation loss.

    After every epoch the model is evaluated on ``valid_loader``; whenever the
    validation loss improves on the best value seen so far, the model's
    ``state_dict`` is written to ``save_path``.

    Args:
        model: Module to train.
        train_loader: Batches used for optimisation.
        valid_loader: Batches used for evaluation after each epoch.
        optimizer: Optimizer that steps ``model``'s parameters.
        criterion: Loss callable taking ``(prediction, target)``.
        epochs: Number of passes over ``train_loader``.
        save_path: Destination of the best checkpoint. Defaults to
            ``"model.pt"``, the path that was previously hard-coded.

    Returns:
        The best validation loss observed (``np.inf`` if none was recorded).
    """
    best_loss = np.inf
    for epoch in range(epochs):
        train_loss = train(model, train_loader, optimizer, criterion)
        valid_loss, preds, targets = valid(model, valid_loader, criterion)
        rmse = np.sqrt(mean_squared_error(targets, preds))
        print(f"Epoch {epoch+1} - train_loss: {train_loss:.4f}  valid_loss: {valid_loss:.4f}  rmse: {rmse:.4f}")
        # A NaN loss never compares as smaller, so a diverged epoch writes no checkpoint.
        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), save_path)
            print("Save Model.")
    return best_loss

# df_train_feats = pd.read_csv("train_feats.csv")
# df_train_feats = df_train_feats[~df_train_feats['target'].isnull()]
# df_train_feats = df_train_feats.sample(frac=1, random_state=42).reset_index(drop=True)

# df_train_feats.head()

# df_valid_feats = pd.read_csv("valid_feats.csv")

# df_valid_feats = df_valid_feats[~df_valid_feats['target'].isnull()]

# df_valid_feats = df_valid_feats.sample(frac=1, random_state=42).reset_index(drop=True)

# df_valid_feats.head()




In [None]:
# Every column except the label and the two identifier columns is a model input.
feats = [
    name
    for name in df_train_feats.columns
    if name not in ("target", "date_id", "stock_id")
]

# Missing feature values are replaced with 0 in both splits.
train_feats = df_train_feats[feats].fillna(0)
valid_feats = df_valid_feats[feats].fillna(0)

train_target = df_train_feats["target"]
valid_target = df_valid_feats["target"]

In [None]:
# --- Scale features ----------------------------------------------------------
# Fit the scaler on the training split only and reuse it for validation: both
# splits then share one feature space and no validation statistics leak into
# the model's inputs. Results go to new names so re-running the cell does not
# scale already-scaled data.
scaler_train = StandardScaler()
train_feats_scaled = scaler_train.fit_transform(train_feats)
valid_feats_scaled = scaler_train.transform(valid_feats)

# --- Datasets and loaders ----------------------------------------------------
# 42 matches the random_state used for shuffling elsewhere in this notebook;
# it makes weight initialisation and batch order reproducible.
torch.manual_seed(42)

# `.values` yields positional arrays, which is what StockDataset indexes by.
train_dataset = StockDataset(df_train_feats, train_feats_scaled, train_target.values)
valid_dataset = StockDataset(df_valid_feats, valid_feats_scaled, valid_target.values)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=1024, shuffle=False)

# --- Model, optimiser, loss --------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = StockModel(len(feats), 60).to(device)
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

train_loop(model, train_loader, valid_loader, optimizer, criterion, epochs=100)

# --- Score the best checkpoint -----------------------------------------------
# train_loop leaves the last-epoch weights in `model`; reload the best ones it
# saved and evaluate on the same validation loader (same train-fitted scaling,
# same NaN handling) that was used to select the checkpoint.
model.load_state_dict(torch.load("model.pt", map_location=device))
valid_loss, preds, targets = valid(model, valid_loader, criterion)
rmse = np.sqrt(mean_squared_error(targets, preds))
print(f"valid_loss: {valid_loss:.4f}  rmse: {rmse:.4f}")

In [None]:
# The loaded frame has no 'imb_size_with_flag' column, so build it when it is
# missing: imbalance size signed by the buy/sell flag (-1 / 0 / 1).
# NOTE(review): assumes the column means size * flag — confirm against feature_engineer.
if 'imb_size_with_flag' not in df.columns:
    df['imb_size_with_flag'] = df['imbalance_size'] * df['imbalance_buy_sell_flag']

# One group per (day, stock) so differences never cross a day or stock boundary.
group = df.groupby(['date_id', 'stock_id'])

# calculate wap change per stock per day, in percent of the previous wap
df['wap_chg_1'] = group['wap'].diff() / group['wap'].shift(1) * 100
# absolute (not relative) change of the signed imbalance size
df['imb_size_change'] = group['imb_size_with_flag'].diff()

In [None]:
# Distribution of the imbalance-size change for stock 0, without NaN and +/-inf.
to_hist = df.loc[df.stock_id == 0, 'imb_size_change']
to_hist = to_hist[np.isfinite(to_hist)]
plt.hist(to_hist, bins=100)
plt.show()

In [None]:
# plot wap change v.s. imbalance size change for one stock on one day
sub_df = df[(df.stock_id == 0) & (df.date_id == 0)]

fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(sub_df['imb_size_change'], sub_df['wap_chg_1'], s=1)
# imb_size_change is an absolute .diff(), not a percentage, so the axis label
# carries no "(%)"; wap_chg_1 is scaled by 100 and keeps it.
ax.set_xlabel('Imbalance Size Change')
ax.set_ylabel('WAP Change (%)')
ax.set_title('WAP Change v.s. Imbalance Size Change')
# ax.set_xlim([-100, 100])
plt.show()

In [None]:
import pandas_market_calendars as mcal

# Dates flagged as quadruple-witching days, as YYYYMMDD strings.
four_witches = [
    '20210917', '20211217',
    '20220318', '20220617', '20220916', '20221216', 
    '20230317', '20230616', '20230915', '20231215'
    ]

# Dates flagged as FOMC days, as YYYYMMDD strings.
# NOTE(review): some entries (e.g. 20210827, 20220427, 20230125, 20230315,
# 20230426) do not look like FOMC statement days — verify against the Federal
# Reserve meeting calendar.
fomc_dates = [
    '20210827', '20211103', '20211215',
    '20220126', '20220316', '20220427', '20220615', '20220727', '20220921', '20221102', '20221214',
    '20230125', '20230315', '20230426', '20230614', '20230726', '20230920', '20231101', '20231213'
    ]

# start date is 2021-08-02 in New York time
start_date = pd.to_datetime('20210802', format='%Y%m%d').tz_localize('America/New_York')

# One row per NYSE session from start_date to the end of 2023. Note that
# 'date_id' here is the session's close date as a YYYYMMDD string, not the
# integer date_id of the competition frame.
nyse = mcal.get_calendar('NYSE')
trading_days = (
    nyse.schedule(start_date=start_date, end_date='20231231')
    .reset_index()
    .assign(date_id=lambda days: days['market_close'].dt.strftime('%Y%m%d'))
    .loc[:, ['date_id', 'market_close']]
)

# Event flags plus a running session counter taken from the 0-based row index.
trading_days = trading_days.assign(
    four_witches=trading_days['date_id'].isin(four_witches),
    fomc=trading_days['date_id'].isin(fomc_dates),
    days_count=trading_days.index,
)

four_witches_days = trading_days.loc[trading_days['four_witches']]
fomc_days = trading_days.loc[trading_days['fomc']]