In [1]:
from google.colab import drive; drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install --upgrade --force-reinstall --no-deps kaggle > log  # upgrade kaggle package (to avoid a warning)
!mkdir -p ~/.kaggle                                           # .kaggle folder must contain kaggle.json for kaggle executable to properly authenticate you to Kaggle.com
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json >log  # First, download kaggle.json from kaggle.com (in Account page) and place it in the root of mounted Google Drive
#!cp kaggle.json ~/.kaggle/kaggle.json > log                   # Alternative location of kaggle.json (without a connection to Google Drive)
!chmod 600 ~/.kaggle/kaggle.json                              # give only the owner full read/write access to kaggle.json
!kaggle config set -n competition -v optiver-trading-at-the-close        # set the competition context for the next few kaggle API calls. !kaggle config view - shows current settings
!kaggle competitions download >> log                          # download competition dataset as a zip file
!unzip -o *.zip >> log                                        # Kaggle dataset is copied as a single file and needs to be unzipped.
# !kaggle competitions leaderboard --show                       # print public leaderboard

- competition is now set to: optiver-trading-at-the-close
100% 201M/201M [00:07<00:00, 27.1MB/s]


In [3]:
%%time
%%capture
%reset -f
import numpy as np, pandas as pd, time, matplotlib.pyplot as plt, seaborn as sns, os, tqdm, re, sys, cv2, skimage, xgboost, lightgbm as lgb, librosa

ToCSV = lambda df, fname: df.round(2).to_csv(f'{fname}.csv', index_label='id') # rounds values to 2 decimals

class Timer():
  def __init__(self, lim:'RunTimeLimit'=60): self.t0, self.lim, _ = time.time(), lim, print(f'⏳ started. You have {lim} sec. Good luck!')
  def ShowTime(self):
    msg = f'Runtime is {time.time()-self.t0:.0f} sec'
    print(f'\033[91m\033[1m' + msg + f' > {self.lim} sec limit!!!\033[0m' if (time.time()-self.t0-1) > self.lim else msg)

np.set_printoptions(linewidth=100, precision=2, edgeitems=2, suppress=True)
pd.set_option('display.max_columns', 20, 'display.precision', 2, 'display.max_rows', 4)

CPU times: user 2.22 s, sys: 343 ms, total: 2.56 s
Wall time: 4.14 s


In [4]:
def is_google_colab():
    """Check if the environment is Google Colab.

    Returns:
        bool: True if in Google Colab, False otherwise.
    """
    try:
        import google.colab
        return True
    except ImportError:
      return False

if is_google_colab():
  file_path = ''
else:
  file_path = '/kaggle/input/optiver-trading-at-the-close/'

In [5]:
sample_submission_path = file_path + 'example_test_files/sample_submission.csv'

sample_submission = pd.read_csv(sample_submission_path); sample_submission

Unnamed: 0,time_id,row_id,target
0,26290,478_0_0,1
1,26290,478_0_1,1
...,...,...,...
32998,26454,480_540_198,1
32999,26454,480_540_199,1


# Load Data

In [6]:
tmr = Timer()

⏳ started. You have 60 sec. Good luck!


In [7]:
train_data_path = file_path + 'train.csv'
df = pd.read_csv(train_data_path);
df = df.dropna(subset=['target'])
df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3.18e+06,1,1.0,1.34e+07,,,1.0,60651.50,1.0,8493.03,1.0,-3.03,0,0_0_0
1,1,0,0,1.67e+05,-1,1.0,1.64e+06,,,1.0,3233.04,1.0,20605.09,1.0,-5.52,0,0_0_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237978,198,480,540,1.00e+06,1,1.0,9.48e+07,1.0,1.0,1.0,125631.72,1.0,669893.00,1.0,-1.54,26454,480_540_198
5237979,199,480,540,1.88e+06,-1,1.0,2.41e+07,1.0,1.0,1.0,250081.44,1.0,300167.56,1.0,-6.53,26454,480_540_199


In [8]:
test_data_path = file_path + 'example_test_files/test.csv'
df_test  = pd.read_csv(test_data_path); df_test

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,time_id,row_id
0,0,478,0,3.75e+06,-1,1.0,1.15e+07,,,1.0,22940.00,1.0,9177.60,1.0,26290,478_0_0
1,1,478,0,9.86e+05,-1,1.0,3.85e+06,,,1.0,1967.90,1.0,19692.00,1.0,26290,478_0_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32998,198,480,540,1.00e+06,1,1.0,9.48e+07,1.0,1.0,1.0,125631.72,1.0,669893.00,1.0,26454,480_540_198
32999,199,480,540,1.88e+06,-1,1.0,2.41e+07,1.0,1.0,1.0,250081.44,1.0,300167.56,1.0,26454,480_540_199


In [9]:
print('Shape of training data =', df.shape)
print('Shape of testing data  =', df_test.shape)

Shape of training data = (5237892, 17)
Shape of testing data  = (33000, 16)


In [10]:
df['target'].info()

<class 'pandas.core.series.Series'>
Int64Index: 5237892 entries, 0 to 5237979
Series name: target
Non-Null Count    Dtype  
--------------    -----  
5237892 non-null  float64
dtypes: float64(1)
memory usage: 79.9 MB


In [11]:
n_val = 50000

tX, tY = df.drop('target', axis=1), df.target    # full training set

if is_google_colab():
  X_train = tX[:-n_val]
  y_train = tY[:-n_val]

  X_test_validation = tX[:n_val]
  y_test_validation = tY[:n_val]

else:
  X_train = tX
  y_train = tY

# Preprocessing

In [12]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin

In [13]:
class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Only drop columns that exist in X
        columns_to_drop = [col for col in self.columns_to_drop if col in X.columns]
        return X.drop(columns_to_drop, axis=1)


In [14]:
# Set up preprocessing pipeline
preprocess_steps = [
  ('drop_columns', DropColumns(['near_price','far_price','time_id'])),
  ('imputer', SimpleImputer(strategy='median')),
  # ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
  ('scaler', StandardScaler()),
]
pipeline = Pipeline(preprocess_steps)

X_train_processed = pipeline.fit_transform(X_train)

### Model Training

In [15]:
lr = LinearRegression()
lr.fit(X_train_processed, y_train)

# Make submission

In [16]:
from sklearn.metrics import mean_absolute_error

if is_google_colab():
  # Predict the target values
  preds = pd.DataFrame(lr.predict(pipeline.fit_transform(X_test_validation)))

  # Reset the index for all dataframes involved in the concatenation
  X_test_validation_reset = X_test_validation[['time_id', 'row_id']].reset_index(drop=True)
  preds_reset = preds.reset_index(drop=True)
  y_test_validation_reset = y_test_validation.reset_index(drop=True)

  # Concatenate the reset dataframes
  submission = pd.concat([X_test_validation_reset, preds_reset, y_test_validation_reset], axis=1)
  submission.columns = ['time_id', 'row_id', 'predicted_target', 'true_target']

  # Drop any rows with NaN values in 'true_target' or 'predicted_target'
  submission = submission.dropna(subset=['true_target', 'predicted_target'])

  # Calculate MAE
  mae = mean_absolute_error(submission['true_target'], submission['predicted_target'])

  # Print MAE
  print("Mean Absolute Error:", mae)

  # Return submission DataFrame
  submission


Mean Absolute Error: 5.571058361137725


In [17]:
if not is_google_colab():
  import optiver2023
  env = optiver2023.make_env()
  iter_test = env.iter_test()

In [18]:
if not is_google_colab():
  counter = 0
  for (test, revealed_targets, sample_prediction) in iter_test:
      sample_prediction['target'] = lr.predict(pipeline.fit_transform(test))
      env.predict(sample_prediction)
      counter += 1

In [19]:
tmr.ShowTime()    # measure Colab's runtime. Do not remove. Keep as the last cell in your notebook.

[91m[1mRuntime is 71 sec > 60 sec limit!!![0m
