In [21]:
import gc
import os

import lightgbm as lg
import numpy as np
import pandas as pd
from tqdm import tqdm

In [22]:
# Read the training and test sets; the first CSV column is used as the index.
df, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../train.csv', '../test.csv')
)

In [23]:
def feature_engineering(data_frame):
    '''
    Add derived features and rescale the small-valued columns.

    DataFrame -> DataFrame

    The frame is modified IN PLACE and the same object is returned, so both
    ``feature_engineering(df)`` and ``df = feature_engineering(df)`` work.

    Required columns: x0, x1, x2, x3A, x3B, x3C, x3D, x3E, x4, x5 and Day.

    Added columns:
      * new          : x3B - x5 (computed before rescaling, then scaled)
      * new2         : x3C - x4 (computed before rescaling, then scaled)
      * Day_group_10 : 1-based index of the 10-day bucket that Day falls in

    Note: the function is not idempotent -- calling it twice on the same
    frame multiplies the scaled columns a second time. Reload the data
    before re-running it.
    '''
    # creating new features (differences are taken on the raw, unscaled values)
    data_frame['new'] = data_frame['x3B'] - data_frame['x5']
    data_frame['new2'] = data_frame['x3C'] - data_frame['x4']

    # bucket edges 0, 10, ..., 720; np.digitize returns 1 for Day in [0, 10)
    day_bin_edges = list(range(0, 730, 10))
    data_frame['Day_group_10'] = np.digitize(data_frame.Day, day_bin_edges)

    # scaling up "small" features
    small_features_1 = ['x0', 'x2', 'x4']
    small_features_2 = ['x3A', 'x1', 'x3B', 'x3C', 'x3D', 'x3E', 'x5', 'new', 'new2']
    data_frame[small_features_1] = data_frame[small_features_1] * 1000
    data_frame[small_features_2] = data_frame[small_features_2] * 100000

    # return the mutated frame so the documented DataFrame -> DataFrame contract holds
    return data_frame

In [24]:
# feature_engineering modifies each frame in place: train first, then test
for _frame in (df, df_test):
    feature_engineering(_frame)

In [25]:
# X_train and X_test
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and no longer accepts from 2.0 on.
# 'y' is the target and 'Weight' the per-row sample weight (both passed to
# LightGBM separately); the raw 'Day' column is dropped from both frames.
X_train = df.drop(columns=['y', 'Weight', 'Day'])
X_test = df_test.drop(columns=['Day'])
Y = df.y

In [26]:
# preview the first five rows of the training features
X_train.head(n=5)

Unnamed: 0_level_0,Market,Stock,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,x6,new,new2,Day_group_10
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1,363,2.696447,53004.016031,400.809729,0.130005,0.463953,1.073594,2.180516,4.416943,2.212166,0.062147,200.0,0.401806,-220.143023,1
1,1,1223,81.09336,136621.840843,1325.157127,1.656861,3.024668,5.261872,4.193762,4.193762,10.932,3.414868,300.0,-0.3902,-1087.938128,1
2,1,1372,9.561375,46487.097469,328.007564,0.006501,0.023232,0.051827,0.108083,0.203044,0.30592,0.004079,159.692076,0.019153,-30.54014,1
3,1,1942,45.300104,60822.21702,456.662307,0.001929,0.006495,0.013385,0.026442,0.069808,0.204603,0.00095,212.160378,0.005545,-20.446947,1
4,1,2712,1.605387,47671.768982,205.068902,0.067599,0.263878,0.642452,1.189923,2.06445,1.524508,0.034284,153.724351,0.229594,-151.808374,1


# Train a LightGBM model

In [None]:
# Load the training data into a LightGBM Dataset.
# - the target is scaled up by 10000 (the predictions are divided by 10000
#   again when the submission Series is built)
# - the first two columns (positions 0 and 1: Market and Stock in the
#   X_train preview) are treated as categorical features
# - per-row weights come from df.Weight
# - free_raw_data=False keeps the raw data after the Dataset is constructed
train = lg.Dataset(X_train, Y*10000, categorical_feature=[0, 1], weight=df.Weight, free_raw_data=False)

# Hyperparameters for the model.
# Numeric values are passed as numbers rather than strings.
# The former 'colsample_bytree': '0.670' entry was removed: it is an alias of
# 'feature_fraction', and when both are given LightGBM keeps 'feature_fraction'
# and ignores the alias with a warning, so the effective value stays 0.45.
# NOTE(review): 'bagging_freq' is set without 'bagging_fraction'; with
# LightGBM's default bagging_fraction of 1.0 this presumably has no effect --
# confirm whether bagging was intended.
parameters = {
    'num_leaves': 526,
    'max_bin': 650,
    'feature_fraction': 0.45,
    'learning_rate': 0.009,
    'reg_lambda': 3,
    'bagging_freq': 2,
    'min_data_in_leaf': 142,
    'metric': 'rmse',
    'verbose': 1,
}

boosts = 900
num_ensembles = 15
y_pred = 0.0

# Average the predictions of `num_ensembles` models. Each member gets its own
# seed and a slightly different number of boosting rounds (boosts + i + 15).
for i in tqdm(range(num_ensembles)):
    parameters['seed'] = i * 2332
    model = lg.train(parameters, train_set=train, num_boost_round=boosts + i + 15)
    y_pred += model.predict(data=X_test)
y_pred /= num_ensembles
gc.collect()

 47%|████▋     | 7/15 [24:36<28:07, 210.97s/it]

# Saving predictions 

In [14]:
# Undo the x10000 target scaling and wrap the predictions as a Series named 'y'
# whose index is labelled 'Index'.
# NOTE(review): the index is the default 0..n-1 range, not X_test.index --
# confirm that this matches the ids expected in the submission file.
yp = pd.Series(y_pred.flatten() / 10000, name='y').rename_axis('Index')
yp.head()

Index
0    0.000562
1   -0.000217
2   -0.000009
3    0.000029
4    0.000553
Name: y, dtype: float64

In [20]:
# base name (without the .csv extension) of the submission file written below
name = 'model_x'

In [18]:
# Write the predictions to ../sub/<name>.csv with a header row.
# Create the output directory first so the write does not fail (and the
# long training run is not wasted) when ../sub does not exist yet.
os.makedirs('../sub', exist_ok=True)
yp.to_csv('../sub/{}.csv'.format(name), header=True)