<a href="https://colab.research.google.com/github/shlear/MLDM-2022/blob/main/exam-project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np                                  
import pandas as pd                                  
from scipy.stats import pearsonr                     
import lightgbm as lgb                               

In [2]:
from google.colab import files 
files.upload()                 

Saving kaggle.json to kaggle (6).json


{'kaggle.json': b'{"username":"egorbevz","key":"60602a5f249baeadc5d7ef09fdbe23af"}'}

In [3]:
! pip install -q kaggle                                                                  
! cp kaggle.json ~/.kaggle/                                                              
! chmod 600 ~/.kaggle/kaggle.json                                                        
! kaggle datasets download robikscube/ubiquant-parquet -f train_low_mem.parquet --path . 
! unzip train_low_mem.parquet.zip                                                        

train_low_mem.parquet.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  train_low_mem.parquet.zip
replace train_low_mem.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train_low_mem.parquet   


# Preparing Data

In [4]:
# train_low_mem.parquet is a pretty big file, so we are going to use:
# * k features
# * n data points for model training
# * the last n points for model evaluation

k = 150
n = 1000000
features = [f'f_{i}' for i in np.random.choice(300, size=k, replace=False)]                   # select randomly k columns as features

train = pd.read_parquet(
    'train_low_mem.parquet',
    columns=features+['target']
    )[-2*n:-n]                                             # Load train data (n lines before the last n lines), load only those columns listed in features and the `target`
y_train = train['target']                # Set the column `target` as a target array for model fitting
X_train = train.drop(columns=['target']) # Drop the column `target` from train, so X_train contains only features for model fitting
display(X_train, y_train)                  # Display the features table (X_train) and the target array (y_train)


Unnamed: 0,f_210,f_190,f_150,f_164,f_68,f_88,f_61,f_109,f_252,f_59,...,f_238,f_295,f_251,f_135,f_201,f_153,f_250,f_247,f_104,f_231
1141410,-0.410345,1.170076,-1.296607,-0.801438,3.592265,-2.734037,-0.641886,-1.090109,1.031235,1.061435,...,0.598805,0.225239,0.854028,1.268696,-0.745468,0.089604,-0.739092,1.414989,0.124214,-0.257197
1141411,-0.435393,-0.700612,0.323990,-0.058650,-0.715890,1.006685,-0.305377,0.103530,-0.250939,-0.394424,...,-0.349036,-0.515850,-0.353739,-0.486068,0.353682,0.089604,-0.581432,-0.644428,0.124214,-0.257197
1141412,-0.313765,-0.424364,0.323990,1.297214,1.591063,-0.774341,0.672532,-0.438877,1.176513,-0.180308,...,0.243001,-0.737802,-0.310315,0.287409,-0.219750,0.089604,-0.652053,0.119740,0.124214,-0.257173
1141413,-0.428634,-0.209865,0.323990,-0.041518,-0.982865,1.099095,-0.210431,0.489880,-0.993321,-0.299045,...,0.253283,-0.426791,-0.382667,0.192608,0.465238,0.089604,-0.677771,-0.158316,0.124214,-0.257197
1141414,1.496469,0.915616,-0.756408,0.555650,2.369328,-1.398891,0.276653,-1.336575,1.669657,1.143206,...,0.858856,0.612243,2.626944,1.654179,1.762746,0.089604,0.154147,1.358749,0.124214,-0.257197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2141405,-0.203689,0.014614,0.043331,0.386486,-0.481815,1.163968,0.309166,0.644874,-0.416025,0.330351,...,0.691137,0.971897,0.392809,-1.283602,-0.431639,0.048979,-0.069494,0.637292,-1.165999,-0.724779
2141406,-0.365350,1.251384,0.888536,0.231324,-0.579768,-1.307315,0.026331,0.184671,0.312491,-0.220146,...,1.055939,0.010283,-0.901246,0.211223,0.063933,0.053202,0.267369,0.470939,1.730645,-0.728233
2141407,-0.445958,-0.339641,0.769395,-1.493630,0.336030,0.673196,-1.370659,0.053993,0.443358,0.215473,...,-1.285350,0.549901,-0.113562,-0.637839,-0.855980,0.066323,-0.903913,0.684112,-1.938703,0.811120
2141408,-0.363206,-0.406377,0.523003,1.363807,-0.895289,-0.131500,0.790218,0.727790,-1.490102,-0.602512,...,-0.556147,-0.699605,-0.183470,0.332354,-0.134949,0.073055,-0.919491,-1.123110,-1.782485,-0.068743


1141410   -1.705758
1141411   -0.260084
1141412    0.013137
1141413    0.841180
1141414   -0.086323
             ...   
2141405    1.304217
2141406   -0.376348
2141407   -0.854828
2141408   -0.336876
2141409    0.775711
Name: target, Length: 1000000, dtype: float32

# Create Model - LGBM

In [5]:
model_lgb = lgb.LGBMRegressor()                 # Create an instance of Light GBM regressor
model_lgb.fit(X_train, y_train)                 # Fit the Light GBM regressor

# Light GBM - Gradient Boosting on Decision Trees Algorithm is the State-of-the-Art method for tabular data

LGBMRegressor()

# Inference

In [6]:
test = pd.read_parquet(
    'train_low_mem.parquet',
    columns=features+['target']
    )[-n:]                                               # Load test data (the last n lines)
y_test = test['target']                # Set the column `target` as a target array for assessing model performance
X_test = test.drop(columns=['target']) # Drop the column `target` from test
display(X_test, y_test)                  # Display the features table (X_test) and the target array (y_test)


Unnamed: 0,f_210,f_190,f_150,f_164,f_68,f_88,f_61,f_109,f_252,f_59,...,f_238,f_295,f_251,f_135,f_201,f_153,f_250,f_247,f_104,f_231
2141410,-0.400026,-0.949098,0.391393,-0.712090,-0.375293,-0.870640,-0.562710,0.553844,-0.550476,-0.673127,...,-1.955815,-0.547006,-0.365044,-2.241614,-1.216166,0.053764,0.622444,-1.428391,-2.118947,-0.903285
2141411,-0.435742,0.731634,0.492552,0.602305,-0.860652,0.934114,0.261181,0.052347,-1.429314,-0.508010,...,0.150434,-0.445747,-0.170956,1.498474,0.074797,0.062222,-0.824149,-0.152404,1.913229,-0.253841
2141412,-0.458567,-1.147183,0.295298,-0.169923,-0.752670,-0.649304,-0.278516,0.223367,-0.793903,-0.368141,...,-0.776187,-0.527967,-0.353926,0.942096,0.396715,0.049084,-0.881491,-1.583062,2.003347,0.430136
2141413,-0.323295,-1.732075,-0.728708,-0.650269,-0.075438,-0.804429,-0.560791,-0.308643,-1.673607,-0.450222,...,-1.086235,-0.201129,-0.191936,-1.114424,-0.250617,0.067094,-0.443954,0.167276,-1.241518,-1.859841
2141414,-0.305656,0.592400,-0.320324,-0.028336,0.258547,-1.049091,0.061537,0.709200,1.190133,0.042954,...,0.952387,0.143139,-0.231174,-0.808503,-0.022975,0.068521,0.555403,-0.303344,-1.162173,-0.026034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3141405,-0.305540,-0.742637,0.289439,0.331627,0.065594,-0.087018,0.021072,0.028391,-1.031660,-0.834213,...,-0.828049,-0.428097,-0.284951,-0.061382,0.249033,0.048098,-0.918733,-0.682983,-0.420011,-0.241190
3141406,-0.276335,0.727245,0.289439,1.668355,-0.497278,0.800487,2.325182,0.324336,-1.384874,-0.640941,...,0.629430,-0.729949,-0.317360,1.118034,-1.151997,0.048098,-0.699202,-0.851513,-0.420011,-0.230738
3141407,0.462093,-0.084063,0.289439,-0.164567,-0.569171,0.736366,-0.233757,0.103172,1.233986,-0.355024,...,0.936215,-0.363329,-0.289974,0.767820,2.027001,0.048098,0.243166,1.625858,-0.420011,-0.241285
3141408,-0.329783,-1.928777,0.289439,-0.366721,0.226759,-1.197479,-0.360540,0.098895,0.696358,1.051668,...,-1.309645,-0.375288,-0.281895,-2.028057,-0.086129,0.048098,-0.741613,-0.489971,-0.420011,-0.241190


2141410    0.678174
2141411    0.059027
2141412    0.274764
2141413    0.159570
2141414   -1.265211
             ...   
3141405    0.033600
3141406   -0.223264
3141407   -0.559415
3141408    0.009599
3141409    1.212112
Name: target, Length: 1000000, dtype: float32

In [7]:
prediction_train = model_lgb.predict(X_train) # Predict values for train set using the fitted model
prediction_test = model_lgb.predict(X_test)   # Predict values for test set using the fitted model 

# Evaluation

In [8]:
print(
    'Train Pearson r:', pearsonr(y_train, prediction_train)[0], '\n',
    'Test Pearson r:', pearsonr(y_test, prediction_test)[0],
) # Display Pearson correlation coefficient for train and test sets

Train Pearson r: 0.24688431537568206 
 Test Pearson r: 0.1140020617815317
