In [1]:
import joblib
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

###  Loading the data

In [2]:
data = pd.read_csv("./dataset.csv")

In [3]:
# all columns selection
columns = list(data.columns)
columns.remove("y")

In [4]:
data_X = data[columns]
data_y = data[["y"]]

###  Train Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data_X.values, 
                                                 data_y.values.ravel(),
                                                 test_size=0.3,
                                                 shuffle=True,
                                                 random_state=42)

### Preprocessing steps

```
1. Remove x067, x094, x095, x096 - constant columns
2. Remove highly correlated binary variables - x246, x247, x261, x262, x270, x271, x282, x284, x300
3. Do the following mapping on categoprical or ordinal variables - ```
Categorical variables mapping:

x068 => {>=1 : 1}
x077 => {>=1 : 1}
x078 => {>=1 : 1}
x079 => {>=1 : 1}
x156 => {>=1 : 1}
x252 => {>=1 : 1}
[67, 76, 77, 78, 155, 151]

x037 => {>=2 : 2}
x049 => {>=2 : 2}
x050 => {>=2 : 2}
x051 => {>=2 : 2}
x052 => {>=2 : 2}
x053 => {>=2 : 2}
x107 => {>=2 : 2}
[36, 48, 49, 50, 51, 52, 106]

x023 => {>=3 : 3}
x155 => {>=3 : 3}
x252 => {>=3 : 3}
[22, 154, 251, 37, 38, 45, 46, 47, 53, 54, 60, 68, 79, 99, 100, 101, 107, 111, 121, 122, 148, 162, 168, 173, 174, 175, 176, 177, 178, 181, 182, 196, 227, 228, 229, 240, 250, 253]

x022 => {>=4 : 4}
x148 => {>=4 : 4}
x162 => {>=4 : 4}
x287 => {>=4 : 4}
x302 => {>=4 : 4}
[21, 147, 161, 286, 301]

x019 => {>=5 : 5}
[18]

x018 => {>=8 : 8}
[17]

['x038', 'x039', 'x046', 'x047', 'x048',
 'x054', 'x055', 'x061', 'x069', 'x080',
 'x100', 'x101', 'x102', 'x108', 'x112',
 'x122', 'x123', 'x149', 'x163', 'x169',
 'x174', 'x175', 'x176', 'x177', 'x178',
 'x179', 'x182', 'x183', 'x197', 'x228',
 'x229', 'x230', 'x241', 'x251', 'x254'] => {>=3 : 3}
 
```
```

In [6]:
def func_map(x, thresh):
    if x < thresh:
        return x
    else:
        return thresh
vfunc_map = np.vectorize(func_map)

In [7]:
class CustomTransform1(BaseEstimator, TransformerMixin):
    '''
    1. Remove x067, x094, x095, x096 - constant columns
    2. Remove highly correlated binary variables - x246, x247, x261, x262, x270, x271, x282, x284, x300
    Since using hard coded index, this transformation must be used at first
    3. Mapping of categorical varibles as mentioned above
    '''
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_ = X.copy()
        list_1 = [67, 76, 77, 78, 155, 151]
        list_2 = [36, 48, 49, 50, 51, 52, 106]
        list_3 = [22, 154, 251, 37, 38, 45, 46, 
                  47, 53, 54, 60, 68, 79, 99, 100, 
                  101, 107, 111, 121, 122, 148, 162, 
                  168, 173, 174, 175, 176, 177, 178, 
                  181, 182, 196, 227, 228, 229, 240, 250, 253]
        list_4 = [21, 147, 161, 286, 301]
        list_5 = [18]
        list_8 = [17]
        
        X_[:,list_1] = vfunc_map(X_[:,list_1], 1)
        X_[:,list_2] = vfunc_map(X_[:,list_2], 2)
        X_[:,list_3] = vfunc_map(X_[:,list_3], 3)
        X_[:,list_4] = vfunc_map(X_[:,list_4], 4)
        X_[:,list_5] = vfunc_map(X_[:,list_5], 5)
        X_[:,list_8] = vfunc_map(X_[:,list_8], 8)
        
        X_ = np.delete(X_, [66, 93, 94, 95, 245, 246, 260, 261, 269, 270, 281, 283, 299], 1)
        return X_

In [8]:
class CustomTransform2(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.imputer = SimpleImputer(verbose=1, strategy="most_frequent")
        # self.imputer = IterativeImputer(max_iter=10, random_state=42, verbose=1)
    def fit(self, X, y=None):
        X_ = X.copy()
        self.imputer.fit(X_)
        return self
    
    def transform(self, X, y=None):
        X_ = X.copy()
        imputed = self.imputer.transform(X_)
        return imputed

In [9]:
regressor = GradientBoostingRegressor(loss='lad', 
                                      learning_rate=0.08, 
                                      n_estimators=250, # 200
                                      subsample=1.0, 
                                      criterion='friedman_mse', 
                                      min_samples_split=2, 
                                      min_samples_leaf=1, 
                                      min_weight_fraction_leaf=0.0, 
                                      max_depth=12,
                                      min_impurity_decrease=0.0, 
                                      min_impurity_split=None, 
                                      init=None, 
                                      random_state=42, 
                                      max_features=250, # 200, log2, sqrt
                                      alpha=0.9, 
                                      verbose=3, 
                                      max_leaf_nodes=None, 
                                      warm_start=False, 
                                      validation_fraction=0.1, 
                                      n_iter_no_change=2, 
                                      tol=0.0001, 
                                      ccp_alpha=0.0)

time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
model_name = 'models/model-' + time_stamp + '.pkl'

```
1. increase lr
2. reduce max_features

```

In [10]:
pipeline=Pipeline([('custom1', CustomTransform1()),
                      ('imputer1', CustomTransform2()),
                      # ('scalar1',StandardScaler()),
                      ('regressor', regressor)])

In [11]:
# # KFold cross validation
# kfold = KFold(n_splits=10, random_state=42)
# cv_results = cross_val_score(pipeline, 
#                              data.values[:, :-1], 
#                              data.values[:, -1], 
#                              cv=kfold, 
#                              scoring='mean_squared_error')
# print(cv_results)

In [12]:
model = pipeline.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1          92.7760           23.44m
         2          85.7114           23.44m
         3          79.2712           23.25m
         4          73.4302           23.08m
         5          68.0872           23.18m
         6          63.2046           23.07m
         7          58.7855           22.87m
         8          54.7497           23.00m
         9          51.0249           22.78m
        10          47.6989           22.62m
        11          44.6644           22.47m
        12          41.8982           22.31m
        13          39.3537           22.18m
        14          37.0174           22.02m
        15          34.8817           21.90m
        16          32.9400           21.74m
        17          31.1575           21.60m
        18          29.5111           21.45m
        19          28.0006           21.30m
        20          26.6403           21.17m
        21          25.3820           21.03m
        2

In [13]:
# Saving the entire pipeline model as pkl file
joblib.dump(pipeline, model_name, compress=1)

['models/model-20210328-184456.pkl']

### Prediction and Evaluation 

In [14]:
# Loading the model
# model = joblib.load(model_name)

In [15]:
predictions = model.predict(X_test).reshape(-1,1).astype(int).ravel()
predictions_train = model.predict(X_train).reshape(-1,1).astype(int).ravel()

print("Test RMSE: ")
print(mean_squared_error(y_test, predictions))
print("\n")
print("Train RMSE: ")
print(mean_squared_error(y_train, predictions_train))

Test RMSE: 
752.5397333333333


Train RMSE: 
360.3723


In [16]:
print("Train score: ", model.score(X_train, y_train))

Train score:  0.9743364406283166


In [17]:
print("Test score: ", model.score(X_test, y_test))

Test score:  0.9464559196545642


In [18]:
y_test.shape

(30000,)

In [19]:
predictions.shape

(30000,)

In [20]:
def accuracy(target: np.array, predictions: np.array, thresh=3.) -> float:
    diff = np.abs(target.ravel() - predictions.ravel()) <= thresh
    acc = np.round((diff.sum()/diff.shape[0]) * 100, 2)
    return acc

In [21]:
# Accuracy

thresh = 3

print(f"Train Acc: {accuracy(y_train, predictions_train, thresh)}%")

print(f"Test Acc: {accuracy(y_test, predictions, thresh)}%")

Train Acc: 44.58%
Test Acc: 18.77%


In [22]:
# print((y_test.astype(int)[:15]))
# print(predictions[:15])