In [6]:
import re
import numpy as np
import pandas as pd

import lightgbm as lgbm

In [20]:
df = pd.read_pickle('./saved/data/preprocessed_df.pkl')
print(df.shape)
print(df.columns)

(5213277, 36)
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sold', 'date', 'wm_yr_wk', 'weekday', 'is_holiday', 'is_weekend',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'sell_price_diff',
       'id_sold_lag1_ma7', 'id_sold_lag1_ma7_diff', 'id_sold_lag1_ma28',
       'id_sold_lag1_ma28_diff', 'global_sold_lag1_ma7',
       'global_sold_lag1_ma7_diff', 'global_sold_lag1_ma28',
       'global_sold_lag1_ma28_diff', 'id_sold_lag0_ma7',
       'id_sold_lag0_ma7_diff', 'id_sold_lag0_ma28', 'id_sold_lag0_ma28_diff',
       'global_sold_lag0_ma7', 'global_sold_lag0_ma7_diff',
       'global_sold_lag0_ma28', 'global_sold_lag0_ma28_diff', 'sold_lag1',
       'avg_sold_per_id'],
      dtype='object')


## Dataset

In [21]:
# Dataset class

class Dataset:
    def __init__(self, df, cat_cols, num_cols, target_col):
        self.df = df
        self.cat_cols = cat_cols
        self.cat_encoded_cols = [col + "_encoded" for col in cat_cols]
        self.num_cols = num_cols
        self.target_col = target_col

    def create_lgbm_dataset(self, train_valid_test):
        assert train_valid_test in ("TRAIN", "VALID", "TEST", "ALL")
        if train_valid_test == 'ALL':
            data = self.df.loc[:, self.cat_encoded_cols + self.num_cols]
            label = self.df.loc[:, self.target_col].rename()
        else:
            data = self.df.loc[self.df["train_valid_test"] == train_valid_test, self.cat_encoded_cols + self.num_cols]
            label = self.df.loc[self.df["train_valid_test"] == train_valid_test, self.target_col]

        return lgbm.Dataset(
            data = data,
            label = label,
            feature_name = self.cat_encoded_cols + self.num_cols,
            categorical_feature = self.cat_encoded_cols,
        )

## Preprocess
### Label encoding

In [22]:
# feature selection
num_col_1= ["sell_price", "sell_price_diff", "avg_sold_per_id"]
num_col_2= ['sold_lag1']
num_col_3= [
    'id_sold_lag1_ma7', 'id_sold_lag1_ma7_diff', 
    'id_sold_lag1_ma28', 'id_sold_lag1_ma28_diff',
    'global_sold_lag1_ma7', 'global_sold_lag1_ma7_diff', 
    'global_sold_lag1_ma28', 'global_sold_lag1_ma28_diff'
]
NUM_COLS = num_col_1+num_col_2+num_col_3
CAT_COLS = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ['weekday','is_holiday','is_weekend']

TARGET = "sold"

In [23]:
df.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sold', 'date', 'wm_yr_wk', 'weekday', 'is_holiday', 'is_weekend',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'sell_price_diff',
       'id_sold_lag1_ma7', 'id_sold_lag1_ma7_diff', 'id_sold_lag1_ma28',
       'id_sold_lag1_ma28_diff', 'global_sold_lag1_ma7',
       'global_sold_lag1_ma7_diff', 'global_sold_lag1_ma28',
       'global_sold_lag1_ma28_diff', 'id_sold_lag0_ma7',
       'id_sold_lag0_ma7_diff', 'id_sold_lag0_ma28', 'id_sold_lag0_ma28_diff',
       'global_sold_lag0_ma7', 'global_sold_lag0_ma7_diff',
       'global_sold_lag0_ma28', 'global_sold_lag0_ma28_diff', 'sold_lag1',
       'avg_sold_per_id'],
      dtype='object')

In [45]:
from sklearn.preprocessing import StandardScaler

class FeaturePreprocessing:
    def __init__(self, cat_cols, num_cols, target_col):
        self.cat_cols = cat_cols
        self.cat_encoded_cols = [col + "_encoded" for col in cat_cols]
        self.num_cols = num_cols
        self.num_scaled_cols = [col + "_scaled" for col in num_cols]
        self.target_col = target_col

    def train_val_test_split(self, df):
        # train valid test split
        df.loc[df.d<1914, 'train_valid_test'] = 'TRAIN'
        df.loc[(df.d>=1914) & (df.d<1942), 'train_valid_test'] = 'VALID'
        df.loc[df.d>=1942, 'train_valid_test'] = 'TEST'
        return df

    def label_encoding(self, df):
        # Label encoding for cat cols
        labels = {}
        for col in self.cat_cols:
            df[col+'_encoded'] = df[col].astype('category')
            label = dict(zip(df[col+'_encoded'].cat.codes, df[col+'_encoded']))
            labels[col] = label
            df[col+'_encoded'] = df[col+'_encoded'].cat.codes
        return df, labels

    def scaling_for_num(self, df):
        # Standard scaling for num cols
        scaler = StandardScaler()

        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.dropna(subset=self.num_cols)
        
        train_df = df.loc[df.train_valid_test=='TRAIN', :]
        val_df = df.loc[df.train_valid_test=='VALID', :]
        test_df = df.loc[df.train_valid_test=='TEST', :]

        # fit the scaler to the training data
        scaler.fit(train_df[self.num_cols])

        # transform the numerical features in both training and validation sets
        train_df[self.num_scaled_cols] = scaler.transform(train_df[self.num_cols])
        val_df[self.num_scaled_cols] = scaler.transform(val_df[self.num_cols])
        test_df[self.num_scaled_cols] = scaler.transform(test_df[self.num_cols])

        df = pd.concat([train_df, val_df, test_df], axis=0)

        return df

In [51]:
train_df = df.loc[df.train_valid_test=='TRAIN', NUM_COLS]
# np.isinf(train_df.iloc[:,5].values).sum()
np.isinf(train_df.iloc[:,9].values).sum()

9315

In [70]:
# df[df['id_sold_lag1_ma7_diff']==np.inf]
df.loc[2254104, ['id', 'd', 'id_sold_lag1_ma7', 'id_sold_lag1_ma7_diff']]

id                       FOODS_3_707_CA_4_evaluation
d                                               1799
id_sold_lag1_ma7                            0.285714
id_sold_lag1_ma7_diff                            inf
Name: 2254104, dtype: object

In [83]:
a = df.loc[(df.id=='FOODS_3_707_CA_4_evaluation'), ['id', 'd', 'id_sold_lag1_ma7', 'id_sold_lag1_ma7_diff']]
a

Unnamed: 0,id,d,id_sold_lag1_ma7,id_sold_lag1_ma7_diff
2254104,FOODS_3_707_CA_4_evaluation,1799,0.285714,inf
2254105,FOODS_3_707_CA_4_evaluation,1800,0.285714,
2254106,FOODS_3_707_CA_4_evaluation,1801,0.285714,
2254107,FOODS_3_707_CA_4_evaluation,1802,0.285714,
2254108,FOODS_3_707_CA_4_evaluation,1803,0.285714,
...,...,...,...,...
2254270,FOODS_3_707_CA_4_evaluation,1965,0.000000,
2254271,FOODS_3_707_CA_4_evaluation,1966,0.000000,
2254272,FOODS_3_707_CA_4_evaluation,1967,0.000000,
2254273,FOODS_3_707_CA_4_evaluation,1968,0.000000,


In [84]:
a['id_sold_lag1_ma7'].diff(1)

2254104    NaN
2254105    0.0
2254106    0.0
2254107    0.0
2254108    0.0
          ... 
2254270    0.0
2254271    0.0
2254272    0.0
2254273    0.0
2254274    0.0
Name: id_sold_lag1_ma7, Length: 171, dtype: float64

In [88]:
idx = df[df['global_sold_lag1_ma7_diff']==np.inf].index
df.loc[idx, ['id', 'd', 'global_sold_lag1_ma7', 'global_sold_lag1_ma7_diff']]

Unnamed: 0,id,d,global_sold_lag1_ma7,global_sold_lag1_ma7_diff
211356,FOODS_1_126_TX_3_evaluation,1799,30401.714286,inf
211357,FOODS_1_126_TX_3_evaluation,1800,35064.142857,inf
211358,FOODS_1_126_TX_3_evaluation,1801,35972.714286,inf
211359,FOODS_1_126_TX_3_evaluation,1802,37253.000000,inf
211360,FOODS_1_126_TX_3_evaluation,1803,38214.285714,inf
...,...,...,...,...
5140219,HOUSEHOLD_2_474_CA_3_evaluation,1929,44961.571429,inf
5140221,HOUSEHOLD_2_474_CA_3_evaluation,1931,44634.714286,inf
5140223,HOUSEHOLD_2_474_CA_3_evaluation,1933,44635.857143,inf
5140224,HOUSEHOLD_2_474_CA_3_evaluation,1934,44645.714286,inf


In [89]:
b = df.loc[(df.id=='FOODS_1_126_TX_3_evaluation'), ['id', 'd', 'global_sold_lag1_ma7', 'global_sold_lag1_ma7_diff']]
b

Unnamed: 0,id,d,global_sold_lag1_ma7,global_sold_lag1_ma7_diff
211356,FOODS_1_126_TX_3_evaluation,1799,30401.714286,inf
211357,FOODS_1_126_TX_3_evaluation,1800,35064.142857,inf
211358,FOODS_1_126_TX_3_evaluation,1801,35972.714286,inf
211359,FOODS_1_126_TX_3_evaluation,1802,37253.000000,inf
211360,FOODS_1_126_TX_3_evaluation,1803,38214.285714,inf
...,...,...,...,...
211522,FOODS_1_126_TX_3_evaluation,1965,0.000000,
211523,FOODS_1_126_TX_3_evaluation,1966,0.000000,
211524,FOODS_1_126_TX_3_evaluation,1967,0.000000,
211525,FOODS_1_126_TX_3_evaluation,1968,0.000000,


In [90]:
b['global_sold_lag1_ma7'].diff(1)

211356            NaN
211357    4662.428571
211358     908.571429
211359    1280.285714
211360     961.285714
             ...     
211522       0.000000
211523       0.000000
211524       0.000000
211525       0.000000
211526       0.000000
Name: global_sold_lag1_ma7, Length: 171, dtype: float64

In [47]:
preprocesser = FeaturePreprocessing(CAT_COLS, NUM_COLS, TARGET)
# df1 = preprocesser.train_val_test_split(df)
# df2, labels = preprocesser.label_encoding(df)
df3 = preprocesser.scaling_for_num(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.num_scaled_cols] = scaler.transform(train_df[self.num_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.num_scaled_cols] = scaler.transform(train_df[self.num_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.num_scaled_cols] = scaler.transform(train

In [50]:
df3[['id_sold_lag1_ma7_diff', 'id_sold_lag1_ma7_diff_scaled']]

Unnamed: 0,id_sold_lag1_ma7_diff,id_sold_lag1_ma7_diff_scaled
0,0.355280,0.852315
1,0.000000,-0.005627
2,-0.177640,-0.434598
3,0.000000,-0.005627
4,0.177640,0.423344
...,...,...
5212565,0.552124,1.327661
5212736,1.361905,3.283151
5212907,0.000000,-0.005627
5213078,-2.042857,-4.938794
