### An attempt to preprocess the date features: drop duplicate columns, apply a Box-Cox transform, and standardize

In [1]:
import sys
import time
import numpy as np
import pandas as pd
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
from pandas.core.common import array_equivalent
import pickle

def duplicate_columns(frame):
    """Return the labels of columns that are repeated by a later column.

    Columns are only compared with other columns of the same dtype. A column
    is reported when some column to its right (within its dtype group) holds
    the same values; missing values in matching positions count as equal.
    The right-most member of every set of identical columns is therefore
    never reported, so it survives ``frame.drop(dups, axis=1)``.

    Args:
        frame: DataFrame whose columns are checked.

    Returns:
        list: Labels of the redundant columns. Within one dtype the labels
        follow column order; dtype groups appear in first-seen order.
    """
    # Bucket column *positions* by dtype. Working with positions means each
    # column is fetched exactly once via ``iloc`` instead of re-selecting the
    # whole group by label.
    positions_by_dtype = {}
    for position, dtype in enumerate(frame.dtypes):
        positions_by_dtype.setdefault(dtype, []).append(position)

    labels = frame.columns
    dups = []

    for positions in positions_by_dtype.values():
        columns = [frame.iloc[:, position] for position in positions]
        for i, candidate in enumerate(columns):
            # Series.equals is the public, NaN-aware comparison; it replaces
            # the private pandas helper ``array_equivalent``. ``any`` stops at
            # the first match, like the original ``break``.
            if any(candidate.equals(other) for other in columns[i + 1:]):
                dups.append(labels[positions[i]])

    return dups

def scale_data(X, scaler=None):
    """Transform ``X`` with ``scaler``, fitting a new one when none is given.

    Args:
        X: Data passed to ``scaler.fit`` (only when a scaler is created here)
            and to ``scaler.transform``.
        scaler: Object exposing ``transform``. When ``None``, a
            ``StandardScaler`` is created and fitted on ``X`` first.

    Returns:
        tuple: ``(transformed_X, scaler)`` where ``scaler`` is the object that
        performed the transform, so it can be reused on other data.
    """
    # Test identity with None rather than truthiness: a caller-supplied
    # scaler that happens to evaluate as falsy must not be silently replaced.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

# Load both splits and stack them so that de-duplication, the Box-Cox
# transform and the scaler are all decided on the same set of columns.
train = pd.read_csv("train_date.csv", index_col=0)
test = pd.read_csv("test_date.csv", index_col=0)
train_test = pd.concat([train, test])
print('Concatenation OK!')
print(train_test.shape)
# Row counts are kept so the stacked frame can be split back apart below.
ntrain = train.shape[0]
ntest = test.shape[0]
# Free the originals; only the stacked frame is needed from here on.
del(test, train)


# Drop every column that is repeated by a later column of the same dtype.
dups = duplicate_columns(train_test)
train_test = train_test.drop(dups, axis=1)
print('Unduplication OK!')
print(train_test.shape)

# Per-column skewness over the observed (non-missing) values only.
skewed_feats = train_test.apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[np.absolute(skewed_feats) > 0.3]
print(skewed_feats)
skewed_feats = skewed_feats.index
for feats in skewed_feats:
    # Fit and apply Box-Cox on the observed values only, mirroring the
    # dropna() used for the skew above. Passing the column with its NaNs
    # would feed NaNs into the lambda estimation. Missing entries stay NaN
    # and are zero-filled just before scaling.
    column = train_test[feats].astype(float)
    observed = column.notna()
    # Shift by 1.1 so the values handed to Box-Cox are positive; this
    # presumes the raw values are never <= -1.1 (boxcox raises otherwise).
    transformed, lam = boxcox(column[observed].values + 1.1)
    column[observed] = transformed
    train_test[feats] = column
print('Boxcox is sucessfull!')

# Split back into the original train / test rows.
x_train = train_test.iloc[:ntrain, :]
x_test = train_test.iloc[ntrain:, :]
# The scaler is fitted on train and test together (missing values replaced
# by 0) and then applied to each split separately.
train_test_scaled, scaler = scale_data(train_test.fillna(0))
train, _ = scale_data(x_train.fillna(0), scaler)
test, _ = scale_data(x_test.fillna(0), scaler)
print('Data is scaled!')

# scale_data returns its transform output without labels here, so the
# original index and column labels are re-attached.
train_ids = x_train.index
test_ids = x_test.index
col_train = x_train.columns
col_test = x_test.columns
train_df = pd.DataFrame(data=train, index=train_ids, columns=col_train)
test_df = pd.DataFrame(data=test, index=test_ids, columns=col_test)


# Persist the processed splits for later steps.
train_df.to_pickle('train_date_df.pkl')
test_df.to_pickle('test_date_df.pkl')

Concatenation OK!
(2367495, 1156)
Unduplication OK!
(2367495, 162)
L1_S24_D693     0.766004
L1_S24_D743     0.765321
L1_S24_D772     0.762389
L1_S24_D797     1.551421
L1_S24_D801     0.772155
L1_S24_D804     0.772160
L1_S24_D811     0.773290
L1_S24_D815     0.772203
L1_S24_D904     0.852943
L1_S24_D995     0.695631
L1_S24_D1015    0.794432
L1_S24_D1058    0.755890
L1_S24_D1112    0.779996
L1_S24_D1562   -0.641304
L1_S25_D1864   -0.441510
L1_S25_D1879   -0.442039
L1_S25_D1883   -0.437473
L1_S25_D1887   -0.441082
L1_S25_D1895   -0.440807
L1_S25_D1898   -0.440346
L1_S25_D1975   -0.441125
L1_S25_D2053   -0.442616
L1_S25_D2093   -0.449440
L1_S25_D2133   -0.430401
L1_S25_D2178   -0.437898
L1_S25_D2204    0.455918
L1_S25_D2228   -1.088434
L1_S25_D2236   -0.517359
L1_S25_D2238   -0.520597
L1_S25_D2240   -0.521754
                  ...   
L1_S25_D2790   -1.397377
L1_S25_D2796    2.279939
L1_S25_D2798   -1.396426
L1_S25_D2874   -1.392283
L1_S25_D2952   -1.394953
L1_S25_D2992   -1.390911
L1_S25_D