In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import seaborn as sns
from scipy.sparse import vstack, hstack, csr_matrix, save_npz, load_npz
sns.set()
import gc

from utils.schemas import *
from utils.functions import *

In [2]:
df_imp = pd.read_csv('docs/ft_importances_20190811.csv')

In [3]:
df_imp.head(10)

Unnamed: 0,feature,importance
0,TransactionAmt,3.0
1,card1,2.204488
2,C13,2.070567
3,N1,1.881225
4,C1,1.691782
5,card2,1.664059
6,addr1,1.501876
7,N3,1.417283
8,D15,1.38475
9,C14,1.305814


In [4]:
X_cols = df_imp[df_imp.importance > 0.1].feature.to_list()
# X_cols = df_imp.feature[:75].to_list()

In [3]:
data_folder = 'input'

In [4]:
train = pd.read_csv(data_folder+'/train_ft_eng_1.csv', dtype = schema_ft_eng_1)
test = pd.read_csv(data_folder+'/test_ft_eng_1.csv', dtype = schema_ft_eng_1)

In [5]:
train = csr_matrix(train)
test = csr_matrix(test)
gc.collect()

429

In [6]:
print('Saving train data...')
save_npz(data_folder+'/compressed_train_ft_eng_1.npz', train.tocsr(), compressed=True)
print('Saving test data...')
save_npz(data_folder+'/compressed_test_ft_eng_1.npz',  test.tocsr(),  compressed=True)
print('Completed!')

Saving train data...
Saving test data...
Completed!


In [4]:
train.shape

(590540, 576)

In [8]:
train.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,group_3_n_8,group_1_n_16,group_2_n_16,group_3_n_16,group_1_n_32,group_2_n_32,group_3_n_32,group_1_n_64,group_2_n_64,group_3_n_64
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,6,0,0,16,0,0,49,3
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,18,0,0,25,3
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,18,0,0,25,3
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,0,0,0,0,22,0,0,33,17,3
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,4,0,2,15,0,15,26,0,45,15


In [8]:
test['isFraud'] = np.nan
train_index = train.shape[0]
test_index = test.shape[0]
df = pd.concat([train, test], axis=0)
df.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


(1097231, 77)

In [9]:
del train
del test
gc.collect()

21

In [10]:
cat_ft_sel = list()
for c in df.columns:
    if c in cat_ft:
        cat_ft_sel.append(c)
cat_ft_sel[:4]

['DeviceInfo', 'M5', 'M6', 'M6_fe2']

In [11]:
def del_col_same_value(df):
    new_sel_cols = list()
    for c in df.columns:
        if len(df[c].unique()) > 1:
            new_sel_cols.append(c)
    return df[new_sel_cols]
    

def from_df_to_sparse(df):
    
    df_dummies = csr_matrix(pd.DataFrame())
    
    for c in X_cols:
        if c in cat_ft_sel and c not in ['isFraud', 'TransactionDT']:
            if len(df[c].unique()) > 1000:
                dums = csr_matrix(del_col_same_value(pd.get_dummies(pd.cut(df[c], 256), prefix=c, dummy_na=False)))
                df_dummies = hstack([df_dummies, dums])
            else:
                dums = csr_matrix(del_col_same_value(pd.get_dummies(df[c], prefix=c, dummy_na=False)))
                df_dummies = hstack([df_dummies, dums])
        elif c not in cat_ft_sel + ['isFraud', 'TransactionDT']:
            if df[c].isna().sum() == 0:
                dums = csr_matrix(pd.get_dummies(pd.cut(df[c], 256), prefix=c, dummy_na=False))
                df_dummies = hstack([df_dummies, dums])
            else:
                dums = csr_matrix(del_col_same_value(pd.get_dummies(pd.cut(df[c], 256), prefix=c, dummy_na=True)))
                df_dummies = hstack([df_dummies, dums])
        else:
            pass
            
    return df_dummies

In [12]:
df_final = from_df_to_sparse(df)
df_final.shape

(1097231, 13045)

In [13]:
#Transform data using small groups to reduce memory usage
# m = 50000
# train_sparse = vstack([csr_matrix(train[X_cols].iloc[i*m:(i+1)*m,:]) for i in range(train.shape[0] // m + 1)])
print('Saving train data...')
save_npz(data_folder+'/train.npz', df_final.tocsr()[:train_index], compressed=True)
print('Saving test data...')
save_npz(data_folder+'/test.npz',  df_final.tocsr()[train_index:],  compressed=True)
print('Completed!')

Saving train data...
Saving test data...
Completed!


In [78]:
train_sparse.shape

(590540, 243)