In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc

from utils.schemas import *
from utils.functions import *

In [2]:
# Relative directory that the CSV files in this notebook are read from and written to.
data_folder = 'input'

In [3]:
# Load the generated train/test tables, forcing column dtypes via `schema_generated_0`
# (presumably provided by the `utils.schemas` star import -- confirm).
train = pd.read_csv('{0}/train_generated_0.csv'.format(data_folder), dtype=schema_generated_0)
test = pd.read_csv('{0}/test_generated_0.csv'.format(data_folder), dtype=schema_generated_0)

In [4]:
# Display the (rows, columns) shape of both frames.
train.shape, test.shape

((590540, 552), (506691, 552))

In [5]:
# Remember how many rows each split has so the stacked frame can be cut apart again.
train_index = train.shape[0]
test_index = test.shape[0]
# Stack train on top of test so features are computed over both splits at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it the
# test rows repeat the train index labels, which makes label-based selection and
# Series alignment ambiguous.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.shape

(1097231, 552)

In [6]:
# The per-split frames are no longer needed once `df` exists; drop the references
# and trigger a garbage-collection pass.
del train, test
gc.collect()

7

In [7]:
# Calendar-style features derived from TransactionDT
# (assumed to be an offset in seconds, given the 3600 / 3600*24 divisors -- TODO confirm).
# Day of week: whole days elapsed, shifted by one, wrapped into 0..6.
df['Transaction_day_of_week'] = np.floor(df['TransactionDT'].div(3600 * 24).sub(1).mod(7))
# Hour of day: whole hours elapsed, wrapped into 0..23.
df['Transaction_hour'] = np.floor(df['TransactionDT'].div(3600)).mod(24)

In [8]:
# N1: number of rows that share the same combination of 'card' columns
# (columns whose name also contains 'fe' are left out of the key).
cols1 = [x for x in df.columns if 'card' in x and 'fe' not in x]
# Select the counted column *before* aggregating: counting every column of the frame
# per group and then keeping one is needlessly slow and memory hungry.
# NOTE: groupby skips rows with a missing key by default, so such rows end up with NaN.
group1 = df.groupby(cols1)['TransactionDT'].count()
group1.name = 'N1'
df = df.join(group1, on=cols1)

In [9]:
# N2: number of rows that share the same combination of 'addr' columns.
cols1 = [x for x in df.columns if 'addr' in x]
# Select the counted column *before* aggregating: counting every column of the frame
# per group and then keeping one is needlessly slow and memory hungry.
# NOTE: groupby skips rows with a missing key by default, so such rows end up with NaN.
group1 = df.groupby(cols1)['TransactionDT'].count()
group1.name = 'N2'
df = df.join(group1, on=cols1)

In [10]:
# N3: number of rows that share the same combination of columns whose name contains
# 'C' (names containing 'Product' are excluded).
cols1 = [x for x in df.columns if 'C' in x and 'Product' not in x]
# Select the counted column *before* aggregating: counting every column of the frame
# per group and then keeping one is needlessly slow and memory hungry.
# NOTE: groupby skips rows with a missing key by default, so such rows end up with NaN.
group1 = df.groupby(cols1)['TransactionDT'].count().astype('int64')
group1.name = 'N3'
df = df.join(group1, on=cols1)

In [11]:
# N4: number of rows that share the same combination of 'mail' columns.
cols1 = [x for x in df.columns if 'mail' in x]
# Select the counted column *before* aggregating: counting every column of the frame
# per group and then keeping one is needlessly slow and memory hungry.
# NOTE: groupby skips rows with a missing key by default, so such rows end up with NaN.
group1 = df.groupby(cols1)['TransactionDT'].count()
group1.name = 'N4'
df = df.join(group1, on=cols1)

In [12]:
# df = df.drop(['N1', 'N2', 'N3', 'N4'], axis=1)

In [13]:
# for c in ['N1', 'N2', 'N3', 'N4']:
#     df = freq_encoder(df, c, c, 0.005)

In [12]:
from sklearn.cluster import KMeans

In [13]:
# Cluster counts to try: the powers of two from 2 up to 64.
n_cls = [2 ** exponent for exponent in range(1, 7)]

In [14]:
# Column families that are clustered independently, selected by substring of the name.
cols1 = [
    name for name in df.columns
    if 'C' in name and 'Product' not in name
]
cols2 = [
    name for name in df.columns
    if 'card' in name and 'fe' not in name
]
cols3 = [name for name in df.columns if 'V' in name]

# Order matters: the position in this list (1-based) becomes part of the feature name.
cols_group = [cols1, cols2, cols3]

In [15]:
# For every cluster count in `n_cls`, cluster each column family in `cols_group`
# and store the assigned cluster label as a new feature column.
for n in n_cls:
    # enumerate numbers the column families 1, 2, 3, ... in `cols_group` order.
    for m, cols in enumerate(cols_group, start=1):
        feature_name = 'group_{0}_n_{1}'.format(m, n)
        print('Generation feature:\t{0}'.format(feature_name))
        # Missing values are replaced by -1 because KMeans cannot be fitted on NaN.
        kmeans = KMeans(n_clusters=n, random_state=42).fit(df[cols].fillna(-1))
        df[feature_name] = kmeans.labels_

Generation feature:	group_1_n_2
Generation feature:	group_2_n_2
Generation feature:	group_3_n_2
Generation feature:	group_1_n_4
Generation feature:	group_2_n_4
Generation feature:	group_3_n_4
Generation feature:	group_1_n_8
Generation feature:	group_2_n_8
Generation feature:	group_3_n_8
Generation feature:	group_1_n_16
Generation feature:	group_2_n_16
Generation feature:	group_3_n_16
Generation feature:	group_1_n_32
Generation feature:	group_2_n_32
Generation feature:	group_3_n_32
Generation feature:	group_1_n_64
Generation feature:	group_2_n_64
Generation feature:	group_3_n_64


In [17]:
# Cut the stacked frame back into its two parts by position:
# the first `train_index` rows are train, everything after is test.
train = df.iloc[:train_index]
test = df.iloc[train_index:]

In [18]:
# Persist both splits with a header row; the row index is not written to the files.
train.to_csv('{0}/train_ft_eng_0.csv'.format(data_folder), index=None, header=True)
print('train guardado')
test.to_csv('{0}/test_ft_eng_0.csv'.format(data_folder), index=None, header=True)
print('test guardado')

train guardado
test guardado


In [5]:
# Re-load the feature-engineered splits written to `data_folder`, reusing the
# `schema_generated_0` dtype mapping.
train = pd.read_csv('{0}/train_ft_eng_0.csv'.format(data_folder), dtype=schema_generated_0)
test = pd.read_csv('{0}/test_ft_eng_0.csv'.format(data_folder), dtype=schema_generated_0)

In [6]:
# Display the (rows, columns) shape of both re-loaded frames.
train.shape, test.shape

((590540, 576), (506691, 576))

In [7]:
# Preview the first rows of the train split.
train.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,group_3_n_8,group_1_n_16,group_2_n_16,group_3_n_16,group_1_n_32,group_2_n_32,group_3_n_32,group_1_n_64,group_2_n_64,group_3_n_64
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,6,0,0,16,0,0,49,3
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,18,0,0,25,3
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,18,0,0,25,3
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,0,0,0,0,22,0,0,33,17,3
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,4,0,2,15,0,15,26,0,45,15


In [8]:
# Remember how many rows each split has so the stacked frame can be cut apart again.
train_index = train.shape[0]
test_index = test.shape[0]
# Stack train on top of test so features are computed over both splits at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it the
# test rows repeat the train index labels, which makes label-based selection and
# Series alignment ambiguous.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.shape

(1097231, 576)

In [9]:
# Ordered (substring, brand) rules used by `dev_name`. Order matters: every rule is
# matched against the names as already rewritten by the rules before it.
_DEVICE_BRAND_RULES = (
    ('SM', 'Samsung'),
    ('SAMSUNG', 'Samsung'),
    ('GT-', 'Samsung'),
    ('Moto G', 'Motorola'),
    ('Moto', 'Motorola'),
    ('moto', 'Motorola'),
    ('LG-', 'LG'),
    ('rv:', 'RV'),
    ('HUAWEI', 'Huawei'),
    ('ALE-', 'Huawei'),
    ('-L', 'Huawei'),
    ('Blade', 'ZTE'),
    ('BLADE', 'ZTE'),
    ('Linux', 'Linux'),
    ('XT', 'Sony'),
    ('HTC', 'HTC'),
    ('ASUS', 'Asus'),
)


def dev_name(dataframe, min_count=200):
    """Derive `device_name` and `device_version` columns from `DeviceInfo`.

    `DeviceInfo` is split on '/': the first piece becomes `device_name`, the second
    piece (missing when there is no '/') becomes `device_version`. Names containing
    one of the known substrings are collapsed to a brand label, and every name that
    occurs fewer than `min_count` times is replaced by "Others".

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string `DeviceInfo` column. It is modified in place.
    min_count : int, default 200
        Names with fewer occurrences than this are relabelled "Others".

    Returns
    -------
    pandas.DataFrame
        The same `dataframe` object, with the two new columns added.
    """
    # Split once and pick the pieces by position; positional access yields a missing
    # value when a piece does not exist, so a column without any '/' no longer fails.
    pieces = dataframe['DeviceInfo'].str.split('/')
    dataframe['device_name'] = pieces.str[0]
    dataframe['device_version'] = pieces.str[1]

    for needle, brand in _DEVICE_BRAND_RULES:
        # The needles are plain substrings, so regex interpretation is switched off.
        matches = dataframe['device_name'].str.contains(needle, na=False, regex=False)
        dataframe.loc[matches, 'device_name'] = brand

    # Fold rare names together; missing names are not counted and stay missing.
    counts = dataframe['device_name'].value_counts()
    rare_names = counts[counts < min_count].index
    dataframe.loc[dataframe['device_name'].isin(rare_names), 'device_name'] = "Others"
    gc.collect()

    return dataframe

In [12]:
# Log-scaled ratio of C13 to C1.
# A zero denominator produces +/-inf; infinities are mapped to NaN so the feature
# holds only finite values or missing markers.
df['C13_vs_C1'] = np.log1p(df['C13'] / df['C1']).replace([np.inf, -np.inf], np.nan)

In [13]:
# Log-scaled ratio of D1 to D15.
# A zero denominator (or a ratio of exactly -1) produces +/-inf; infinities are
# mapped to NaN so the feature holds only finite values or missing markers.
df['D1_vs_D15'] = np.log1p(df['D1'] / df['D15']).replace([np.inf, -np.inf], np.nan)

In [14]:
# Log-scaled transaction amount relative to the mean amount over the whole frame.
# The mean is taken on the raw NumPy array, so a single NaN amount would make it NaN.
df['TransactionAmt_vs_mean'] = np.log1p(df['TransactionAmt'] / df['TransactionAmt'].values.mean())

In [16]:
# Log-scaled ratio of the N3 count feature to the N1 count feature.
df['N3_vs_N1'] = np.log1p(df['N3'].div(df['N1']))