### Introduction 

The main goal of the following class is an effective way of feature engineering. Since this dataset has lots of logs of users, then some features have constant values per user, but some features are variable for one user. 

Firstly, during a feature engineering we have to be very accurate to avoid any sorts of leakage, which can appear accidently, when you prepare your data. Secondly, we need to do a lot of tests of our features for developing good model. Due to large data size, I need to optimize steps of my feature engineering and constructing datasets to fit model, because time of feature processing takes lots of time

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

In [2]:
DATA = pd.read_pickle('../Data_original/All_Data.pkl')
targets = pd.read_pickle("../Data_original/Targets.pkl")

In [115]:
class Data():
    def __init__(self, data, target):
        self.data = data[data.KORREKTUR>=0.][data.BETRAG>=0.]
        self.target = target[['ID', 'target']]
        self.is_sorted_data_by_id(data)
        self.users = data.ID.unique()
        self.right_users = data[data.target==0].ID.unique()
        self.fraud_users = data[data.target==1].ID.unique()
        self.indices = self.get_indices(data.ID)
        self.get_train_test_split()
            
    def sort(self, data):
        data['ID_int'] = data.ID.apply(lambda dt: int(dt[3:]))
        data = data.sort_values('ID_int')
        return data.drop(['ID_int'], axis=1)
    
    def sort_indices(self, arr):
        arr = [[i, int(i[3:])] for i in arr]
        return [i[0] for i in sorted(arr, key=lambda dt: dt[1])]
    
    def get_train_test_split(self, test_size=0.1):
        assert 0 < test_size < 1, 'Test size must be between 0 and 1'
        
        threshold_right_users = int((1 - test_size) * len(self.right_users))
        threshold_fraud_users = int((1 - test_size) * len(self.fraud_users))
        
        train = np.concatenate([self.right_users[:threshold_right_users], 
                                self.fraud_users[:threshold_fraud_users]], axis=None)
        test  = np.concatenate([self.right_users[threshold_right_users:], 
                                self.fraud_users[threshold_fraud_users:]], axis=None)
        
        self.id_train = self.sort_indices(train) 
        self.id_test  = self.sort_indices(test) 
        
        self.df_id_train = pd.DataFrame(self.id_train, columns=['ID'])
        self.df_id_test  = pd.DataFrame(self.id_test , columns=['ID'])
        
        self.X_train = self.data[self.data.ID.isin(train)]
        self.X_test  = self.data[self.data.ID.isin(test)]
        
        return train, test
    
    def separate_into_train_test(self, data):
        train, test = data[data.ID.isin(self.id_train)], data[data.ID.isin(self.id_test)]
        return train, test
    
    def get_indices(self, list_of_id):
        current_ID = ''
        current_subset = []
        indices = []

        for index, ID_name in enumerate(list_of_id):
            if ID_name == current_ID:
                current_subset.append(index)
            else:
                if index > 0: 
                    indices.append(current_subset) # finish the subset and start new one
                current_ID = ID_name
                current_subset = [index]

        indices.append(current_subset) # for the last one

        return indices
    
    def transform_betrag_stats(self, save=False, name='betrag_stats'):
        normed_values = self.data.BETRAG / self.data.ANZAHL
        self.data['BETRAG_normed_log'] = np.log(normed_values - np.min(normed_values, 0) +  1)

        betrag_mean = self.data.groupby(['ID'])['BETRAG_normed_log'].mean()
        betrag_std = self.data.groupby(['ID'])['BETRAG_normed_log'].std().fillna(0)
        betrag_min = self.data.groupby(['ID'])['BETRAG_normed_log'].min()
        betrag_max = self.data.groupby(['ID'])['BETRAG_normed_log'].max()
        betrag_median = self.data.groupby(['ID'])['BETRAG_normed_log'].median()

        betrag_all = pd.concat([betrag_mean, betrag_std, betrag_min, betrag_max, betrag_median], axis=1, keys=
                  ['BETRAG_mean', 'BETRAG_std', 'BETRAG_min', 'BETRAG_max', 'BETRAG_median']).reset_index()
        
        if save: self.save_features(betrag_all, name)
            
        return betrag_all
        
        
    def transform_betrag_histogram(self, save=False, name='betrag_histogram'):
        
        bins, _ = self.adaptive_boards(self.data.BETRAG.sort_values().values)
        
        def get_num(value):    
            for i in range(len(bins)-1):
                if bins[i] > value:
                    return i-1    
            return len(bins)
        
        self.X_train['b_hist'] = self.X_train.BETRAG.apply(lambda dt: str(get_num(dt)))
        self.X_test ['b_hist'] =  self.X_test.BETRAG.apply(lambda dt: str(get_num(dt)))
        
        train = self.construct_sentences(self.X_train, 'b_hist', dropna=False)
        test = self.construct_sentences(self.X_test, 'b_hist', dropna=False)
        
        train, test = self.fit_transform_tf_idf(train, test, 'betrag')   
        
        #test.columns[1:] = train.columns[1:] = ['betrag_' + str(i) for i in train.columns[1:]]
        
        self.save_features2(train, test, 'betrag_histogram')
            
            
    def transform_faktor_stats(self, save=False, name='faktor_stats'):
        faktor_mean = self.data.groupby(['ID'])['FAKTOR'].mean()
        faktor_std = self.data.groupby(['ID'])['FAKTOR'].std().fillna(0)
        faktor_min = self.data.groupby(['ID'])['FAKTOR'].min()
        faktor_max = self.data.groupby(['ID'])['FAKTOR'].max()
        faktor_median = self.data.groupby(['ID'])['FAKTOR'].median()

        faktor_all = pd.concat([faktor_mean, faktor_std, faktor_min, faktor_max, faktor_median], axis=1, keys=
                  ['FAKTOR_mean', 'FAKTOR_std', 'FAKTOR_min', 'FAKTOR_max', 'FAKTOR_median']).reset_index()
    
        if save: self.save_features(faktor_all, name)

        return faktor_all
    
    def transform_typ_stats(self, save=False, name='typ_stats'):
        self.data.TYP.fillna(-1, inplace=True)
        typ_mean = self.data.groupby(['ID'])['TYP'].mean()
        typ_std = self.data.groupby(['ID'])['TYP'].std().fillna(0)
        typ_min = self.data.groupby(['ID'])['TYP'].min()
        typ_max = self.data.groupby(['ID'])['TYP'].max()
        typ_median = self.data.groupby(['ID'])['TYP'].median()

        typ_all = pd.concat([typ_mean, typ_std, typ_min, typ_max, typ_median], axis=1, keys=
                  ['TYP_mean', 'TYP_std', 'TYP_min', 'TYP_max', 'TYP_median']).reset_index()
        
        if save: self.save_features(typ_all, name)
            
        return typ_all
    
    def save_features(self, data, name):
        train, test = self.separate_into_train_test(data)
        train.to_pickle('../Features/Train/' + name + '.pkl')
        test.to_pickle('../Features/Test/' + name + '.pkl')
        
    def save_features2(self, train, test, name):
        train.to_pickle('../Features/Train/' + name + '.pkl')
        test.to_pickle('../Features/Test/' + name + '.pkl')
        
    def log_rechnungsbetrag(self):
        self.data.RECHNUNGSBETRAG = np.log(self.data.RECHNUNGSBETRAG - np.min(self.data.RECHNUNGSBETRAG, 0) +  1)

    def prepare_dataset(self):
        data = self.data.drop_duplicates(subset=['ID'])[['ID', 'RECHNUNGSBETRAG', 'ALTER', 'GESCHLECHT', 'VERSICHERUNG', 'target']].reset_index(drop=True)

        self.log_rechnungsbetrag()
        data = data.merge(self.transform_betrag_stats(), on='ID', how='inner')
        data = data.merge(self.transform_betrag_histogram(), on='ID', how='inner')
        data = data.merge(self.transform_typ_stats(), on='ID', how='inner')
        data = data.merge(self.transfrom_faktor_stats(), on='ID', how='inner')

        return data
    
    def is_sorted_data_by_id(self, data):
        list_of_id = data.ID.apply(lambda dt: int(dt[3:])).values
        bools = list_of_id[:-1] > list_of_id[1:]
                
        if np.sum(bools) > 0:
            print('Data is not sorted')
            data = self.sort(data)
            print('Data is sorted now')
            return data
        else:
            print('Data is correct')
            return data

    def construct_sentences(self, data, column_name, dropna=True):
        sentences = []
                    
        series = data[column_name]
        uniques = self.sort_indices(data.ID.unique())
        indices = self.get_indices(data.ID)

        if not dropna:
            series = series.fillna('NAN' + column_name)

        for index, ID_name in tqdm(enumerate(uniques), total=len(uniques)):
            if dropna:
                sentences.append(list(series.iloc[indices[index]].dropna()))
            else:
                sentences.append(list(series.iloc[indices[index]]))

        sentences = [' '.join(sent) for sent in sentences]

        return sentences
    
    def fit_transform_tf_idf(self, train_sent, test_sent, name):
        self.model_tf_idf = TfidfVectorizer()
        self.model_tf_idf.fit(train_sent)
        X_train, X_test = self.model_tf_idf.transform(train_sent), self.model_tf_idf.transform(test_sent) 
        names = [name + '_' + str(i) for i in range(X_train.shape[1])]
        
        df_train = pd.SparseDataFrame(X_train, columns=names).fillna(0)
        df_train = pd.concat([self.df_id_train, df_train], axis=1)
        
        df_test = pd.SparseDataFrame(X_test, columns=names).fillna(0)
        df_test = pd.concat([self.df_id_test, df_test], axis=1)
        
        return df_train, df_test

    def transform_nummer_tf_idf(self):
        train = self.construct_sentences(self.X_train, 'NUMMER', dropna=False)
        test = self.construct_sentences(self.X_test, 'NUMMER', dropna=False)

        train, test = self.fit_transform_tf_idf(train, test, 'nummer')   
        
        self.save_features2(train, test, 'nummer_tfidf')

    def transform_nummer_kat_tf_idf(self):
        train = self.construct_sentences(self.X_train, 'NUMMER_KAT', dropna=False)
        test = self.construct_sentences(self.X_test, 'NUMMER_KAT', dropna=False)

        train, test = self.fit_transform_tf_idf(train, test, 'nummer_kat')   
        
        self.save_features2(train, test, 'nummer_kat_tfidf')
        
    def transform_art_tf_idf(self):
        train = self.construct_sentences(self.X_train, 'ART', dropna=False)
        test = self.construct_sentences(self.X_test, 'ART', dropna=False)

        train, test = self.fit_transform_tf_idf(train, test, 'art')   
        
        self.save_features2(train, test, 'art_tfidf')

    def transform_leistung_tf_idf(self):
        train = self.construct_sentences(self.X_train, 'LEISTUNG', dropna=False)
        test = self.construct_sentences(self.X_test, 'LEISTUNG', dropna=False)

        train, test = self.fit_transform_tf_idf(train, test, 'leistung')   
        
        self.save_features2(train, test, 'leistung_tfidf')
        
    def adaptive_boards(self, arr):
        cur_boarder = 0
        boards = [0.]
        counter = 0
        counters = []
        N = 2000
        for cur_val in arr:
            if counter < N:
                cur_boarder = cur_val
                counter += 1
            else:
                if cur_val == cur_boarder:
                    counter += 1
                else:
                    counters.append(counter)
                    boards.append(cur_boarder)
                    cur_boarder = cur_val
                    counter = 1 

        counters.append(counter)
        boards.append(cur_boarder)
        return boards, counters
    
    def common_features(self, save=False):
        data = self.data.drop_duplicates(subset=['ID'])[['ID', 'RECHNUNGSBETRAG', 'ALTER', 'GESCHLECHT', 'VERSICHERUNG']].reset_index(drop=True)
        data.RECHNUNGSBETRAG = np.log(data.RECHNUNGSBETRAG + 1)
        
        if save: self.save_features(data, 'simple_features')
        

In [116]:
d = Data(DATA, targets)

Data is correct


In [118]:
%%time
d.transform_betrag_histogram(save=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


HBox(children=(IntProgress(value=0, max=342911), HTML(value='')))

HBox(children=(IntProgress(value=0, max=38102), HTML(value='')))

CPU times: user 11min 45s, sys: 5.04 s, total: 11min 50s
Wall time: 11min 44s


In [114]:
DATA

Unnamed: 0,ID,KORREKTUR,RECHNUNGSBETRAG,ALTER,GESCHLECHT,VERSICHERUNG,FACHRICHTUNG,NUMMER,NUMMER_KAT,TYP,ANZAHL,FAKTOR,BETRAG,ART,LEISTUNG,target
0,ID_1,0.0,330.970001,53.570385,0,1,1,A_178,AA_10,0.0,2,2.30,24.400000,,C_1,0
1,ID_1,0.0,330.970001,53.570385,0,1,1,A_1884,AA_2,0.0,1,2.30,13.410000,,C_1,0
2,ID_1,0.0,330.970001,53.570385,0,1,1,A_1,AA_2,0.0,1,2.30,10.720000,,C_1,0
3,ID_1,0.0,330.970001,53.570385,0,1,1,A_168,AA_10,0.0,1,2.50,23.020000,,C_1,0
4,ID_1,0.0,330.970001,53.570385,0,1,1,A_172,AA_10,0.0,1,2.30,24.400000,,C_1,0
5,ID_1,0.0,330.970001,53.570385,0,1,1,A_174,AA_10,0.0,1,3.50,81.599998,,C_1,0
6,ID_2,0.0,455.200012,83.382721,1,1,1,A_765,AA_13,0.0,1,1.15,2.010000,,C_6,0
7,ID_2,0.0,455.200012,83.382721,1,1,1,A_764,AA_13,0.0,1,1.15,2.010000,,C_6,0
8,ID_2,0.0,455.200012,83.382721,1,1,1,A_1257,AA_3,0.0,3,3.50,48.959999,,C_1,0
9,ID_2,0.0,455.200012,83.382721,1,1,1,A_777,AA_13,0.0,1,1.15,0.000000,,C_6,0
