### Introduction 

The main goal of the following class is to make feature engineering efficient. Because this dataset consists of many log records per user, some features are constant for a given user, while others vary from record to record. 

Firstly, during feature engineering we have to be very careful to avoid any kind of leakage, which can appear accidentally while preparing the data. Secondly, we need to test many features in order to develop a good model. Because of the size of the data, I need to optimize the feature-engineering steps and the construction of the datasets used to fit the model, since feature processing takes a lot of time.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

In [2]:
run data.py

In [None]:
run Feature_processing.py

In [3]:
# Load the record-level table and the target table from local pickle files.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
DATA = pd.read_pickle('../Data_original/All_Data.pkl')
targets = pd.read_pickle("../Data_original/Targets.pkl")

In [4]:
class Data():
    """Build per-ID feature tables from record-level logs, split into train and test.

    On construction the raw records are filtered and ordered by ID, and the
    train/test split is computed once.  Each ``transform_*`` method then derives
    one feature family and, when ``save=True``, hands the train/test pair to
    ``save_features``.

    The class relies on helpers that are not defined here and must already be
    available in the namespace: ``separate_data``, ``sort_indices``,
    ``adaptive_boards``, ``construct_sentences`` and ``save_features``.
    """

    # Per-ID aggregates produced by the ``transform_*_stats`` methods, in column order.
    _STATS = ('mean', 'std', 'min', 'max', 'median')

    def __init__(self, data, target):
        """Clean ``data``, keep the ``ID``/``target`` columns of ``target`` and split.

        Args:
            data: record-level DataFrame; must contain at least ``ID``,
                ``KORREKTUR`` and ``BETRAG``.
            target: DataFrame containing the columns ``ID`` and ``target``.
        """
        self.data = self._clear_data(data)
        self.target = target[['ID', 'target']]
        self._get_train_test_split()

    # ------------------------------------------------------------------
    # Cleaning and ordering
    # ------------------------------------------------------------------
    def _clear_data(self, data):
        """Drop rows with negative ``KORREKTUR`` or ``BETRAG`` and order rows by ID.

        Prints the shape before and after cleaning and returns the cleaned frame.
        """
        message = 'Input data of shape ' + str(data.shape) + ' converted to data of shape '

        # A single combined mask: chaining two boolean selections makes pandas
        # re-align the second mask against the already filtered frame.
        keep = (data.KORREKTUR >= 0.) & (data.BETRAG >= 0.)
        data = data[keep]

        data = self._is_sorted_data_by_id(data)  # sorts by ID when necessary

        message += str(data.shape)
        print(message)
        return data

    def _separate_into_train_test(self, data):
        """Return the rows of ``data`` whose ``ID`` is in the train / test ID sets."""
        train, test = data[data.ID.isin(self.id_train)], data[data.ID.isin(self.id_test)]
        return train, test

    @staticmethod
    def _id_numbers(data):
        """Return, as an array, the integer parsed from each ID after its first 3 characters."""
        return data.ID.apply(lambda dt: int(dt[3:])).values

    def _is_sorted_data_by_id(self, data):
        """Return ``data`` ordered by numeric ID, sorting it only if it is not already.

        Despite the name this does not return a boolean: it returns the
        (possibly re-ordered) frame.
        """
        id_numbers = self._id_numbers(data)

        # Any adjacent pair in descending order means the frame is unsorted.
        if np.any(id_numbers[:-1] > id_numbers[1:]):
            print('Data is not sorted')
            data = self._sort_data_by_id(data, id_numbers)
            print('Data is sorted now')
        return data

    def _sort_data_by_id(self, data, id_numbers=None):
        """Return a copy of ``data`` ordered by the numeric part of ``ID``.

        Args:
            data: DataFrame with an ``ID`` column.
            id_numbers: optional pre-computed numeric IDs (one per row, in row
                order); parsed from ``data`` when omitted.

        The input frame is left untouched, and a stable sort is used so rows
        sharing an ID keep their original relative order.
        """
        if id_numbers is None:
            id_numbers = self._id_numbers(data)
        order = np.argsort(id_numbers, kind='mergesort')
        return data.iloc[order]

    # ------------------------------------------------------------------
    # Train / test split
    # ------------------------------------------------------------------
    def _get_train_test_split(self, test_ID_size=0.2, test_Time_size=0.25):
        """Split ``self.data`` with ``separate_data`` and cache the result.

        Sets ``id_train``/``id_test`` (via ``sort_indices``), ``df_id_train``/
        ``df_id_test`` (one-column ``ID`` frames) and ``X_train``/``X_test``.

        Args:
            test_ID_size: forwarded to ``separate_data``; must lie in (0, 1).
            test_Time_size: forwarded to ``separate_data``; must lie in (0, 1).

        Returns:
            The ``(train, test)`` pair returned by ``separate_data``.

        Raises:
            AssertionError: if either size is outside the open interval (0, 1).
        """
        # Both sizes must be valid; the previous `or` accepted one bad value.
        assert 0 < test_ID_size < 1 and 0 < test_Time_size < 1, 'Test size must be between 0 and 1'

        train, test = separate_data(self.data, self.target,
                                    test_ID_size=test_ID_size,
                                    test_Time_size=test_Time_size)

        self.id_train = sort_indices(train.ID.unique())
        self.id_test = sort_indices(test.ID.unique())

        self.df_id_train = pd.DataFrame(self.id_train, columns=['ID'])
        self.df_id_test = pd.DataFrame(self.id_test, columns=['ID'])

        self.X_train = train
        self.X_test = test

        return train, test

    # ------------------------------------------------------------------
    # TF-IDF helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _to_sparse_frame(matrix, columns):
        """Wrap a scipy sparse matrix in a sparse-backed DataFrame with zero as fill value.

        ``pd.SparseDataFrame`` was removed in pandas 1.0, so the sparse accessor
        is used when available and the legacy class only as a fallback.
        """
        if hasattr(pd.DataFrame, 'sparse'):
            return pd.DataFrame.sparse.from_spmatrix(matrix, columns=columns)
        return pd.SparseDataFrame(matrix, columns=columns).fillna(0)

    def fit_transform_tf_idf(self, train_sent, test_sent, name):
        """Fit TF-IDF on the train sentences and transform both train and test.

        Args:
            train_sent: documents used to fit and transform.
            test_sent: documents that are only transformed (no fitting on test).
            name: prefix for the generated columns (``<name>_0``, ``<name>_1``, ...).

        Returns:
            ``(df_train, df_test)``: the TF-IDF columns placed side by side with
            ``df_id_train`` / ``df_id_test``.  The concatenation aligns on the
            index, so the sentences are presumably expected in the same order as
            ``id_train`` / ``id_test`` -- verify against ``construct_sentences``.
        """
        # NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens
        # of two or more word characters, so single-character tokens would be
        # ignored -- confirm against what construct_sentences emits.
        model_tf_idf = TfidfVectorizer()
        model_tf_idf.fit(train_sent)
        X_train, X_test = model_tf_idf.transform(train_sent), model_tf_idf.transform(test_sent)

        names = [name + '_' + str(i) for i in range(X_train.shape[1])]

        df_train = pd.concat([self.df_id_train, self._to_sparse_frame(X_train, names)], axis=1)
        df_test = pd.concat([self.df_id_test, self._to_sparse_frame(X_test, names)], axis=1)

        return df_train, df_test

    def _transform_column_tf_idf(self, column, prefix, feature_name, save):
        """TF-IDF features from the per-ID sentences built over ``column``."""
        train = construct_sentences(self.X_train, column, dropna=False)
        test = construct_sentences(self.X_test, column, dropna=False)

        train, test = self.fit_transform_tf_idf(train, test, prefix)

        if save:
            save_features(train, test, feature_name)

    def _transform_histogram(self, column, token_column, prefix, name, save, **board_kwargs):
        """Bin ``column``, store the bin id as a string token and TF-IDF the tokens.

        Bin edges come from ``adaptive_boards`` applied to the sorted values of
        the whole cleaned data set; ``board_kwargs`` are forwarded to it.  The
        token column ``token_column`` is added to ``X_train`` and ``X_test``.
        """
        bins, _ = adaptive_boards(self.data[column].sort_values().values, **board_kwargs)

        def get_num(value):
            # Index of the first edge (last edge excluded) above `value`, minus
            # one; values below bins[0] therefore map to -1.
            for i in range(len(bins) - 1):
                if bins[i] > value:
                    return i - 1
            # No edge above the value: sentinel id used as the overflow bucket.
            return len(bins)

        self.X_train[token_column] = self.X_train[column].apply(lambda dt: str(get_num(dt)))
        self.X_test[token_column] = self.X_test[column].apply(lambda dt: str(get_num(dt)))

        self._transform_column_tf_idf(token_column, prefix, name, save)

    def _id_stats(self, column, prefix):
        """Per-ID mean/std/min/max/median of ``column`` as ``<prefix>_<stat>`` columns.

        The standard deviation of single-record IDs is undefined and is set to 0.
        """
        stats = self.data.groupby('ID')[column].agg(list(self._STATS))
        stats['std'] = stats['std'].fillna(0)
        stats.columns = [prefix + '_' + stat for stat in self._STATS]
        return stats.reset_index()

    def _split_and_save(self, features, save, name):
        """Split a per-ID feature table into train/test and optionally save it."""
        train, test = self._separate_into_train_test(features)
        if save:
            save_features(train, test, name)

    # ------------------------------------------------------------------
    # Feature families
    # ------------------------------------------------------------------
    def simple_features(self, save=False):
        """Per-ID features taken from the first record of each ID.

        Keeps ``RECHNUNGSBETRAG`` (shifted so its minimum becomes 1, then
        log-transformed), ``ALTER``, ``GESCHLECHT`` and ``VERSICHERUNG``.
        """
        columns = ['ID', 'RECHNUNGSBETRAG', 'ALTER', 'GESCHLECHT', 'VERSICHERUNG']
        data = self.data.drop_duplicates(subset=['ID'])[columns].reset_index(drop=True)

        amount = data['RECHNUNGSBETRAG']
        data['RECHNUNGSBETRAG'] = np.log(amount - amount.min() + 1)

        self._split_and_save(data, save, 'simple_features')

    def transform_betrag_histogram(self, save=False, name='betrag_histogram'):
        """TF-IDF over binned ``BETRAG`` values; saved under ``name`` when ``save``."""
        self._transform_histogram('BETRAG', 'b_hist', 'betrag', name, save)

    def transform_faktor_histogram(self, save=False, name='faktor_histogram'):
        """TF-IDF over binned ``FAKTOR`` values (``N=25``); saved under ``name`` when ``save``."""
        self._transform_histogram('FAKTOR', 'f_hist', 'faktor', name, save, N=25)

    def transform_anzahl_histogram(self, save=False, name='anzahl_histogram'):
        """TF-IDF over binned ``ANZAHL`` values (``N=80``); saved under ``name`` when ``save``."""
        self._transform_histogram('ANZAHL', 'a_hist', 'anzahl', name, save, N=80)

    def transform_faktor_stats(self, save=False, name='faktor_stats'):
        """Per-ID statistics of ``FAKTOR``; returns the table covering all IDs."""
        faktor_all = self._id_stats('FAKTOR', 'FAKTOR')
        self._split_and_save(faktor_all, save, name)
        return faktor_all

    def transform_typ_stats(self, save=False, name='typ_stats'):
        """Per-ID statistics of ``TYP`` (missing values replaced by -1 in ``self.data``)."""
        # Explicit assignment instead of a chained in-place fillna, which does
        # not reach the frame under pandas copy-on-write.
        self.data['TYP'] = self.data['TYP'].fillna(-1)

        typ_all = self._id_stats('TYP', 'TYP')
        self._split_and_save(typ_all, save, name)
        return typ_all

    def transform_betrag_stats(self, save=False, name='betrag_stats'):
        """Per-ID statistics of the log of ``BETRAG / ANZAHL``.

        The ratio is shifted so its minimum becomes 1 before taking the log, and
        is stored on ``self.data`` as ``BETRAG_normed_log``.
        """
        normed_values = self.data.BETRAG / self.data.ANZAHL
        self.data['BETRAG_normed_log'] = np.log(normed_values - normed_values.min() + 1)

        betrag_all = self._id_stats('BETRAG_normed_log', 'BETRAG')
        self._split_and_save(betrag_all, save, name)
        return betrag_all

    def transform_nummer_tf_idf(self, save=False):
        """TF-IDF features over the ``NUMMER`` column."""
        self._transform_column_tf_idf('NUMMER', 'nummer', 'nummer_tfidf', save)

    def transform_nummer_kat_tf_idf(self, save=False):
        """TF-IDF features over the ``NUMMER_KAT`` column."""
        self._transform_column_tf_idf('NUMMER_KAT', 'nummer_kat', 'nummer_kat_tfidf', save)

    def transform_art_tf_idf(self, save=False):
        """TF-IDF features over the ``ART`` column."""
        self._transform_column_tf_idf('ART', 'art', 'art_tfidf', save)

    def transform_leistung_tf_idf(self, save=False):
        """TF-IDF features over the ``LEISTUNG`` column."""
        self._transform_column_tf_idf('LEISTUNG', 'leistung', 'leistung_tfidf', save)

    def fit(self):
        """Compute and save every feature family."""
        self.simple_features(save=True)
        self.transform_betrag_histogram(save=True)
        self.transform_faktor_histogram(save=True)
        self.transform_anzahl_histogram(save=True)
        self.transform_faktor_stats(save=True)
        self.transform_typ_stats(save=True)
        self.transform_betrag_stats(save=True)
        self.transform_nummer_tf_idf(save=True)
        self.transform_nummer_kat_tf_idf(save=True)
        self.transform_art_tf_idf(save=True)
        self.transform_leistung_tf_idf(save=True)

In [5]:
# Build the feature-engineering object: cleans DATA and computes the train/test split.
d = Data(DATA, targets)

Input data of shape (3275027, 17) converted to data of shape (3275022, 17)
Average length of records in Train data:  6.63
Average length of records in Test  data:  2.93
35.0% of data was dropped


In [6]:
%%time
# Compute and save every feature family; %%time reports the cell's run time.
d.fit()

HBox(children=(IntProgress(value=0, max=295974), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56179), HTML(value='')))




HBox(children=(IntProgress(value=0, max=295974), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56179), HTML(value='')))




HBox(children=(IntProgress(value=0, max=295974), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56179), HTML(value='')))




HBox(children=(IntProgress(value=0, max=295974), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56179), HTML(value='')))




HBox(children=(IntProgress(value=0, max=295974), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56179), HTML(value='')))




HBox(children=(IntProgress(value=0, max=295974), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56179), HTML(value='')))




HBox(children=(IntProgress(value=0, max=295974), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56179), HTML(value='')))


CPU times: user 17min 28s, sys: 36.1 s, total: 18min 4s
Wall time: 17min 34s


Algorithm:
    1. Randomly split ID-s of train and test into 2 parts
    2. Choose a separator to divide the past (train data) from the future (test data).
    
     All    |+|-| < Train
    data    -----
            |-|+| < Test
            
          ---------> 
             Time