In [10]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import scorer, make_scorer, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import config
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm_notebook, tqdm, tnrange
tqdm.pandas()

from numba import jit

from sklearn.ensemble import ExtraTreesClassifier

import catboost

from scipy.stats import gmean

%load_ext autoreload
%autoreload 2

%matplotlib inline
pd.set_option('display.max_columns', 100)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Weighted Mean encoding.
# Usage:
# X_tr_mean = mean_likelihood(X_tr, var, 'is_click')
# mean_enc_var[val_idx] = X_val[var].map(X_tr_mean['enc'])
def mean_likelihood(df, cat_var, target, alpha = 0.5):
    """Smoothed (weighted) mean encoding of `target` for each level of `cat_var`.

    For every category c the encoding is

        enc_c = (P_c * n_c + P_global * alpha) / (n_c + alpha)

    where P_c is the mean of `target` inside the category, n_c is the number
    of non-null `target` values in the category and P_global is the mean of
    `target` over the whole frame. A larger `alpha` pulls the encoding
    towards the global mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both `cat_var` and `target`. It is not modified.
    cat_var : str
        Name of the categorical column to encode.
    target : str
        Name of the target column.
    alpha : float, default 0.5
        Smoothing weight given to the global mean.

    Returns
    -------
    pandas.DataFrame
        Indexed by the category values (index name `cat_var`) with a single
        column 'enc', so `result['enc']` can be passed to `Series.map`.
    """
    P_global = df[target].mean()
    # Aggregate once per category. The previous version broadcast the statistics
    # to every row, assigned them into a slice of `df` (SettingWithCopyWarning)
    # and then collapsed them again with a second groupby.
    stats = df.groupby(cat_var)[target].agg(['mean', 'count'])
    enc = (stats['mean'] * stats['count'] + P_global * alpha) / (stats['count'] + alpha)
    return enc.to_frame('enc')

In [6]:
# Label encoding.
# Usage:
# lb_dict, _ = label_enc(df_data[var])
def label_enc(x):
    """Build an integer label encoding for the distinct values of `x`.

    Parameters
    ----------
    x : object exposing `.unique()` (e.g. a pandas Series)

    Returns
    -------
    (mapping, uniques)
        `mapping` is a defaultdict that maps each distinct value to its
        position in `uniques` (order of first appearance as returned by
        `x.unique()`); `uniques` is that array of distinct values.

    Note: the default factory is `np.int32`, so looking up a value that was
    never seen returns 0 (and inserts it) -- the same code as the first
    distinct value.
    """
    uniques = x.unique()
    mapping = defaultdict(
        np.int32,
        ((value, code) for code, value in enumerate(uniques)),
    )
    return mapping, uniques

In [7]:
# One hot encoding.
# Returns numpy 2D array
def one_hot_enc(x):
    """One-hot encode the values of `x`.

    Parameters
    ----------
    x : object exposing `.unique()` and iteration (e.g. a pandas Series)

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(x), number of distinct values). Column j
        corresponds to the j-th entry of `x.unique()`; each row holds a
        single 1 in the column of its value and 0 elsewhere.
    """
    categories = x.unique()
    column_of = dict(zip(categories, range(len(categories))))
    encoded = np.zeros(shape=(len(x), len(categories)))
    # Resolve every row's column first, then set all the ones in one assignment.
    cols = np.array([column_of[value] for value in x], dtype=np.intp)
    encoded[np.arange(len(x)), cols] = 1
    return encoded

In [9]:
# Adds mean encoded features to given train, test data frame using categorical variables and a target.
# This is done using KFold data to prevent overfitting.
# Returns train, test data frames with mean encoding columns 'mean_enc_'
def add_mean_encoding(df_train, df_test, cat_vars, target, n_splits=10, random_state=1):
    """Add out-of-fold mean-encoded columns for each categorical variable.

    For every `var` in `cat_vars` a column named f'mean_enc_{var}' is added:

    * train rows are encoded out-of-fold: each row receives the encoding
      computed by `mean_likelihood` on the other folds only, so a row's own
      target never contributes to its encoding;
    * test rows are encoded with `mean_likelihood` fitted on the whole of
      `df_train`;
    * values that could not be mapped (category absent from the fitting
      data) are filled with the mean of the train encoded column.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Both frames are modified in place (columns are added) and returned.
    cat_vars : iterable of str
        Categorical columns to encode.
    target : str
        Target column in `df_train`.
    n_splits : int, default 10
        Number of KFold splits.
    random_state : int, default 1
        Seed for the shuffled KFold split.

    Returns
    -------
    (df_train, df_test)
    """
    # shuffle=True is what makes random_state meaningful; recent scikit-learn
    # raises ValueError when random_state is set while shuffle is False.
    folds = list(
        KFold(n_splits=n_splits, shuffle=True, random_state=random_state).split(df_train)
    )
    for var in cat_vars:
        enc_col = f'mean_enc_{var}'
        # Every row belongs to exactly one validation fold, so all entries are
        # overwritten below; NaN marks anything that could not be mapped.
        oof_enc = np.full(len(df_train), np.nan)
        for tr_idx, val_idx in folds:
            # KFold yields positions, not labels: use iloc so that frames
            # without a default RangeIndex are handled correctly.
            fold_map = mean_likelihood(df_train.iloc[tr_idx], var, target)['enc']
            oof_enc[val_idx] = df_train[var].iloc[val_idx].map(fold_map).values
        df_train[enc_col] = oof_enc
        df_train[enc_col] = df_train[enc_col].fillna(df_train[enc_col].mean())

        full_map = mean_likelihood(df_train, var, target)['enc']
        df_test[enc_col] = df_test[var].map(full_map)
        df_test[enc_col] = df_test[enc_col].fillna(df_train[enc_col].mean())

    return df_train, df_test

In [11]:
# Generate all pairwise interaction features between the columns,
# then remove the columns that are all 0s.
def add_pairwise_interactions(df):
    """Append all pairwise interaction columns and drop all-zero columns.

    The frame is expanded with `PolynomialFeatures(interaction_only=True,
    include_bias=False)`, i.e. the original columns followed by one column
    per pair of columns. Interaction columns are named by joining the two
    source labels with '_'. Any column (original or interaction) whose values
    are all 0 is then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric input features.

    Returns
    -------
    pandas.DataFrame
        New frame built from the transformed array; it carries a fresh
        default integer index rather than the index of `df`.
    """
    base_names = list(df.columns)
    # str() each label so frames with non-string column labels (e.g. the
    # default integer labels of a frame built from an array) can be joined.
    pair_names = ['_'.join(map(str, pair)) for pair in combinations(base_names, 2)]

    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    expanded = pd.DataFrame(poly.fit_transform(df), columns=base_names + pair_names)

    # Select by position, not by label: an interaction name can coincide with
    # an existing label, and a label-based drop would remove every column
    # sharing that label, including ones that are not all zero.
    all_zero = (expanded == 0).all().values
    return expanded.loc[:, ~all_zero]

In [12]:
# Train/test helper to combine the train and test datasets.
# Mean encodings are added to train and test separately (not on the combined data).
# Combining is needed so that columns can be normalized or new features added.
# Usage:
# helper = TrainTestHelper()
# train_test = helper.combine(train, test)
# train, test = helper.split(train_test)
class TrainTestHelper(object):
    """Stack train and test data into one object and split it back later.

    `combine` remembers how many rows came from the training data so that
    `split` can cut the combined object at the same position. Works with
    numpy arrays and with pandas objects.
    """

    def __init__(self):
        # Number of training rows recorded by the most recent combine() call;
        # None until combine() has been called.
        self.ntrain = None

    def combine(self, train, test):
        """Return `train` and `test` stacked row-wise (train first).

        Arrays are stacked with `np.vstack`; pandas objects are concatenated
        and receive a fresh 0..n-1 index.
        """
        self.ntrain = train.shape[0]
        if isinstance(train, np.ndarray):
            # np.row_stack is a deprecated alias of np.vstack.
            return np.vstack((train, test))
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent call and ignore_index=True replaces reset_index(drop=True).
        return pd.concat([train, test], ignore_index=True)

    def split(self, train_test):
        """Split a combined object back into (train, test).

        Returns None when `combine` has not been called yet. pandas results
        are copies with their index reset to 0..n-1.
        """
        if self.ntrain is None:
            return None
        if isinstance(train_test, np.ndarray):
            # Slice the first axis only so arrays of any dimensionality work.
            train = train_test[:self.ntrain]
            test = train_test[self.ntrain:]
        else:
            train = train_test.iloc[:self.ntrain].copy().reset_index(drop=True)
            test = train_test.iloc[self.ntrain:].copy().reset_index(drop=True)
        return train, test

In [56]:
# Generates new features for a classification dataset.
# If the target has C class labels and k is set to N, N*C features are
# generated: for each class c and each j in 1..N, one feature holds the sum of
# the distances from the current instance to its j nearest neighbors of class c.
# More details in
# http://davpinto.com/fastknn/articles/knn-extraction.html#understanding-the-knn-features
# This technique was used in winner solution of 
# https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/14335/1st-place-winner-solution-gilberto-titericz-stanislav-semenov
# Usage:
# newX, testX = knnExtract(X, y, test_X, k=1, holds = 5)
# where newX, testX contains additional features for train and test data respectively.
def _distance(a, b):
    return np.linalg.norm(b - a)

def _get_feat(data, X_train, y_train, class_index, k_index):
    inclass_X = X_train[y_train == class_index]
    distances = np.array([_distance(a, data) for a in inclass_X])
    sorted_distances_index = np.argsort(distances)
    nearest_index = list(sorted_distances_index[0: (k_index + 1)])
    dist = np.sum(distances[nearest_index])
    return dist

def _knn_class_features(queries, X_ref, y_ref, classes, k):
    """Build the (len(queries), len(classes) * k) KNN-distance feature block.

    Column `c * k + j` holds, for each query row, the sum of distances to its
    (j + 1) nearest reference samples labelled `classes[c]`.
    """
    n_queries = len(queries)
    feats = np.empty((n_queries, len(classes) * k))
    if n_queries == 0:
        # np.apply_along_axis cannot iterate over zero rows.
        return feats
    col = 0
    for label in classes:
        for k_index in range(k):
            feats[:, col] = np.apply_along_axis(
                _get_feat, 1, queries, X_ref, y_ref, label, k_index
            )
            col += 1
    return feats


def knnExtract(X, y, test_X=None, k = 1, holds = 5, random_state = None):
    """Generate KNN-distance features for a classification dataset.

    For each class label and each j in 1..k one feature is produced: the sum
    of distances from a sample to its j nearest neighbours of that class, so
    the output has (number of classes * k) columns, grouped by class.

    Train features are computed out-of-fold: the rows of each KFold validation
    fold are measured against the remaining folds only, so a sample is never
    its own neighbour. Test features are measured against all of `X`.

    Parameters
    ----------
    X : array-like, one sample per row
    y : array-like of labels, one per row of `X`
        Labels may be any values; classes are taken from `np.unique(y)`, so
        they do not need to be the integers 0..C-1.
    test_X : array-like or None, default None
        Optional data to compute features for, using `X`/`y` as reference.
    k : int, default 1
        Number of neighbour ranks per class.
    holds : int, default 5
        Number of KFold splits used for the train features.
    random_state : int or None, default None
        Seed for the shuffled KFold split; pass an int for reproducible
        train features.

    Returns
    -------
    (res, test_res)
        `res` has shape (len(X), n_classes * k); `test_res` has shape
        (len(test_X), n_classes * k), or is None when `test_X` is None.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Iterate over the labels that actually occur instead of assuming they
    # are exactly 0..C-1.
    classes = np.unique(y)

    res = np.empty((len(X), len(classes) * k))
    kf = KFold(n_splits = holds, shuffle = True, random_state = random_state)
    for train_index, test_index in kf.split(X):
        res[test_index] = _knn_class_features(
            X[test_index], X[train_index], y[train_index], classes, k
        )

    test_res = None
    if test_X is not None:
        test_res = _knn_class_features(np.asarray(test_X), X, y, classes, k)

    return res, test_res