In [None]:
# default_exp core

# Feature Overlap Analyzer

> This library provides methods to analyze a feature matrix and identify the data points that have overlapping feature combinations across different labels.

In [None]:
#hide
from nbdev.showdoc import *
import os, sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import pairwise_distances, pairwise_distances_argmin_min

In [None]:
def encode_feats(df, ignore_cols=None, target_col=None):
    '''
    Return a label-encoded copy of `df`; the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix to encode.
    ignore_cols : list-like of column labels, optional
        Columns to drop before encoding. Labels that are not present in `df`
        are skipped silently. `None` (the default) or an empty list drops
        nothing.
    target_col : optional
        Accepted for backward compatibility; it is currently not used.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` without `ignore_cols`, in which every object-dtype
        column has been replaced by the output of
        `LabelEncoder().fit_transform` for that column.
    '''
    df_enc = df.copy()

    # drop ignored cols (`None` default avoids a shared mutable default list)
    if ignore_cols is not None and len(ignore_cols):
        df_enc = df_enc.drop(ignore_cols, axis=1, errors='ignore')

    # label encode object-dtype cols; skip the step entirely when there are
    # none so that all-numeric frames pass straight through
    obj_cols = df_enc.columns[df_enc.dtypes == object]
    if len(obj_cols):
        df_enc[obj_cols] = df_enc[obj_cols].apply(LabelEncoder().fit_transform)

    return df_enc

In [None]:
def get_overlapping_examples(df0, df1, max_size_df0=1000, max_size_df1=100000):
    '''
    Get the points of class 0 which have at least one class 1 example with
    the same feature combination.

    Only the first `max_size_df0` rows of `df0` and the first `max_size_df1`
    rows of `df1` are compared. Two rows count as overlapping when their
    Manhattan distance is exactly 0.

    Parameters
    ----------
    df0, df1 : pandas.DataFrame
        Numeric feature matrices of class 0 and class 1 with the same columns.
    max_size_df0, max_size_df1 : int
        Maximum number of leading rows taken from `df0` / `df1`.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The rows of `df0` that have a zero-distance row in `df1`, and, in the
        same order, the `df1` row reported as nearest for each of them. Both
        frames are empty when there is no overlap or when either input has
        no rows.
    '''
    df0_all = df0.head(max_size_df0)
    df1_all = df1.head(max_size_df1)

    # scikit-learn rejects arrays with zero samples; with nothing to compare
    # there can be no overlap, so return empty slices instead of raising.
    if len(df0_all) == 0 or len(df1_all) == 0:
        return df0_all.iloc[0:0], df1_all.iloc[0:0]

    min_dist_idx, min_dist = pairwise_distances_argmin_min(df0_all.values,
                                                           df1_all.values,
                                                           metric='manhattan')
    # positions (within df0_all) whose nearest class 1 row is at distance 0
    zero_dist_pos = np.flatnonzero(min_dist == 0)

    return df0_all.iloc[zero_dist_pos], df1_all.iloc[min_dist_idx[zero_dist_pos]]

In [None]:
#hide
# data source - https://www.kaggle.com/c/cat-in-the-dat/data?select=train.csv
# Read the raw training table, then build the encoded feature frame:
# `id`, `nom_5`..`nom_9` and every `ord_*` column are dropped, and the
# remaining object-dtype columns are label-encoded by `encode_feats`.
df = pd.read_csv('data/train.csv.gz')
df_enc = encode_feats(
    df,
    ignore_cols=[
        'id',
        'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
        'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5',
    ],
)

In [None]:
# Split the encoded frame by label. The `target` column is removed from each
# half so only feature columns remain, and the row index restarts at 0.
df0 = df_enc.loc[df_enc['target'] == 0].drop(columns='target').reset_index(drop=True)
df1 = df_enc.loc[df_enc['target'] == 1].drop(columns='target').reset_index(drop=True)

df0.shape, df1.shape

((208236, 12), (91764, 12))

In [None]:
# Compare class 0 rows against class 1 rows (using the function's default
# size caps) and keep the pairs whose feature values are identical.
df0_matching, df1_matching = get_overlapping_examples(df0, df1)

In [None]:
# Preview the class 0 rows that have an identical class 1 counterpart.
df0_matching.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,day,month
21,0,0,0,0,1,2,3,4,5,1,1,11
39,0,0,1,1,0,2,5,4,0,2,1,1
45,0,0,0,1,1,2,4,4,5,1,2,2
95,0,0,1,1,1,1,2,3,5,2,2,3
97,0,1,0,0,1,1,1,1,1,0,1,12


In [None]:
# Preview the class 1 rows paired, row for row, with `df0_matching`.
df1_matching.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,day,month
56431,0,0,0,0,1,2,3,4,5,1,1,11
46975,0,0,1,1,0,2,5,4,0,2,1,1
16266,0,0,0,1,1,2,4,4,5,1,2,2
12960,0,0,1,1,1,1,2,3,5,2,2,3
12297,0,1,0,0,1,1,1,1,1,0,1,12
