In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def load_obj(filepath):
    """Deserialize and return the Python object pickled at ``filepath``.

    The file is opened in binary mode and is closed again before returning.
    Note: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(filepath, 'rb') as handle:
        return pickle.load(handle)

In [None]:
# Pickled lookup objects from the categorical-encoders dataset, loaded in the
# order: category encoder, customer -> id mapping, id -> customer mapping.
cat_encoder, customer2id, id2customer = (
    load_obj("../input/amex-datasetcategorical-encoders/cat_encoder.pkl"),
    load_obj("../input/amex-datasetcategorical-encoders/customer2id.pkl"),
    load_obj("../input/amex-datasetcategorical-encoders/id2customer.pkl"),
)

In [None]:
# Precomputed EDA tables (record counts and NA counts) plus the training labels.
count_df = pd.read_pickle("../input/amex-eda-data/all_counts_df.pkl")
na_df = pd.read_pickle("../input/amex-eda-data/all_na_df.pkl")
train_label = pd.read_csv("../input/amex-default-prediction/train_labels.csv")

max_records_per_customer = count_df["num_records"].max()
print("max number of observation per customer:", max_records_per_customer)

In [None]:
# Replace the raw customer_ID strings with their encoded ids (via customer2id)
# and index the labels by that id. The guard makes the cell safe to re-run:
# once 'customer_ID' has become the index, indexing it as a column would raise
# KeyError.
if 'customer_ID' in train_label.columns:
    train_label = (
        train_label
        .assign(customer_ID=train_label['customer_ID'].apply(lambda v: customer2id[v]))
        .set_index('customer_ID')
    )

# Attach the label to the NA-count table. Skip the merge when 'target' is
# already present so a re-run does not create target_x / target_y duplicates.
if 'target' not in na_df.columns:
    na_df = na_df.merge(train_label, on='customer_ID')

na_df.head()

In [None]:
# Share of missing observations per feature, highest first.
# NOTE(review): the denominator assumes every customer has 13 records
# (presumably the maximum number of observations per customer); customers with
# fewer records make this an underestimate -- confirm against
# count_df.num_records.
MAX_RECORDS_PER_CUSTOMER = 13
num_customers = len(na_df)

# 'target' is the label merged in from train_labels, not a feature, so it must
# not appear in the missing-value statistics.
feature_na_counts = na_df.drop(columns='target', errors='ignore').sum()

na_stat_df = (
    (feature_na_counts / num_customers / MAX_RECORDS_PER_CUSTOMER)
    .rename_axis('colname')
    .reset_index(name='percent_na')
    .sort_values('percent_na', ascending=False)
)

na_stat_df.head()

In [None]:
# Count the features whose missing share exceeds 0.3, then summarise the shares.
high_na_mask = na_stat_df['percent_na'] > 0.3
print("Number of columns with 30% missing values:", int(high_na_mask.sum()))
print()
na_stat_df['percent_na'].describe()

In [None]:
# Histogram of the per-feature missing share, drawn on an explicit Axes.
_, hist_ax = plt.subplots()
hist_ax.set_title("distribution of the percent of missing values in the features.")
sns.histplot(na_stat_df['percent_na'], bins=10, ax=hist_ax)
plt.show()

In [None]:
# Bar chart of the 35 features with the highest missing share.
_, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=na_stat_df.head(35), x='colname', y='percent_na', ax=ax)

ax.set_yticks(np.arange(0, 1.0, 0.1))
# Tilt the feature names so neighbouring labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

Let's pick the features with the most missing values (those with more than 30% missing) and check how their missingness relates to the target.

In [None]:
# Features whose share of missing values exceeds 0.3.
missing_columns = na_stat_df.loc[na_stat_df['percent_na'] > 0.3, 'colname'].tolist()

data = []
for colname in missing_columns:
    # Rows with at least one missing observation for this feature.
    rows_with_na = na_df[na_df[colname] > 0]
    total_na = rows_with_na[colname].sum()

    # Fraction of this feature's missing observations that falls in each
    # target class.
    share_by_target = (
        rows_with_na.groupby('target')[[colname]].sum() / total_na
    ).reset_index()

    # to_numpy() upcasts each (target, share) pair to a common dtype, which is
    # the same upcast a row-wise iterrows() walk applies.
    data.extend(
        {'colname': colname, 'target': target, 'v': v}
        for target, v in share_by_target[['target', colname]].to_numpy()
    )

In [None]:
# One row per feature, one column per target class, holding the share of the
# feature's missing observations that belongs to that class.
missing_stat_df = (
    pd.DataFrame.from_dict(data)
    .pivot(index='colname', columns='target', values='v')
    .set_axis(['target_0', 'target_1'], axis=1)
)
missing_stat_df.head()

In [None]:
# Put the per-target shares next to each feature's overall missing share.
pd.merge(missing_stat_df, na_stat_df, on='colname').sort_values(by='percent_na')

1. It looks like we can discard a few features with a high percentage of missing values.
2. The train target ratio is a 75/25 split between target 0 and target 1.
3. For feature D_77, if we pick all NA values, 66% belong to target 0 and 34% to target 1.
   That looks significantly different from the overall distribution of (75, 25).

# Feature columns

In [None]:
%%time
# Load the precomputed feature table and preview its first rows.
feature_df = pd.read_pickle("../input/amex-eda-data/all_features_df.pkl")
feature_df.head()

In [None]:
# Keep every na_df column that is neither heavily missing nor the label, in
# na_df's own column order, and map it to its "<name>_mean" feature column.
# A set difference would give an order that changes between kernel runs, and
# with it every table derived from this list.
excluded_columns = set(missing_columns) | {'target'}
remaining_feat_columns = [
    colname + "_mean"
    for colname in na_df.columns
    if colname not in excluded_columns
]

print("number of remaning features:", len(remaining_feat_columns))

In [None]:
data = []

for colname in remaining_feat_columns:
    # Summary statistics are computed on the observed (non-missing) values only.
    s = feature_df[colname].dropna()
    q01, q90, q99 = np.quantile(s, [0.01, 0.9, 0.99])
    data.append({
        'feat_name': colname,
        'avg': np.mean(s),
        'std': np.std(s),

        'feat_min': np.min(s),
        'feat_q01': q01,
        'feat_q90': q90,
        'feat_q99': q99,
        'feat_max': np.max(s)
    })

df = pd.DataFrame.from_dict(data)
# How far each extreme lies beyond its 1st / 99th percentile, as an absolute
# ratio. Both ratios use the same small epsilon so that a zero percentile does
# not produce inf/NaN, and abs() so that sign differences between the extreme
# and the percentile cannot hide an outlier from the `> threshold` filters.
df['r_max'] = np.abs(df['feat_max'].div(df['feat_q99'] + 1e-9))
df['r_min'] = np.abs(df['feat_min'].div(df['feat_q01'] + 1e-9))

df.head()

In [None]:
# Spread of the per-feature means.
df['avg'].describe()

In [None]:
# Spread of the per-feature standard deviations.
df.loc[:, 'std'].describe()

In [None]:
# Histogram of the per-feature means.
_, ax = plt.subplots()
ax.set_title("distribution of the mean of all the features.")
ax.hist(df['avg'], bins=100)
plt.show()

In [None]:
# Histogram of the per-feature standard deviations.
_, ax = plt.subplots()
ax.set_title("distribution of the std of all the features.")
ax.hist(df['std'], bins=100)
plt.show()

In [None]:
# Scatter of log(1 + r_max) against the feature's standard deviation.
log_r_max = np.log(1 + df['r_max'])

_, ax = plt.subplots()
ax.set_title("feature relation between the rmax vs std")
ax.scatter(log_r_max, df['std'])
plt.show()

1. Features with more variation are generally considered to be among the more important features.
2. But the figure above shows how outliers inflate the variance, so choosing features by variance alone can be misleading.

In [None]:
_, ax = plt.subplots(2, 2, figsize=(15, 5))

# One box plot per extreme statistic, filled row by row:
# (min, q01) on the top row and (max, q99) on the bottom row.
stat_names = ['feat_min', 'feat_q01', 'feat_max', 'feat_q99']
for panel_ax, stat_name in zip(ax.flat, stat_names):
    sns.boxplot(data=df, x=stat_name, ax=panel_ax)

# Each panel is titled with the statistic it shows.
for panel_ax, stat_name in zip(ax.flat, stat_names):
    panel_ax.set_title(stat_name)

plt.show()

* There are a few features whose minimum and maximum are outliers relative to their respective 1% and 99% quantiles.

In [None]:
# Spread of the min-to-q01 ratios.
df['r_min'].describe()

In [None]:
# Spread of the max-to-q99 ratios.
df['r_max'].describe()

In [None]:
# Features ordered by r_max, plotted on a log scale.
sorted_r_max = df['r_max'].sort_values()
feature_rank = np.arange(len(df))

_, ax = plt.subplots()
ax.set_title(" feat (vs) log(r_max)")
ax.set_xlabel("feat no")
ax.set_ylabel("log (r_max) ")
# The ticks are fixed before the curve is drawn, as in a pyplot-style sequence.
ax.set_yticks(np.arange(0, 10, 1.5))
ax.plot(feature_rank, np.log(sorted_r_max))
plt.show()

In [None]:
# Features ordered by r_min, plotted as log(1 + r_min).
plt.title(" feat (vs) log(r_min)")
plt.xlabel("feat no")
# The curve shows r_min, so the axis label must say r_min (not r_max).
plt.ylabel("log (1 + r_min) ")
plt.yticks(np.arange(0, 10, 1.5))
plt.plot(np.arange(len(df)), np.log(1+df.r_min.sort_values()))
plt.show()

1. There are outliers in the feature values at both the maximum and the minimum end.
2. Outliers:
   any record whose value lies more than 4x beyond the 1st / 99th percentile (i.e. r_min or r_max > 4).
3. We can clip the values of the outliers with np.clip(x, min_threshold, max_threshold).

In [None]:
# Preview the per-feature statistics table.
df.head(n=5)

Let's look at a few plots before and after value clipping.

In [None]:
# Per-feature clipping bounds: vmin is the feature's feat_q01 and vmax its
# feat_q99 from the statistics table.
feat_threshold = {
    feat_name: {'vmin': q01, 'vmax': q99}
    for feat_name, q01, q99 in zip(df['feat_name'], df['feat_q01'], df['feat_q99'])
}


In [None]:
# Shape of the sub-table of features whose r_max or r_min exceeds 100.
df.loc[df[['r_max', 'r_min']].gt(100).any(axis=1)].shape

In [None]:
# Features whose r_max or r_min exceeds 10.
outlier_mask = (df['r_max'] > 10) | (df['r_min'] > 10)
feat_names = df.loc[outlier_mask, 'feat_name'].values
print("number of features with outliers:", len(feat_names))

# For each such feature: raw histogram on the left, clipped histogram on the right.
for feat_name in feat_names:
    bounds = feat_threshold[feat_name]
    raw_values = feature_df[feat_name]

    _, ax = plt.subplots(1, 2, figsize=(12, 3))
    raw_ax, clipped_ax = ax
    raw_ax.set_title(feat_name)

    raw_ax.hist(raw_values, bins=100)
    clipped_ax.hist(np.clip(raw_values, bounds['vmin'], bounds['vmax']), bins=100)

    # Hide the ticks on both panels.
    for panel_ax in (raw_ax, clipped_ax):
        panel_ax.set_xticks([])
        panel_ax.set_yticks([])
    plt.show()
    print()

# Let's clip the features and check the mean and variance values.

In [None]:
data = []

for feat_name in remaining_feat_columns:
    vmin = feat_threshold[feat_name]['vmin']
    vmax = feat_threshold[feat_name]['vmax']

    # Drop missing values, then clamp what remains to the feature's
    # [vmin, vmax] bounds before computing the statistics.
    s = np.clip(feature_df[feat_name].dropna(), vmin, vmax)
    q01, q90, q99 = np.quantile(s, [0.01, 0.9, 0.99])

    data.append({
        'feat_name': feat_name,
        'avg': np.mean(s),
        'std': np.std(s),

        'feat_min': np.min(s),
        'feat_q01': q01,
        'feat_q90': q90,
        'feat_q99': q99,
        'feat_max': np.max(s)
    })

df = pd.DataFrame.from_dict(data)
# Absolute ratio of each extreme to its percentile. The small epsilon keeps a
# zero percentile from turning the ratio into inf/NaN.
df['r_min'] = np.abs(df['feat_min'].div(df['feat_q01'] + 1e-9))
df['r_max'] = np.abs(df['feat_max'].div(df['feat_q99'] + 1e-9))

df.head()

In [None]:
# Spread of the per-feature means.
df['avg'].describe()

In [None]:
# Spread of the per-feature standard deviations.
df.loc[:, 'std'].describe()

In [None]:
# Histogram of the per-feature means.
_, ax = plt.subplots()
ax.set_title("distribution of the mean of all the features.")
ax.hist(df['avg'], bins=100)
plt.show()

In [None]:
# Histogram of the per-feature standard deviations.
_, ax = plt.subplots()
ax.set_title("distribution of the std of all the features.")
ax.hist(df['std'], bins=100)
plt.show()

In [None]:
# Statistics table ordered from the lowest to the highest standard deviation.
df.sort_values(by='std', ascending=True)

# Let's plot the 20 features with the lowest variance and the 20 with the highest variance

In [None]:
# Sort once by standard deviation, then take the 20 names at each end.
names_by_std = df.sort_values('std')['feat_name'].values
low_variance_features = names_by_std[:20]
high_variance_features = names_by_std[-20:]

In [None]:
# Raw (left) and clipped (right) histograms for each low-variance feature.
for feat_name in low_variance_features:
    bounds = feat_threshold[feat_name]
    raw_values = feature_df[feat_name]

    _, ax = plt.subplots(1, 2, figsize=(12, 3))
    raw_ax, clipped_ax = ax
    raw_ax.set_title(feat_name)

    raw_ax.hist(raw_values, bins=100)
    clipped_ax.hist(np.clip(raw_values, bounds['vmin'], bounds['vmax']), bins=100)

    # The clipped panel keeps its x ticks so the clipped value range stays visible.
    raw_ax.set_xticks([])
    raw_ax.set_yticks([])
    clipped_ax.set_yticks([])

    plt.show()
    print()

In [None]:
# Raw (left) and clipped (right) histograms for each high-variance feature.
for feat_name in high_variance_features:
    bounds = feat_threshold[feat_name]
    raw_values = feature_df[feat_name]

    _, ax = plt.subplots(1, 2, figsize=(12, 3))
    raw_ax, clipped_ax = ax
    raw_ax.set_title(feat_name)

    raw_ax.hist(raw_values, bins=100)
    clipped_ax.hist(np.clip(raw_values, bounds['vmin'], bounds['vmax']), bins=100)

    # The clipped panel keeps its x ticks so the clipped value range stays visible.
    raw_ax.set_xticks([])
    raw_ax.set_yticks([])
    clipped_ax.set_yticks([])

    plt.show()
    print()

In [None]:
# Summary statistics for three selected features.
feature_df.loc[:, ['B_31_mean', 'D_93_mean', 'R_24_mean']].describe()

Among the low-variance features:
1. B_31: many of its values are -1.
2. Many features have their values skewed towards the extremes.
3. After a bit of cleaning, a few features show a roughly normal distribution.

1. 