# Don't Get Kicked! EDA

# Loading the data

In [None]:
import math
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
DATA_DIR = Path('/kaggle/input')

# Walk DATA_DIR recursively and echo every entry found, so the available
# input files are visible in the notebook output.
for entry in DATA_DIR.rglob('*'):
    print(entry)

In [None]:
# Folder inside DATA_DIR that holds the competition files.
data_filepath = DATA_DIR.joinpath('DontGetKicked')

# Read the training set, using the RefId column as the row index.
train_data = pd.read_csv(
    data_filepath.joinpath('training.csv'),
    index_col='RefId',
)

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# constants
RANDOM_STATE = 24  # seed passed as random_state to randomized routines
TARGET = 'IsBadBuy'  # name of the label column in train_data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Default figure size (width, height) for figures that do not pass their
# own figsize.
plt.rcParams['figure.figsize'] = (14,8)
# Apply seaborn's 'whitegrid' style to the plots that follow.
sns.set_theme(style='whitegrid')

# Check for missing values

In [None]:
# Print the column dtypes and non-null counts of the training data.
train_data.info()

In [None]:
# utility function
def missing_value_stats(dataframe):
    """Summarise the missing values of every column in *dataframe*.

    Returns a DataFrame indexed by column name and sorted by descending
    number of missing entries, with two columns:

    * ``'missing values'`` - count of NaN entries in the column
    * ``'proportion'`` - that count divided by the number of rows
    """
    n_rows = dataframe.shape[0]
    na_counts = dataframe.isna().sum().sort_values(ascending=False)

    stats_frame = na_counts.to_frame(name='missing values')
    stats_frame['proportion'] = na_counts / n_rows
    return stats_frame

In [None]:
# Summarise missing values per column, then show only the columns that
# actually contain some.
train_data_missing = missing_value_stats(train_data)
train_data_missing[train_data_missing['proportion'] > 0]

19 out of 33 columns have missing values

In [None]:
# Visualise where the missing values sit: the boolean isnull() mask is
# drawn as a heatmap, one cell per (row, column) entry of train_data.
fig, ax =  plt.subplots()

sns.heatmap(train_data.isnull(), cbar=False, ax=ax)
ax.set_title('Heatmap of missing values')
plt.show()

The missing values seem to be spread uniformly across the rows of the affected columns, rather than concentrated in one region of the data.

# Convert data to appropriate types

In [None]:
# Parse the purchase-date strings (month/day/four-digit-year) into datetimes.
train_data['PurchDate'] = pd.to_datetime(train_data.PurchDate, format='%m/%d/%Y')

In [None]:
# select feature types
def classify_features(dataframe, cardinality_threshold=5):
    """Split the columns of *dataframe* into continuous and discrete groups.

    A column counts as discrete when it has fewer than
    *cardinality_threshold* distinct values or when its dtype is
    ``object``; every remaining column counts as continuous.

    Returns a tuple ``(cardinality, continuous_cols, discrete_cols)``:
    the per-column number of distinct values (a Series) followed by the
    two alphabetically sorted lists of column names.
    """
    cardinality = dataframe.nunique()
    column_dtypes = dataframe.dtypes

    few_values = cardinality[cardinality < cardinality_threshold].index
    object_typed = column_dtypes[column_dtypes == 'object'].index

    discrete_set = set(few_values).union(object_typed)
    continuous_set = set(dataframe.columns).difference(discrete_set)

    discrete_cols = sorted(discrete_set)
    continuous_cols = sorted(continuous_set)
    return cardinality, continuous_cols, discrete_cols

In [None]:
# Automatic first-pass split of the columns by cardinality and dtype.
cardinality, continuous_cols, discrete_cols = classify_features(train_data)
# Sanity check: the two lists together have as many entries as there are columns.
assert len(train_data.columns) == len(continuous_cols + discrete_cols)

In [None]:
# Replace the automatic result with a hand-picked list of continuous columns.
# NOTE(review): 'MMRAcquisitonRetailCleanPrice' is spelled this way on
# purpose, presumably to match the dataset's column name -- confirm before
# "fixing" it.
continuous_cols = [
    'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
    'VehBCost', 'VehOdo', 'WarrantyCost', 'PurchDate'
]
# Show the number of distinct values of each selected column.
cardinality[continuous_cols]

In [None]:
# Every column that was not hand-picked as continuous is treated as discrete.
# Filter train_data.columns in order instead of taking a set difference:
# iteration order of a set of strings can change between interpreter runs,
# which would make positional lookups and plot order downstream
# non-reproducible.
continuous_lookup = set(continuous_cols)
discrete_cols = [
    name for name in train_data.columns if name not in continuous_lookup
]
# Show the number of distinct values of each discrete column.
cardinality[discrete_cols]

In [None]:
# continuous_features is a second name for the same list object as
# continuous_cols (no copy is made).
continuous_features = continuous_cols
# Work on a copy so that removing the target leaves discrete_cols intact.
discrete_features = discrete_cols.copy()
discrete_features.remove(TARGET)

# Handle missing values

In [None]:
# Names of the columns whose share of missing values exceeds the cut-off.
threshold = 0.2
above_cutoff = train_data_missing['proportion'] > threshold
missing_above = train_data_missing.loc[above_cutoff].index.tolist()
missing_above

In [None]:
# flag unknowns for columns with missing
# values beyond a threshold: every NaN in those columns is replaced by
# the literal string 'unknown', turning "missing" into its own category
train_data[missing_above] = train_data[missing_above].fillna('unknown')

In [None]:
# use median for continuous features
continuous_data = train_data[continuous_features].copy()
# per-column medians; each one is the fill value for that column's NaNs
continuous_fillna = continuous_data.median()
display(continuous_fillna)
# fillna with a Series fills each column with the value under its name
clean_continuous_data = continuous_data.fillna(continuous_fillna)

In [None]:
# Spot-check one continuous column: show rows that were missing before
# imputation side by side with the values they received.
col_idx = 2
pre = continuous_data.iloc[:, col_idx]
post = clean_continuous_data.iloc[:, col_idx]
was_missing = pre.isna()
pd.concat([pre[was_missing], post[was_missing]], axis=1).head()

In [None]:
# use mode for discrete features
discrete_data = train_data[discrete_features].copy()

# DataFrame.mode() returns one row per tied mode (shorter columns are
# padded with NaN), so it can have more than one row. Always take the
# first row: that yields a Series with one fill value per column, whereas
# passing a multi-row DataFrame to fillna would align on row labels and
# leave most gaps unfilled.
discrete_fillna = discrete_data.mode().iloc[0]
display(discrete_fillna)
clean_discrete_data = discrete_data.fillna(discrete_fillna)

In [None]:
# Spot-check one discrete column: show rows that were missing before
# imputation side by side with the values they received.
col_idx = 4
pre = discrete_data.iloc[:, col_idx]
post = clean_discrete_data.iloc[:, col_idx]
was_missing = pre.isna()
pd.concat([pre[was_missing], post[was_missing]], axis=1).head()

In [None]:
# swap variables
# keep the pre-imputation frame available under the name `data`
data = train_data.copy()
# rebuild train_data column-wise from the target plus the two imputed frames
train_data = pd.concat([
    data[TARGET], clean_continuous_data, clean_discrete_data
], axis=1)

# recompute the missing-value summary on the rebuilt frame; the filter
# below lists any column that still contains NaN
train_data_missing = missing_value_stats(train_data)
train_data_missing.loc[train_data_missing.proportion > 0]

# Continuous features

In [None]:
# Summary statistics of the continuous features, shown `slide` columns
# at a time.
slide = 4

for start in range(0, len(continuous_features), slide):
    cols = continuous_features[start:start + slide]
    display(train_data[cols].describe())

In [None]:
# Histogram of every continuous feature, split by the target class,
# laid out on a grid with `cols` plots per row.
cols = 2
rows = math.ceil(len(continuous_features)/cols)
# squeeze=False keeps `axes` two-dimensional even when the grid has a
# single row, so the same indexing works for any number of features.
fig, axes = plt.subplots(rows, cols, figsize=(14, 8//cols*rows),
                         squeeze=False)

# axes.flat walks the grid row by row, matching the (i//cols, i%cols) order.
for ax, col in zip(axes.flat, continuous_features):
    sns.histplot(data=train_data, x=col, hue=TARGET, element='step', ax=ax)
    ax.set_title(f'Histogram of {col}', y=0.88)

# Hide grid cells left over when the feature count is not a multiple of cols.
for ax in axes.flat[len(continuous_features):]:
    ax.set_visible(False)

# Adjust the spacing only after everything is drawn, so the layout
# accounts for the real tick labels and titles.
fig.tight_layout()
plt.show()

In [None]:
from scipy import stats

In [None]:
# One-way ANOVA of each continuous feature across the target classes.
categories = train_data[TARGET].dropna().unique()

anova_results = []
for col in continuous_features:
    per_class = train_data.groupby(TARGET)[col]
    samples = [per_class.get_group(category) for category in categories]

    f_stat, p_value = stats.f_oneway(*samples)
    anova_results.append([col, f_stat, p_value])

# One row per feature, most significant (smallest p-value) first.
anova_df = (
    pd.DataFrame(anova_results,
                 columns=['feature', 'F-statistic', 'p-value'])
    .sort_values('p-value')
    .set_index('feature')
)
anova_df

In [None]:
# Keep the features whose ANOVA p-value falls below the significance level.
threshold = 0.01
significant_anova = anova_df.loc[anova_df['p-value'] < threshold]

n_significant = len(significant_anova)
n_insignificant = len(anova_df) - n_significant
print(f"Features with significant ANOVA p-value: {n_significant}")
print(f"Features with insignificant ANOVA p-value: {n_insignificant}")

# Discrete features

In [None]:
# Summary statistics of the discrete columns, shown `slide` columns at a time.
slide = 8

for start in range(0, len(discrete_cols), slide):
    cols = discrete_cols[start:start + slide]
    display(train_data[cols].describe())

In [None]:
# helper function
def cramers_corrected_stat(confusion_matrix):
    """Calculate the bias-corrected Cramer's V for two categorical variables.

    Uses the correction from Wicher Bergsma, "A bias-correction for
    Cramer's V and Tschuprow's T", Journal of the Korean Statistical
    Society 42 (2013): 323-328.

    Parameters
    ----------
    confusion_matrix
        Contingency table of observed counts. It must support
        ``.sum().sum()`` and ``.shape`` (as a DataFrame does).

    Returns
    -------
    float
        The corrected statistic. Degenerate tables, where the statistic
        is undefined (a single row or column, or at most one
        observation), yield ``0.0`` instead of NaN.
    """
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape

    # The correction terms divide by (n - 1).
    if n <= 1:
        return 0.0

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)

    # A variable with a single category makes the denominator zero;
    # report "no association" rather than dividing by zero.
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

In [None]:
# Chi-square test of independence and Cramer's V of every discrete
# feature against the target.
chi2_results = []

for col in discrete_features:
    # contingency table of feature value vs. target class
    contingency = pd.crosstab(train_data[col], train_data[TARGET])

    chi2_stat, p_value, _dof, _expected = stats.chi2_contingency(contingency)
    cramers_v = cramers_corrected_stat(contingency)

    chi2_results.append([col, chi2_stat, p_value, cramers_v])

# One row per feature, strongest association first.
chi2_df = (
    pd.DataFrame(
        chi2_results,
        columns=['feature', 'Chi-square statistic', 'p-value', "Cramer's V"],
    )
    .sort_values("Cramer's V", ascending=False)
    .set_index('feature')
)
chi2_df

In [None]:
# Keep the features whose Cramer's V exceeds the chosen cut-off.
threshold = 0.05
significant_chi2 = chi2_df.loc[chi2_df["Cramer's V"] > threshold]

n_high = len(significant_chi2)
n_low = len(chi2_df) - n_high
print(f"Features with high Cramer's V correlation: {n_high}")
print(f"Features with low Cramer's V correlation: {n_low}")

In [None]:
# Discrete features with fewer distinct values than the cut-off.
threshold = 20
below_cutoff = cardinality < threshold
among_discrete = cardinality.index.isin(discrete_features)
low_cardinality = cardinality[below_cutoff & among_discrete]
low_cardinality

In [None]:
# Features that pass the Cramer's V cut-off and are also low-cardinality.
# Walking significant_chi2.index (instead of intersecting two sets) keeps
# the order of that frame and makes the result, and the plot order that
# depends on it, identical from run to run.
low_cardinality_names = set(low_cardinality.index)
chi2_low_cardinality = [
    feature for feature in significant_chi2.index
    if feature in low_cardinality_names
]
chi2_low_cardinality

In [None]:
# Count plot of every selected low-cardinality feature, laid out on a
# grid with `cols` plots per row.
cols = 2
# At least one row, so an empty selection still yields a valid (blank) figure.
rows = max(1, math.ceil(len(chi2_low_cardinality)/cols))
# squeeze=False keeps `axes` two-dimensional even when the grid has a
# single row, so the same iteration works for any number of features.
fig, axes = plt.subplots(rows, cols, figsize=(14, 8//cols*rows),
                         squeeze=False)

# axes.flat walks the grid row by row, matching the (i//cols, i%cols) order.
for ax, col in zip(axes.flat, chi2_low_cardinality):
    sns.countplot(data=train_data, x=col, ax=ax)
    ax.set_title(f'Count plot of {col}', y=0.88)
    # aesthetics: tilt the labels when there are many categories, without
    # rewriting the label text itself
    if len(ax.get_xticklabels()) > 5:
        ax.tick_params(axis='x', labelrotation=15)

# Hide grid cells left over when the feature count is not a multiple of cols.
for ax in axes.flat[len(chi2_low_cardinality):]:
    ax.set_visible(False)

# Adjust the spacing only after everything is drawn, so the layout
# accounts for the rotated tick labels and titles.
fig.tight_layout()
plt.show()

In [None]:
# Count plot of every selected low-cardinality feature, split by the
# target class, laid out on a grid with `cols` plots per row.
cols = 2
# At least one row, so an empty selection still yields a valid (blank) figure.
rows = max(1, math.ceil(len(chi2_low_cardinality)/cols))
# squeeze=False keeps `axes` two-dimensional even when the grid has a
# single row, so the same iteration works for any number of features.
fig, axes = plt.subplots(rows, cols, figsize=(14, 8//cols*rows),
                         squeeze=False)

# axes.flat walks the grid row by row, matching the (i//cols, i%cols) order.
for ax, col in zip(axes.flat, chi2_low_cardinality):
    sns.countplot(data=train_data, x=col, hue=TARGET, ax=ax)
    ax.set_title(f'Count plot of {col} by {TARGET}',
                 y=0.88)
    # aesthetics: tilt the labels when there are many categories, without
    # rewriting the label text itself
    if len(ax.get_xticklabels()) > 5:
        ax.tick_params(axis='x', labelrotation=15)

# Hide grid cells left over when the feature count is not a multiple of cols.
for ax in axes.flat[len(chi2_low_cardinality):]:
    ax.set_visible(False)

# Adjust the spacing only after everything is drawn, so the layout
# accounts for the rotated tick labels and titles.
fig.tight_layout()
plt.show()

# Mutual information

In [None]:
# Separate the target from the predictors. PurchDate is left out of the
# feature matrix handed to the MI scoring function.
y = train_data[TARGET].copy()
X = train_data.drop(columns=[TARGET, 'PurchDate'])

# Boolean mask over X's columns marking the discrete features.
is_discrete = X.columns.isin(discrete_features)

In [None]:
# Columns stored with the object dtype are the ones that need encoding.
dtypes = X.dtypes
object_mask = (dtypes == 'object').to_numpy()
categorical_features = X.columns[object_mask].tolist()

# Discrete features that are not object-typed.
numeric_discrete = list(set(discrete_features) - set(categorical_features))
print(f"Numeric discrete features: \n{numeric_discrete}\n")
print(f"Categorical features: \n{categorical_features}\n")

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# label encode the categories in discrete data
# categories not seen during fit are mapped to the sentinel value -10
encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                         unknown_value=-10)
categorical = X[categorical_features]
categorical = encoder.fit_transform(categorical, y)
# write the encoded values back over the original object columns
X[categorical_features] = categorical
X[categorical_features].head()

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Mutual information between every feature and the target.
mi_scores = mutual_info_classif(
    X,
    y,
    discrete_features=is_discrete,
    random_state=RANDOM_STATE,
)

# Label the scores with their feature names, highest score first.
mutual_info = (
    pd.Series(mi_scores, index=X.columns, name='mutual_info')
    .sort_values(ascending=False)
)
nonzero_mutual_info = mutual_info[mutual_info != 0]

n_nonzero = len(nonzero_mutual_info)
n_zero = len(mutual_info) - n_nonzero
print(f'Number of features with non-zero MI score: {n_nonzero}')
print(f'Number of features with zero MI score: {n_zero}')

In [None]:
# nonzero_mutual_info is sorted in descending order, so head() holds the
# highest scores and tail() the lowest non-zero ones.
print(f'Top 5: \n{nonzero_mutual_info.head()}\n')
print(f'Bottom 5: \n{nonzero_mutual_info.tail()}\n')

In [None]:
# Horizontal bar chart of the ten highest mutual-information scores.
fig, ax = plt.subplots()
# NOTE(review): this rebinds `data`, which an earlier cell used for the
# pre-imputation copy of train_data -- confirm nothing later needs that copy.
data = nonzero_mutual_info.head(10)

sns.barplot(x=data.values, y=data.index, ax=ax)
ax.set_title('Mutual information scores')
plt.show()

In [None]:
# Names of the ten features with the highest mutual-information scores.
list(nonzero_mutual_info.head(10).index)