In [1]:
# Pandas for managing datasets
import numpy as np
import pandas as pd

In [2]:
# math for operating numbers
import math

In [3]:
# Change the pandas display format for floats
pd.options.display.float_format = '{:,.2f}'.format

# to show complete output of a cell: eg.
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(dfp.apply(lambda x:x.unique().size))

In [4]:
# # Matplotlib for additional customization
# from matplotlib import pyplot as plt
# %matplotlib inline

In [5]:
# # Seaborn for plotting and styling
# import seaborn as sns
# #Seaborn set() to set aesthetic parameters in one step.
# sns.set() 


In [6]:
# Read dataset
df = pd.read_csv('../../datasets/homecdt_eda/application_train.csv')

In [7]:
df.shape

(307511, 122)

In [8]:
# Randomized sampling from original dataset.
# This is just for simplifying the development process
# After coding is complete, should replace all dfp-->df, and remove this cell
# Reference: https://yiidtw.github.io/blog/2018-05-29-how-to-shuffle-dataframe-in-pandas/

# A fixed random_state makes the sample identical on every Restart & Run All,
# so the outputs shown below can be reproduced.
dfp = df.sample(n=1000, random_state=42).reset_index(drop=True)
dfp.shape

(1000, 122)

In [9]:
dfp.head(3)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,337243,0,Cash loans,F,Y,Y,2,540000.0,760225.5,58963.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
1,295703,0,Cash loans,F,N,Y,0,126000.0,675000.0,21906.0,...,0,0,0,0,0.0,0.0,0.0,0.0,2.0,6.0
2,317316,0,Cash loans,M,Y,Y,1,76500.0,1125000.0,33025.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


---

# Tool: Get numerical / categorical variables (columns) from a dataframe

In [10]:
def get_num_df(data_df, unique_value_threshold: int):
    """Select the columns of ``data_df`` that are treated as numerical.

    A column is treated as numerical when it holds strictly more than
    ``unique_value_threshold`` distinct values. Distinct values are counted
    with ``Series.unique()``, so a missing value (NaN) counts as one value.

    Args:
        data_df: Source dataframe; it is not modified.
        unique_value_threshold: Largest distinct-value count that is still
            NOT treated as numerical (e.g. 3 means columns with 4 or more
            distinct values are selected).

    Returns:
        A dataframe holding only the selected columns, in their original order.
    """
    def _exceeds_threshold(column):
        # True when the column has more distinct values than the threshold.
        return len(column.unique()) > unique_value_threshold

    is_numerical = data_df.apply(_exceeds_threshold, axis=0)
    numerical_columns = data_df.columns[is_numerical]
    return data_df[numerical_columns]

def get_cat_df(data_df, unique_value_threshold: int):
    """Select the columns of ``data_df`` that are treated as categorical.

    A column is treated as categorical when it holds at most
    ``unique_value_threshold`` distinct values. Distinct values are counted
    with ``Series.unique()``, so a missing value (NaN) counts as one value.

    Args:
        data_df: Source dataframe; it is not modified.
        unique_value_threshold: Largest distinct-value count that is still
            treated as categorical (e.g. 3 means columns with 3 or fewer
            distinct values are selected).

    Returns:
        A dataframe holding only the selected columns, in their original order.
    """
    def _within_threshold(column):
        # True when the column has no more distinct values than the threshold.
        return len(column.unique()) <= unique_value_threshold

    is_categorical = data_df.apply(_within_threshold, axis=0)
    categorical_columns = data_df.columns[is_categorical]
    return data_df[categorical_columns]


In [11]:
assert get_cat_df(dfp, 3).columns.size + get_num_df(dfp, 3).columns.size == dfp.columns.size

---

## Appendix

### Tool: Getting summary dataframe

In [12]:
# might not be very useful at this point
def summary_df(data_df):
    """Build a per-column summary table for ``data_df``.

    The result stacks, row-wise, the output of ``describe(include='all')``
    followed by three extra rows:

    * ``dtypes``  - the dtype of each column
    * ``isnull``  - the number of missing values in each column
    * ``uniqAll`` - the number of distinct values in each column, counted
      with ``Series.unique()`` (so NaN counts as one value)

    Args:
        data_df: The dataframe to summarise; it is not modified.

    Returns:
        A new dataframe with one column per column of ``data_df`` and one row
        per statistic.
    """
    described = data_df.describe(include='all')
    dtype_row = data_df.dtypes.to_frame(name='dtypes').T
    null_row = data_df.isnull().sum().to_frame(name='isnull').T
    distinct_row = (
        data_df.apply(lambda col: col.unique().size)
        .to_frame(name='uniqAll')
        .T
    )
    return pd.concat([described, dtype_row, null_row, distinct_row])

def data_quality_df(data_df):
    """Build a compact data-quality table for ``data_df``.

    For every column the result reports, in this row order:

    * ``dtypes``  - the dtype of the column
    * ``uniqAll`` - the number of distinct values, counted with
      ``Series.unique()`` (so NaN counts as one value)
    * ``isnull``  - the number of missing values
    * ``count``   - the number of non-missing values, as reported by
      ``describe(include='all')``

    Args:
        data_df: The dataframe to inspect; it is not modified.

    Returns:
        A new four-row dataframe with one column per column of ``data_df``.
    """
    report = pd.concat([
        data_df.describe(include='all'),
        data_df.dtypes.to_frame(name='dtypes').T,
        data_df.isnull().sum().to_frame(name='isnull').T,
        data_df.apply(lambda col: col.unique().size).to_frame(name='uniqAll').T,
    ])
    # Select rows by label, not by position. The number of rows describe()
    # emits depends on the column types (11 for a mixed numeric/non-numeric
    # frame, 8 for an all-numeric one), so fixed positions such as 11/12/13
    # either raise IndexError or pick the wrong rows on other inputs.
    return report.loc[['dtypes', 'uniqAll', 'isnull', 'count'], :]


In [13]:
data_quality_df(dfp)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
dtypes,int64,int64,object,object,object,object,int64,float64,float64,float64,...,int64,int64,int64,int64,float64,float64,float64,float64,float64,float64
uniqAll,1000,2,2,2,2,2,6,84,464,774,...,2,2,1,2,3,3,4,11,6,12
isnull,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,132,132,132,132,132,132
count,1000.00,1000.00,1000,1000,1000,1000,1000.00,1000.00,1000.00,1000.00,...,1000.00,1000.00,1000.00,1000.00,868.00,868.00,868.00,868.00,868.00,868.00


In [14]:
summary_df(dfp)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,1000.00,1000.00,1000,1000,1000,1000,1000.00,1000.00,1000.00,1000.00,...,1000.00,1000.00,1000.00,1000.00,868.00,868.00,868.00,868.00,868.00,868.00
unique,,,2,2,2,2,,,,,...,,,,,,,,,,
top,,,Cash loans,F,N,Y,,,,,...,,,,,,,,,,
freq,,,910,627,648,698,,,,,...,,,,,,,,,,
mean,276295.62,0.10,,,,,0.46,171825.61,605110.28,27562.01,...,0.00,0.00,0.00,0.00,0.00,0.01,0.04,0.26,0.26,1.87
std,100352.70,0.29,,,,,0.75,95650.51,396188.03,14645.29,...,0.06,0.03,0.00,0.03,0.05,0.09,0.20,0.86,0.60,1.84
min,100148.00,0.00,,,,,0.00,31500.00,45000.00,4275.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
25%,194298.00,0.00,,,,,0.00,112500.00,276375.38,16713.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
50%,276804.00,0.00,,,,,0.00,157500.00,521280.00,25447.50,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
75%,360688.25,0.00,,,,,1.00,202500.00,807984.00,35607.38,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,3.00


---

### .nunique() function

In [15]:
# nunique() function excludes NaN 
# i.e. it does not consider NaN as a "value", therefore NaN is not counted as a "unique value"
dfp.nunique()

SK_ID_CURR                    1000
TARGET                           2
NAME_CONTRACT_TYPE               2
CODE_GENDER                      2
FLAG_OWN_CAR                     2
                              ... 
AMT_REQ_CREDIT_BUREAU_DAY        2
AMT_REQ_CREDIT_BUREAU_WEEK       3
AMT_REQ_CREDIT_BUREAU_MON       10
AMT_REQ_CREDIT_BUREAU_QRT        5
AMT_REQ_CREDIT_BUREAU_YEAR      11
Length: 122, dtype: int64

In [16]:
dfp.nunique() == dfp.apply(lambda x:x.unique().shape[0])

SK_ID_CURR                     True
TARGET                         True
NAME_CONTRACT_TYPE             True
CODE_GENDER                    True
FLAG_OWN_CAR                   True
                              ...  
AMT_REQ_CREDIT_BUREAU_DAY     False
AMT_REQ_CREDIT_BUREAU_WEEK    False
AMT_REQ_CREDIT_BUREAU_MON     False
AMT_REQ_CREDIT_BUREAU_QRT     False
AMT_REQ_CREDIT_BUREAU_YEAR    False
Length: 122, dtype: bool

In [17]:
dfp['AMT_REQ_CREDIT_BUREAU_YEAR'].unique().shape[0]

12

In [18]:
dfp['AMT_REQ_CREDIT_BUREAU_YEAR'].nunique()

11

In [19]:
dfp['AMT_REQ_CREDIT_BUREAU_YEAR'].unique().size

12

### .value_counts() function

In [20]:
# .value_counts() treats NaN the same way as .nunique() by default:
# NaN is not considered a value, so it is left out of the counts
# (pass dropna=False to include it).

In [21]:
dfp['AMT_REQ_CREDIT_BUREAU_YEAR'].value_counts()

0.00     241
1.00     198
2.00     172
3.00     109
4.00      70
5.00      33
6.00      27
7.00       8
8.00       7
9.00       2
13.00      1
Name: AMT_REQ_CREDIT_BUREAU_YEAR, dtype: int64

In [22]:
dfp['AMT_REQ_CREDIT_BUREAU_YEAR'].isnull().sum()

132

In [23]:
dfp['AMT_REQ_CREDIT_BUREAU_YEAR'].size

1000

In [24]:
dfp['AMT_REQ_CREDIT_BUREAU_YEAR'].value_counts().sum() + dfp['AMT_REQ_CREDIT_BUREAU_YEAR'].isnull().sum() == \
dfp['AMT_REQ_CREDIT_BUREAU_YEAR'].size

True

### 重複值

In [25]:
# Counting unique values (cf. .nunique() function, see above section)
# This code was retrieved from HT

dfp.apply(lambda x:x.unique().shape[0])

SK_ID_CURR                    1000
TARGET                           2
NAME_CONTRACT_TYPE               2
CODE_GENDER                      2
FLAG_OWN_CAR                     2
                              ... 
AMT_REQ_CREDIT_BUREAU_DAY        3
AMT_REQ_CREDIT_BUREAU_WEEK       4
AMT_REQ_CREDIT_BUREAU_MON       11
AMT_REQ_CREDIT_BUREAU_QRT        6
AMT_REQ_CREDIT_BUREAU_YEAR      12
Length: 122, dtype: int64

In [26]:
# It is the same if you write (dfp.apply(lambda x:x.unique().size))
# .all must be *called*: a bare `.all` is a bound method object, which is
# always truthy, so the assertion could never fail.
assert (dfp.apply(lambda x: x.unique().shape[0]) == dfp.apply(lambda x: x.unique().size)).all()

In [27]:
# # %timeit showed the performances are similar
# %timeit dfp.apply(lambda x:x.unique().shape[0])
# %timeit dfp.apply(lambda x:x.unique().size)