# Understand Variables

Please check [Understand-Variables-in-Chinese](https://www.kaggle.com/fanzzz/understand-variables-in-chinese)

# Import Packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Show what the kernel can see: the input files first, a blank gap, then the
# working directory and its parent.
print(os.listdir("../input"))
print('\n')
for _dir in ("../working", "../"):
    print(os.listdir(_dir))

# Settings

In [None]:
### Set DataFrame display settings ###
# Allow up to 1000 columns and 1000 characters per line before pandas
# truncates or wraps the output.
pd.options.display.max_columns = 1000
pd.options.display.width = 1000
# Render floats with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

# Read Data

In [None]:
# Load every CSV table from ../input into its own DataFrame.
# Main application tables (train and test splits).
train = pd.read_csv(filepath_or_buffer='../input/application_train.csv')
test = pd.read_csv(filepath_or_buffer='../input/application_test.csv')
# Auxiliary tables that are cleaned section by section further down.
bureau = pd.read_csv(filepath_or_buffer='../input/bureau.csv')
bureau_balance = pd.read_csv(filepath_or_buffer='../input/bureau_balance.csv')
previous = pd.read_csv(filepath_or_buffer='../input/previous_application.csv')
pos = pd.read_csv(filepath_or_buffer='../input/POS_CASH_balance.csv')
installments = pd.read_csv(filepath_or_buffer='../input/installments_payments.csv')
cc = pd.read_csv(filepath_or_buffer='../input/credit_card_balance.csv')

In [None]:
### Check Data Shape ###
# One "<name>: (rows, columns)" line per loaded table, in load order.
print('\n'.join(
    f'{label} {frame.shape}'
    for label, frame in (
        ('train:', train),
        ('test:', test),
        ('bureau:', bureau),
        ('bureau_balance:', bureau_balance),
        ('previous:', previous),
        ('pos:', pos),
        ('installments:', installments),
        ('cc:', cc),
    )
))

# Merge, Concat, Append

In [None]:
train.head()

In [None]:
### Append ###
df = train.append(test)
print(df.shape)
print('Check:', train.shape[0]+test.shape[0]==df.shape[0])

In [None]:
### Concat ###
df1 = pd.concat([train,test],axis=0)
print(df1.shape)

In [None]:
### Merge ###
df2 = train.merge(test,on='SK_ID_CURR',how='outer')
print(df2.shape)

In [None]:
df2.head()

In [None]:
del df1,df2

# Basic Feature Cleaning

**First step: we apply the following simple operations**
* Replace
* Drop
* Absolute

## Define Function

In [None]:
### Define Replace Function ###
def replace(df,col,pre_value,after_value):
    """Replace ``pre_value`` with ``after_value`` in column ``col`` of ``df``.

    The DataFrame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is rewritten.
    col : column label
        Column to operate on.
    pre_value : scalar, list, dict or regex
        Value(s) to look for, forwarded to ``Series.replace``.
    after_value : scalar or list
        Replacement value(s), forwarded to ``Series.replace``.
    """
    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column: the selection may be a copy, in which case an in-place
    # replace never reaches ``df`` (pandas warns about this pattern and
    # ignores it under Copy-on-Write).
    df[col] = df[col].replace(pre_value, after_value)

In [None]:
### Define Absolute Function ###
def abs_func(df,col):
    """Overwrite column ``col`` of ``df`` with its element-wise absolute value.

    The DataFrame is modified in place; nothing is returned.
    """
    magnitudes = df[col].abs()
    df[col] = magnitudes

In [None]:
### Define Drop Function ###
def drop(df,col):
    """Remove column ``col`` from ``df`` in place.

    Nothing is returned; a missing column raises ``KeyError`` from pandas.
    """
    df.drop(labels=[col], axis=1, inplace=True)

In [None]:
### Define KDE plot Function ###
def kde_plot(df,var_name):
    """Plot a kernel density estimate of ``df[var_name]`` on a new figure.

    A 10x6 figure is opened, the density curve is drawn with seaborn, the
    x-axis is labelled with the column name, the y-axis with 'Density', and
    the title is '<var_name> Distribution'.
    """
    plt.figure(figsize=(10,6))
    column = df[var_name]
    sns.kdeplot(column)
    plt.xlabel(var_name)
    plt.ylabel('Density')
    plt.title('%s Distribution'%var_name)

In [None]:
### Define Tukey IQR Function ###
def find_outliers_tukey(x, lower_pct=10, upper_pct=90, whisker=1.5):
    """Flag values lying outside Tukey-style fences.

    The fences are ``lower - whisker * spread`` and
    ``upper + whisker * spread``, where ``lower`` and ``upper`` are the
    ``lower_pct``-th and ``upper_pct``-th percentiles of ``x`` and
    ``spread = upper - lower``.

    Note that the defaults use the 10th/90th percentiles, which is wider than
    the classic Tukey rule (25th/75th); pass ``lower_pct=25, upper_pct=75``
    for the textbook interquartile range.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to screen. NaN entries are ignored when the
        percentiles are computed and are never reported as outliers.
    lower_pct, upper_pct : float
        Percentiles (0-100) that define the reference spread.
    whisker : float
        Multiple of the spread added beyond each percentile.

    Returns
    -------
    (list, list)
        Index labels of the outliers and the corresponding values, both in
        the order they appear in ``x``.
    """
    # nanpercentile keeps a single NaN from turning both fences into NaN,
    # which would silently flag nothing.
    lower, upper = np.nanpercentile(x, [lower_pct, upper_pct])
    spread = upper - lower
    floor = lower - whisker*spread
    ceiling = upper + whisker*spread
    is_outlier = (x < floor) | (x > ceiling)
    outlier_indices = list(x.index[is_outlier])
    # Select by boolean mask rather than by label: with repeated index labels
    # (e.g. after stacking two frames) a label lookup would also return the
    # non-outlier rows that share a label with an outlier.
    outlier_values = list(x[is_outlier])
    return outlier_indices, outlier_values


## train & test

In [None]:
### CODE_GENDER ###
# Category frequencies of CODE_GENDER in the combined frame.
df['CODE_GENDER'].value_counts()

In [None]:
# Disabled: would turn the 'XNA' category into NaN via the replace() helper.
#replace(df,'CODE_GENDER','XNA',np.nan)

In [None]:
# Re-check the counts (unchanged while the replacement above stays disabled).
df['CODE_GENDER'].value_counts()

In [None]:
### AMT_INCOME_TOTAL ###
# Summary statistics before any clipping.
df['AMT_INCOME_TOTAL'].describe()

In [None]:
# List the incomes above 10,000,000 for inspection.
df.loc[df['AMT_INCOME_TOTAL']>10000000,'AMT_INCOME_TOTAL']

In [None]:
# Blank out incomes above 100,000,000.
# NOTE(review): this cut-off is 10x the one inspected above - confirm intended.
df.loc[df['AMT_INCOME_TOTAL']>100000000,'AMT_INCOME_TOTAL'] = np.nan

In [None]:
### REGION_POPULATION_RELATIVE ###
# Inspection only; the column is left untouched.
df['REGION_POPULATION_RELATIVE'].describe()

In [None]:
### DAYS_BIRTH ###
df['DAYS_BIRTH'].describe()

In [None]:
# Store the column as magnitudes (presumably recorded as negative day
# offsets - confirm against the describe() output above).
abs_func(df,'DAYS_BIRTH')

In [None]:
### DAYS_EMPLOYED ###
df['DAYS_EMPLOYED'].describe()

In [None]:
# 85th percentile of the raw column.
np.percentile(df['DAYS_EMPLOYED'],85)

In [None]:
# Frame shape, number of rows with a positive value, and number of rows
# equal to 365243.
print(df.shape)
print(len(df[df['DAYS_EMPLOYED']>0]))
print(len(df[df['DAYS_EMPLOYED']==365243]))

In [None]:
kde_plot(df,'DAYS_EMPLOYED')

In [None]:
# Turn the value 365243 into NaN (presumably a placeholder rather than a
# real duration - confirm).
replace(df,'DAYS_EMPLOYED',365243,np.nan)

In [None]:
# Statistics after removing 365243.
df['DAYS_EMPLOYED'].describe()

In [None]:
# Store the remaining values as magnitudes.
abs_func(df,'DAYS_EMPLOYED')

In [None]:
# Statistics after taking absolute values.
df['DAYS_EMPLOYED'].describe()

In [None]:
### DAYS_REGISTRATION ###
# Store the column as magnitudes.
abs_func(df,'DAYS_REGISTRATION')

In [None]:
df['DAYS_REGISTRATION'].describe()

In [None]:
### DAYS_ID_PUBLISH ###
# Store the column as magnitudes.
abs_func(df,'DAYS_ID_PUBLISH')

In [None]:
### OWN_CAR_AGE ###
df['OWN_CAR_AGE'].describe() # 91 years old car!

In [None]:
kde_plot(df,'OWN_CAR_AGE')

In [None]:
### FLAG_PHONE ###
# Remove the FLAG_PHONE column from df altogether.
drop(df,'FLAG_PHONE')

In [None]:
### REGION_RATING_CLIENT_W_CITY ###
df['REGION_RATING_CLIENT_W_CITY'].value_counts()

In [None]:
### Do NOT drop this row: it belongs to the test set!!! ###
# Disabled: would remove the rows where the rating equals -1.
#df = df[df['REGION_RATING_CLIENT_W_CITY'] != -1]

In [None]:
### ORGANIZATION_TYPE ###
df['ORGANIZATION_TYPE'].value_counts()

In [None]:
### EXT_SOURCE_1 ###
df['EXT_SOURCE_1'].describe()

In [None]:
### APARTMENTS_AVG ###
df['APARTMENTS_AVG'].describe()

In [None]:
### OBS_30_CNT_SOCIAL_CIRCLE ###
df['OBS_30_CNT_SOCIAL_CIRCLE'].describe()

In [None]:
# Frame shape and number of rows with a count above 100.
print(df.shape)
print(len(df[df['OBS_30_CNT_SOCIAL_CIRCLE']>100]))

In [None]:
# Blank out counts above 100.
df.loc[df['OBS_30_CNT_SOCIAL_CIRCLE']>100,'OBS_30_CNT_SOCIAL_CIRCLE'] = np.nan

In [None]:
# Re-run the filter; it should now come back empty.
df[df['OBS_30_CNT_SOCIAL_CIRCLE']>100]['OBS_30_CNT_SOCIAL_CIRCLE'] 

In [None]:
kde_plot(df,'OBS_30_CNT_SOCIAL_CIRCLE')

In [None]:
### OBS_60_CNT_SOCIAL_CIRCLE ###
df['OBS_60_CNT_SOCIAL_CIRCLE'].describe()

In [None]:
# List the counts above 100 for inspection.
df.loc[df['OBS_60_CNT_SOCIAL_CIRCLE']>100,'OBS_60_CNT_SOCIAL_CIRCLE']

In [None]:
# Blank out counts above 100, mirroring the 30-day column.
df.loc[df['OBS_60_CNT_SOCIAL_CIRCLE']>100,'OBS_60_CNT_SOCIAL_CIRCLE'] = np.nan

In [None]:
### DAYS_LAST_PHONE_CHANGE ###
# Store the column as magnitudes.
abs_func(df,'DAYS_LAST_PHONE_CHANGE')

In [None]:
### AMT_REQ_CREDIT_BUREAU_QRT ###
df['AMT_REQ_CREDIT_BUREAU_QRT'].describe()

In [None]:
# List the values above 10 for inspection.
df.loc[df['AMT_REQ_CREDIT_BUREAU_QRT']>10,'AMT_REQ_CREDIT_BUREAU_QRT']

In [None]:
kde_plot(df,'AMT_REQ_CREDIT_BUREAU_QRT')

In [None]:
# Blank out values above 100.
# NOTE(review): inspected at >10 but cut at >100 - confirm intended.
df.loc[df['AMT_REQ_CREDIT_BUREAU_QRT']>100,'AMT_REQ_CREDIT_BUREAU_QRT'] = np.nan

In [None]:
# Write the cleaned combined frame to df.csv without the index column.
df.to_csv('df.csv',index=False)

## bureau

In [None]:
### DAYS_CREDIT ###
# Store the column as magnitudes.
abs_func(bureau,'DAYS_CREDIT')

In [None]:
### CREDIT_DAY_OVERDUE ###
df_unused_note = None if False else None  # placeholder removed below

## bureau_balance

In [None]:
### MONTHS_BALANCE ###
abs_func(bureau_balance,'MONTHS_BALANCE')

In [None]:
### Save DataFrame ###
bureau_balance.to_csv('bureau_balance.csv',index=False)

## previous

In [None]:
### AMT_DOWN_PAYMENT ###
previous['AMT_DOWN_PAYMENT'].describe()

In [None]:
# Clamp negative down payments to 0.
previous.loc[previous['AMT_DOWN_PAYMENT']<0,'AMT_DOWN_PAYMENT'] = 0

In [None]:
### NFLAG_MICRO_CASH ###
# No such Variable

In [None]:
### NAME_CASH_LOAN_PURPOSE ###
previous['NAME_CASH_LOAN_PURPOSE'].value_counts()

In [None]:
### DAYS_DECISION ###
# Store the column as magnitudes.
abs_func(previous,'DAYS_DECISION')

In [None]:
### CODE_REJECT_REASON ###
previous['CODE_REJECT_REASON'].value_counts()

In [None]:
### NAME_CLIENT_TYPE ###
previous['NAME_CLIENT_TYPE'].value_counts()

In [None]:
### SELLERPLACE_AREA ###
previous['SELLERPLACE_AREA'].describe()

In [None]:
# Number of rows above 100000 and number of rows exactly equal to 0.
print(len(previous.loc[previous['SELLERPLACE_AREA']>100000,'SELLERPLACE_AREA']))
print(len(previous.loc[previous['SELLERPLACE_AREA']==0,'SELLERPLACE_AREA']))

In [None]:
# Clamp negative values to 0 (they become indistinguishable from the
# genuine zeros counted above).
previous.loc[previous['SELLERPLACE_AREA']<0,'SELLERPLACE_AREA'] = 0

In [None]:
kde_plot(previous,'SELLERPLACE_AREA')

In [None]:
### DAYS_FIRST_DRAWING ###
previous['DAYS_FIRST_DRAWING'].describe()

In [None]:
# Number of rows equal to 365243.
len(previous.loc[previous['DAYS_FIRST_DRAWING']==365243,'DAYS_FIRST_DRAWING'])

In [None]:
# Turn 365243 into NaN (presumably a placeholder value - confirm).
previous.loc[previous['DAYS_FIRST_DRAWING']==365243,'DAYS_FIRST_DRAWING'] = np.nan

In [None]:
# Store the remaining values as magnitudes.
abs_func(previous,'DAYS_FIRST_DRAWING')

In [None]:
### DAYS_FIRST_DUE ###
previous['DAYS_FIRST_DUE'].describe()

In [None]:
# Rows equal to 365243, then rows that are positive but below 365243.
print(len(previous.loc[previous['DAYS_FIRST_DUE']==365243,'DAYS_FIRST_DUE']))
print(len(previous.loc[(previous['DAYS_FIRST_DUE']<365243)&(previous['DAYS_FIRST_DUE']>0),'DAYS_FIRST_DUE']))

In [None]:
# Turn 365243 into NaN.
previous.loc[previous['DAYS_FIRST_DUE']==365243,'DAYS_FIRST_DUE'] = np.nan

In [None]:
# Store the remaining values as magnitudes.
abs_func(previous,'DAYS_FIRST_DUE')

In [None]:
### DAYS_LAST_DUE_1ST_VERSION ###
# Same treatment: 365243 -> NaN, then magnitudes.
previous.loc[previous['DAYS_LAST_DUE_1ST_VERSION']==365243,'DAYS_LAST_DUE_1ST_VERSION'] = np.nan
abs_func(previous,'DAYS_LAST_DUE_1ST_VERSION')

In [None]:
### DAYS_LAST_DUE ###
previous['DAYS_LAST_DUE'].describe()

In [None]:
# Number of rows that are positive but below 365243.
print(len(previous.loc[(previous['DAYS_LAST_DUE']<365243)&(previous['DAYS_LAST_DUE']>0),'DAYS_LAST_DUE']))

In [None]:
# Same treatment: 365243 -> NaN, then magnitudes.
previous.loc[previous['DAYS_LAST_DUE']==365243,'DAYS_LAST_DUE'] = np.nan
abs_func(previous,'DAYS_LAST_DUE')

In [None]:
### DAYS_TERMINATION ###
previous['DAYS_TERMINATION'].describe()

In [None]:
# Same treatment: 365243 -> NaN, then magnitudes.
previous.loc[previous['DAYS_TERMINATION']==365243,'DAYS_TERMINATION'] = np.nan
abs_func(previous,'DAYS_TERMINATION')

In [None]:
# Write the cleaned table to previous.csv without the index column.
previous.to_csv('previous.csv',index=False)

## pos

In [None]:
### MONTHS_BALANCE ###
abs_func(pos,'MONTHS_BALANCE')

In [None]:
### Save DataFrame ###
pos.to_csv('pos.csv',index=False)

## installments

In [None]:
### NUM_INSTALMENT_VERSION ###
installments['NUM_INSTALMENT_VERSION'].describe()

In [None]:
# 90th percentile of the column.
np.percentile(installments['NUM_INSTALMENT_VERSION'],90)

In [None]:
# Number of rows with a version above 100.
len(installments.loc[installments['NUM_INSTALMENT_VERSION']>100,'NUM_INSTALMENT_VERSION'])

In [None]:
# List the rows with a version above 50; inspection only, nothing is changed.
installments.loc[installments['NUM_INSTALMENT_VERSION']>50,'NUM_INSTALMENT_VERSION']

In [None]:
kde_plot(installments,'NUM_INSTALMENT_VERSION')

In [None]:
### DAYS_INSTALMENT ###
# Store the column as magnitudes.
abs_func(installments,'DAYS_INSTALMENT')

In [None]:
### DAYS_ENTRY_PAYMENT ###
# Store the column as magnitudes.
abs_func(installments,'DAYS_ENTRY_PAYMENT')

In [None]:
### AMT_INSTALMENT ###
installments['AMT_INSTALMENT'].describe()

In [None]:
kde_plot(installments,'AMT_INSTALMENT')

In [None]:
# Number of instalments above 1,000,000; inspection only.
len(installments.loc[installments['AMT_INSTALMENT']>1000000,'AMT_INSTALMENT'])

In [None]:
### Save DataFrame ###
# Write the cleaned table to installments.csv without the index column.
installments.to_csv('installments.csv',index=False)

## cc

In [None]:
### MONTHS_BALANCE ###
# Store the month offsets as magnitudes.
abs_func(cc,'MONTHS_BALANCE')

In [None]:
### AMT_BALANCE ###
cc['AMT_BALANCE'].describe()

In [None]:
# Number of rows with a negative balance; inspection only.
len(cc.loc[cc['AMT_BALANCE']<0,'AMT_BALANCE'])

In [None]:
kde_plot(cc,'AMT_BALANCE')

In [None]:
### AMT_DRAWINGS_ATM_CURRENT ###
cc['AMT_DRAWINGS_ATM_CURRENT'].describe()

In [None]:
# Number of rows with a negative amount.
len(cc.loc[cc['AMT_DRAWINGS_ATM_CURRENT']<0,'AMT_DRAWINGS_ATM_CURRENT'])

In [None]:
# Blank out the negative amounts.
cc.loc[cc['AMT_DRAWINGS_ATM_CURRENT']<0,'AMT_DRAWINGS_ATM_CURRENT'] = np.nan

In [None]:
kde_plot(cc,'AMT_DRAWINGS_ATM_CURRENT')

In [None]:
### AMT_DRAWINGS_CURRENT ###
cc['AMT_DRAWINGS_CURRENT'].describe()

In [None]:
# Number of rows with a negative amount.
len(cc.loc[cc['AMT_DRAWINGS_CURRENT']<0,'AMT_DRAWINGS_CURRENT'])

In [None]:
# Blank out the negative amounts.
cc.loc[cc['AMT_DRAWINGS_CURRENT']<0,'AMT_DRAWINGS_CURRENT'] = np.nan

In [None]:
### AMT_RECEIVABLE_PRINCIPAL ###
cc['AMT_RECEIVABLE_PRINCIPAL'].describe()

In [None]:
# Number of rows with a negative amount.
len(cc.loc[cc['AMT_RECEIVABLE_PRINCIPAL']<0,'AMT_RECEIVABLE_PRINCIPAL'])

In [None]:
kde_plot(cc,'AMT_RECEIVABLE_PRINCIPAL')

In [None]:
# Disabled: negative values are kept for this column.
#cc.loc[cc['AMT_RECEIVABLE_PRINCIPAL']<0,'AMT_RECEIVABLE_PRINCIPAL'] = np.nan

In [None]:
### AMT_RECIVABLE ###
# Count the negative values only; the blanking below is disabled.
print(len(cc.loc[cc['AMT_RECIVABLE']<0,'AMT_RECIVABLE']))
#cc.loc[cc['AMT_RECIVABLE']<0,'AMT_RECIVABLE'] = np.nan

In [None]:
### Save DataFrame ###
# Write the cleaned table to cc.csv without the index column.
cc.to_csv('cc.csv',index=False)