In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Let wide frames display (up to) 999 columns instead of being truncated.
pd.set_option('display.max_columns', 999)

# Application table for the training split.
train = pd.read_csv(
    '/kaggle/input/home-credit-default-risk/application_train.csv'
)
train

In [None]:
# Load the application table for the test split and display it.
test = pd.read_csv('/kaggle/input/home-credit-default-risk/application_test.csv')
test

In [None]:
# Load the POS_CASH_balance table and display it.
pos_cash_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv')
pos_cash_balance

In [None]:
# 'SK_ID_PREV' also exists in pos_cash_balance and 'AMT_ANNUITY' in bureau,
# so both are given a "_pa" suffix right after loading to keep the names distinct.
previous_app = pd.read_csv(
    '/kaggle/input/home-credit-default-risk/previous_application.csv'
).rename(
    columns={
        'SK_ID_PREV': 'SK_ID_PREV_pa',
        'AMT_ANNUITY': 'AMT_ANNUITY_pa',
    }
)
previous_app

# <a id='1'>간접적인 정보 추가방법</a>
* <a id='1'> 1. 통계량 정보 추가 (mean, std, median, max, min, sum ...)</a>
* <a id='1'> 2. 카테고리형에서 nunique 정보 추가</a>
* <a id='1'> 3. 각 ID 별 count 정보 추가 --> 여기서는 통계량 정보(예: sum)가 비슷한 역할을 할 수 있음</a>

In [None]:
# Per-applicant summary statistics of the POS/cash balance history.
# Only numeric columns are aggregated: mean/std/median are undefined for string
# columns, and pandas >= 2.0 raises a TypeError instead of silently dropping them.
# The group key is excluded from the value columns, as groupby does on its own.
pos_numeric_cols = list(
    pos_cash_balance.select_dtypes(include='number').columns.drop('SK_ID_CURR', errors='ignore')
)
pos_stats = (
    pos_cash_balance
    .groupby('SK_ID_CURR')[pos_numeric_cols]
    .agg(['mean', 'std', 'median', 'max', 'min', 'sum'])
)
pos_stats

In [None]:
# Load the bureau table and display it.
bureau = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau.csv')
bureau

In [None]:
# Per-applicant summary statistics of the bureau records.
# Only numeric columns are aggregated: mean/std/median are undefined for string
# columns, and pandas >= 2.0 raises a TypeError instead of silently dropping them.
# The group key is excluded from the value columns, as groupby does on its own.
bureau_numeric_cols = list(
    bureau.select_dtypes(include='number').columns.drop('SK_ID_CURR', errors='ignore')
)
bureau_stats = (
    bureau
    .groupby('SK_ID_CURR')[bureau_numeric_cols]
    .agg(['mean', 'std', 'median', 'max', 'min', 'sum'])
)
bureau_stats

In [None]:
# Per-applicant summary statistics of the previous applications.
# Only numeric columns are aggregated: mean/std/median are undefined for string
# columns, and pandas >= 2.0 raises a TypeError instead of silently dropping them.
# The group key is excluded from the value columns, as groupby does on its own.
previous_numeric_cols = list(
    previous_app.select_dtypes(include='number').columns.drop('SK_ID_CURR', errors='ignore')
)
previous_stats = (
    previous_app
    .groupby('SK_ID_CURR')[previous_numeric_cols]
    .agg(['mean', 'std', 'median', 'max', 'min', 'sum'])
)

# TODO: use something like add_prefix to rename column names that would otherwise be duplicated.

previous_stats

# <a id='1'>직접적인 정보 추가방법</a>
* <a id='1'> 카테고리형 칼럼추가</a>


In [None]:
# Number of previous applications per applicant and contract type, pivoted so that
# each contract type becomes a column (NaN where the applicant has none of that type).
# .size() counts rows per group exactly like agg(len) but avoids calling Python's
# len once per group.
previous_nct_direct = (
    previous_app
    .groupby(['SK_ID_CURR', 'NAME_CONTRACT_TYPE'])['SK_ID_PREV_pa']
    .size()
    .unstack()
)
previous_nct_direct

In [None]:
# Number of previous applications per applicant and goods category, pivoted so that
# each category becomes a column (NaN where the applicant has none in that category).
# .size() counts rows per group exactly like agg(len) but avoids calling Python's
# len once per group.
previous_ngc_direct = (
    previous_app
    .groupby(['SK_ID_CURR', 'NAME_GOODS_CATEGORY'])['SK_ID_PREV_pa']
    .size()
    .unstack()
)
previous_ngc_direct

# <a id='1'>칼럼의 개수를 줄여 차원 수와 RAM 소모량을 줄이기 위해 PCA / SVD 사용</a>

In [None]:
from sklearn.decomposition import PCA

# Compress the wide goods-category count matrix into 3 components.
# random_state is pinned because PCA may choose a randomized SVD solver for an input
# of this shape; without a seed the generated features differ from run to run.
pca = PCA(n_components = 3, random_state = 42)

# NaN after unstack means the applicant has no application in that category -> count 0.
previous_ngc_pca = pca.fit_transform(previous_ngc_direct.fillna(0))

In [None]:
# Display the PCA-transformed goods-category array (3 components per row).
previous_ngc_pca

In [None]:
# Wrap the goods-category components in a DataFrame that shares the index of the
# count matrix, and tag each component column (0, 1, 2) with an "_NGC" suffix.
pca_previous = pd.DataFrame(
    previous_ngc_pca,
    index=previous_ngc_direct.index,
).rename(columns=lambda component: f'{component}_NGC')
pca_previous

In [None]:
# Number of previous applications per applicant and seller industry, pivoted so that
# each industry becomes a column (NaN where the applicant has none in that industry).
# .size() counts rows per group exactly like agg(len) but avoids calling Python's
# len once per group.
previous_nsi_direct = (
    previous_app
    .groupby(['SK_ID_CURR', 'NAME_SELLER_INDUSTRY'])['SK_ID_PREV_pa']
    .size()
    .unstack()
)
previous_nsi_direct

In [None]:
# Refit the existing `pca` estimator on the seller-industry counts (NaN -> 0) and project them.
previous_nsi_pca = pca.fit_transform(previous_nsi_direct.fillna(0))

In [None]:
# Wrap the seller-industry components in a DataFrame that shares the index of the
# count matrix, and tag each component column (0, 1, 2) with an "_NSI" suffix.
pca_previous_nsi = pd.DataFrame(
    previous_nsi_pca,
    index=previous_nsi_direct.index,
).rename(columns=lambda component: f'{component}_NSI')
pca_previous_nsi

In [None]:
pd.options.display.max_columns = 999


def _flatten_stat_columns(stats, prefix):
    """Collapse (column, statistic) MultiIndex columns into flat, prefixed strings.

    Joining a frame with two-level columns onto one with single-level columns
    yields ambiguous tuple-named columns in older pandas and raises a MergeError
    in pandas >= 2.0. Each pair becomes ``"<prefix><column>_<statistic>"``.
    A frame whose columns are already flat is returned unchanged.
    """
    if not isinstance(stats.columns, pd.MultiIndex):
        return stats
    flat = stats.copy()
    flat.columns = [
        prefix + '_'.join(str(level) for level in column)
        for column in stats.columns
    ]
    return flat


# Stack train and test so that every feature is added to both in one pass.
alldata = pd.concat([train, test])

# Indirect information: per-applicant aggregate statistics from the side tables.
# The prefix keeps same-named source columns from different tables apart.
for stats, prefix in (
    (pos_stats, 'POS_'),
    (bureau_stats, 'BUREAU_'),
    (previous_stats, 'PREV_'),
):
    alldata = alldata.join(_flatten_stat_columns(stats, prefix), on = 'SK_ID_CURR')

# Direct information: contract-type counts and the PCA-compressed category counts.
for extra in (previous_nct_direct, pca_previous_nsi, pca_previous):
    alldata = alldata.join(extra, on = 'SK_ID_CURR')

alldata

In [None]:
# Correlation of every numeric feature with TARGET, sorted ascending.
# Non-numeric columns are dropped explicitly first: DataFrame.corr() no longer
# skips them silently in pandas >= 2.0 and raises on string data instead.
alldata_corr = (
    alldata
    .select_dtypes(include=['number', 'bool'])
    .corr()['TARGET']
    .sort_values()
)

In [None]:

# alldata_corr is sorted ascending, so tail() holds the largest values and head() the smallest.
print('Most Positive Correlations:\n', alldata_corr.tail(15))
print('\nMost Negative Correlations:\n', alldata_corr.head(15))

In [None]:
# Replace DAYS_BIRTH by its absolute value (intended for checking its correlation with TARGET).
alldata['DAYS_BIRTH'] = alldata['DAYS_BIRTH'].abs()
#alldata2['DAYS_BIRTH'].corr(alldata2['TARGET'])
alldata

In [None]:
# Feature matrix: every column except the label (TARGET) and the applicant ID.
alldata2 = alldata.drop(['TARGET', 'SK_ID_CURR'], axis=1)
alldata2

# Train 분포 

In [None]:
# Count of each TARGET value in the training set.
train['TARGET'].value_counts()

In [None]:
# Histogram of TARGET; the trailing ';' suppresses the text repr of the plot object.
train['TARGET'].astype(int).plot.hist();

In [None]:
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints a two-line summary (total number of columns and number of columns
    that contain missing values) and returns a DataFrame indexed by column name
    with the columns 'Missing Values' (count) and '% of Total Values'
    (percentage of rows, rounded to one decimal). Columns without missing
    values are left out; rows are sorted by percentage, largest first.
    """
    # Count of nulls per column, and the same expressed as a percentage of all rows.
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    # Put both series side by side and give the columns readable names.
    summary = pd.concat([null_counts, null_percent], axis=1)
    summary = summary.rename(columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only columns that actually have missing values, sorted by share.
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    # Print the summary line.
    total_columns = str(df.shape[1])
    columns_with_missing = str(summary.shape[0])
    print("app_train의 전체 컬럼 개수는 " + total_columns + "개 이다.\n"
          "그 중에서 결측치가 있는 컬럼 개수는 " + columns_with_missing + '개 이다.')

    return summary


In [None]:
# Build the missing-value summary for the training table and show its first 20 rows.
missing_values=missing_values_table(train)
missing_values.head(20)

# Label encoding vs One Hot encoding

The problem with label encoding is that it gives the categories an arbitrary ordering. The value assigned to each of the categories is random and does not reflect any inherent aspect of the category. In the example above, programmer receives a 4 and data scientist a 1, but if we did the same process again, the labels could be reversed or completely different. The actual assignment of the integers is arbitrary. Therefore, when we perform label encoding, the model might use the relative value of the feature (for example programmer = 4 and data scientist = 1) to assign weights, which is not what we want. If we only have two unique values for a categorical variable (such as Male/Female), then label encoding is fine, but for more than 2 unique categories, one-hot encoding is the safe option.

There is some debate about the relative merits of these approaches, and some models can deal with label encoded categorical variables with no issues. Here is a good Stack Overflow discussion. I think (and this is just a personal opinion) for categorical variables with many classes, one-hot encoding is the safest approach because it does not impose arbitrary values to categories. The only downside to one-hot encoding is that the number of features (dimensions of the data) can explode with categorical variables with many categories. To deal with this, we can perform one-hot encoding followed by PCA or other dimensionality reduction methods to reduce the number of dimensions (while still trying to preserve information).

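In this notebook, we will use Label Encoding for any categorical variables with only 2 categories and One-Hot Encoding for any categorical variables with more than 2 categories. This process may need to change as we get further into the project, but for now, we will see where this gets us. (We will also not use any dimensionality reduction in this notebook but will explore in future iterations). Note: the code as it currently stands differs from this plan: every object-typed column is label encoded (the one-hot encoding cell is commented out), and PCA is applied to the previous-application count features built earlier.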

In [None]:
# from sklearn.preprocessing import LabelEncoder

# le=LabelEncoder()

# c = alldata.columns[alldata.dtypes == object]

# for i in c:
#     alldata[i] = le.fit_transform(list(alldata[i]))


In [None]:
# # One Hot Encoding

# alldata = pd.get_dummies(alldata)

In [None]:
# pd.options.display.max_columns = 999

# alldata

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is reused; fit_transform refits it for every column.
le = LabelEncoder()

# Names of all columns of alldata2 whose dtype is object.
c = alldata2.columns[alldata2.dtypes == object]

# Overwrite each object column in place with the integer codes produced by the encoder.
for i in c :
    alldata2[i] = le.fit_transform(alldata2[i])

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Median imputation is currently disabled. To re-enable it:
#   imputer = SimpleImputer(strategy='median')
#   imputer.fit(alldata2)
#   imputer.transform(alldata2)   # returns an ndarray, not a DataFrame

# Scaling brings every feature onto a common range; MinMaxScaler maps each
# column to the interval [0, 1].
scaler = MinMaxScaler(feature_range=(0,1))

# Keep the scaled values under their own name, as a DataFrame with the original
# labels. Previously the ndarray returned by transform() was assigned to `alldata`,
# which destroyed the joined DataFrame, so earlier cells could not be re-run.
# NOTE(review): the train/test split in this notebook reads `alldata2`, not the
# scaled frame - confirm whether the model is meant to train on scaled values.
alldata_scaled = pd.DataFrame(
    scaler.fit_transform(alldata2),
    columns=alldata2.columns,
    index=alldata2.index,
)

print('Alldata shape: ', alldata2.shape)

In [None]:
# alldata2 is train stacked on top of test, so the first len(train) rows are the
# training features and the remainder are the test features (positional split).
train2, test2 = alldata2.iloc[:len(train)], alldata2.iloc[len(train):]

# <a id='1'> RandomForest 대신 CatBoost 사용 --> 카테고리형 컬럼이 많기 때문에 훨씬 좋은 성능을 보임</a>
# <a id='1'> + 교차검증 사용</a>

In [None]:
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

cbc = CatBoostClassifier(verbose = 50, task_type = 'GPU', iterations = 10000, learning_rate = 0.1, eval_metric = 'AUC')

skf = StratifiedKFold(n_splits = 5, shuffle= True, random_state = 42)

# The averaging divisor comes from the splitter, so changing n_splits above
# cannot leave the fold averages silently mis-scaled.
n_folds = skf.get_n_splits()

# Running averages over folds: test-set class probabilities and validation AUC.
result = 0
best_score = 0

for train_index, valid_index in skf.split(train2, train['TARGET']):
    x_train, x_valid = train2.iloc[train_index], train2.iloc[valid_index]
    y_train, y_valid = train['TARGET'].iloc[train_index], train['TARGET'].iloc[valid_index]
    # Each fit() call retrains the same estimator object on the current fold;
    # training stops once the validation AUC has not improved for 25 rounds.
    cbc.fit(x_train, y_train, eval_set = (x_valid, y_valid), early_stopping_rounds = 25)
    best_score += cbc.best_score_['validation']['AUC'] / n_folds
    result += cbc.predict_proba(test2) / n_folds
    

In [None]:
best_score

# Imputer + MinMaxScaler, with direct + indirect info added = 0.7778919041156769
# Same setup with the Imputer removed --> 0.7779257774353028


# <a id='1'> SHAP 적용</a>


In [None]:
import shap
# Initialise SHAP's JavaScript visualisation support for this notebook.
shap.initjs()

In [None]:
# Reproducible 1000-row sample of x_train, i.e. the training split left over from the last CV fold.
X_sampled = x_train.sample(1000, random_state = 42)

In [None]:
# Explain `cbc` in its current state (as fitted by the last CV fold) on the sampled rows.
explainer = shap.TreeExplainer(cbc)
shap_values = explainer.shap_values(X_sampled)

In [None]:
# Force plot for the first sampled row only.
shap.force_plot(explainer.expected_value, shap_values[0,:], X_sampled.iloc[0,:])

In [None]:
# Stacked force plot over all sampled rows. shap_values was computed on X_sampled,
# so the feature matrix passed here must be X_sampled too; passing x_train (as was
# done before) supplies a frame whose row count does not match the SHAP values.
shap.force_plot(explainer.expected_value, shap_values, X_sampled)

In [None]:
# Summary plot of the SHAP values for the sampled rows.
shap.summary_plot(shap_values, X_sampled)

In [None]:
# The same summary drawn as a bar chart (plot_type = 'bar').
shap.summary_plot(shap_values, X_sampled, plot_type = 'bar')

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# rf=RandomForestClassifier(n_estimators=100, random_state=50, verbose=1, n_jobs=-1)

In [None]:
# rf.fit(train2, train['TARGET'])

# #result = 1- (rf.predict_proba(test2))



# result = rf.predict_proba(test2)

In [None]:
# Inspect the fold-averaged predict_proba output for the test set.
result

In [None]:
# Keep only the second column of the predict_proba output (probability of class 1).
# The dimensionality guard makes the cell safe to re-run: once `result` is already
# one-dimensional, indexing with [:, 1] again would raise an IndexError.
if getattr(result, 'ndim', 1) == 2:
    result = result[:, 1]

In [None]:
# Inspect the predictions after selecting the second probability column.
result

In [None]:
# Load the sample submission file to use as the output template.
sub = pd.read_csv('/kaggle/input/home-credit-default-risk/sample_submission.csv')
sub

In [None]:
# Write the predictions into the template.
# NOTE(review): assumes the rows of `sub` are in the same order as `test` - confirm.
sub['TARGET'] = result
sub

In [None]:
# Show the last 50 rows of the submission.
sub.tail(50)

In [None]:
# Write the submission to the working directory without the row index.
sub.to_csv('sub1.csv', index=False)