In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [None]:
print(os.listdir("../input/home-credit-default-risk"))

In [None]:
# Application_train 데이터 feature 확인
app_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
print('Training data shape: ', app_train.shape)
app_train.head()

In [None]:
# Testing data features 확인
app_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')
print('Testing data shape: ', app_test.shape)
app_test.head() #target없음

# EDA

### Examine the distribution of the target column

In [None]:
app_train['TARGET'].value_counts()

In [None]:
app_train['TARGET'].value_counts().plot(kind='bar')
#매우 불균형한 데이터

### Examining Missing Values

In [None]:
# Helper that reports the number and percentage of missing values per column.
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints a two-line summary (total column count and how many columns
    contain missing values) and returns the per-column details.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, restricted to columns with at least one
        missing value, with the columns ``'Missing Values'`` (count) and
        ``'% of Total values'`` (percentage of rows), sorted by percentage
        in descending order.
    """
    # Count the nulls once and derive the percentage from the same Series.
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    mis_val_table_re_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total values'})
    # Keep only the columns that actually have missing values.
    has_missing = mis_val_table_re_columns.iloc[:, 1] != 0
    mis_val_table_re_columns = mis_val_table_re_columns[has_missing].sort_values(
        '% of Total values', ascending=False)

    # Build a single string so the second line does not start with the
    # separator space that print() inserts between positional arguments.
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_re_columns.shape[0])
          + " columns that have missing values")
    return mis_val_table_re_columns

In [None]:
missing_values = missing_values_table(app_train)
missing_values.head(20)
#많은 값이 null값 비율이 6-70%정도이다.

### column type

In [None]:
app_train.dtypes.value_counts()

In [None]:
app_train.select_dtypes('object').apply(pd.Series.nunique, axis=0)
# object type 중 unique한 개수 파악

### encoding categorical variables

In [None]:
le = LabelEncoder()
le_count = 0

# Object columns with at most two distinct values are label encoded here,
# using the encoder fitted on the training column for the test column too.
# Object columns with more distinct values are left untouched so that
# pd.get_dummies can one-hot encode them afterwards.
for col in app_train:
    if app_train[col].dtype != 'object':
        continue
    if len(app_train[col].unique()) > 2:
        continue

    app_train[col] = le.fit_transform(app_train[col])
    app_test[col] = le.transform(app_test[col])
    le_count += 1
print('%d columns were label encoded ' % le_count)

In [None]:
# one-hot-encoding
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

In [None]:
print(app_train.shape)
print(app_test.shape)
# train과 test의 개수가 달라짐. get_dummies과정을 거쳐서 그런듯.

In [None]:
# 달라진 컬럼 확인
app_train.head()

In [None]:
app_test.head()

### Aligning Training and Test data

In [None]:
# 테스트를 기준으로 train을 정렬

train_labels = app_train['TARGET'] # 타겟 데이터 미리 빼놓기

app_train, app_test = app_train.align(app_test, join='inner',axis=1)
app_train['TARGET'] = train_labels

In [None]:
print(app_train.shape)
print(app_test.shape)

### 이상치 anomalies

In [None]:
app_train.describe()

In [None]:
(app_train['DAYS_BIRTH']/-365).describe() 

In [None]:
# 대출 상환과 직접적인 관련이 있을 것 같은 'DAYS_EMPLOYED'살피기
app_train['DAYS_EMPLOYED'].describe() # 뭔가 이상하다.

In [None]:
app_train['DAYS_EMPLOYED'].plot.hist()
plt.xlabel('Days Employment')

In [None]:
app_train['DAYS_EMPLOYED'].value_counts().head() 
# 365243이라는 값이 뭘까

In [None]:
anom = app_train[app_train['DAYS_EMPLOYED']==365243]
non_anom = app_train[app_train['DAYS_EMPLOYED']!=365243]
print("the non-anomalies default on %0.2f%% of loans " %(100*non_anom['TARGET'].mean()))
print("the anomalies default on %0.2f%% of loans" %(100*anom['TARGET'].mean()))
print("there are %d anomalous days of employment" %(len(anom)))
# 365243인 값들만 빼서 target과의 연관성을 보았다.
# 이상치가 있는 경우 없는 경우보다 loan평균이 더 낮다. 즉, 상환율이 더 좋다. 

In [None]:
(len(non_anom) / len(app_train))*100

### 이상치 값들을 DAYS_EMPLOYED_ANOM컬럼으로 따로 빼주기
이상치가 있는 경우 상환율이 더 좋기 때문에 유의미하다고 느껴진다.

In [None]:
app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243

In [None]:
# Map the 365243 sentinel to NaN. The result is assigned back to the column
# instead of using inplace=True on a column selection: that form is a chained
# assignment and does not update app_train when pandas Copy-on-Write is active.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})
app_train['DAYS_EMPLOYED'].plot.hist(title="Days Employment Histogram")
plt.xlabel('Days Employment')
# With 365243 replaced by NaN the histogram looks normal.

In [None]:
# Apply the same treatment to app_test: flag the sentinel rows first, then
# map the sentinel to NaN.
app_test['DAYS_EMPLOYED_ANOM'] = app_test['DAYS_EMPLOYED'] == 365243
# Assign back rather than inplace=True on a column selection (chained
# assignment; it does not update app_test under pandas Copy-on-Write).
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})
print('There are %d anomalies in the test data out of %d entries' %(app_test['DAYS_EMPLOYED_ANOM'].sum(),len(app_test)))

### 연관성 correlation

In [None]:
# TARGET과의 연관성 보기
correlations = app_train.corr()['TARGET'].sort_values()

print('most positive Corr \n', correlations.tail(15))
print('most negative Corr \n', correlations.head(15))
# 긍정 연관성에는 DAYS_BIRTH 등, 부정 연관성에는 EXT_SOURCE_1,2,3 등

In [None]:
# DAYS_BIRTH

app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH']) # 절대값 취하기
app_train['DAYS_BIRTH'].corr(app_train['TARGET']) # 절대값 취하고 보니 negative

In [None]:
app_train['DAYS_BIRTH'].head()

In [None]:
# 연령대별 빈도수 히스토그램으로 보기
plt.style.use('fivethirtyeight')
plt.hist(app_train['DAYS_BIRTH']/365, edgecolor='k', bins=25)

In [None]:
plt.figure(figsize=(10,8))
sns.kdeplot(app_train.loc[app_train['TARGET']==0, 'DAYS_BIRTH']/365, label='target==0') #파랑
sns.kdeplot(app_train.loc[app_train['TARGET']==1, 'DAYS_BIRTH']/365, label='target==1') #주황
plt.xlabel('Age(years)')
plt.ylabel('Density')
plt.title('distribution of age')
#상환못함을 의미하는 주황색 라인을 보면, 연령대가 낮을수록 상환능력이 떨어진다는 것을 알 수 있다.

In [None]:
# Split the ages into 10 equal-width bins between 20 and 70 years.
# .copy() makes age_data an independent frame, so adding columns below does
# not write into a slice of app_train (avoids SettingWithCopy problems).
age_data = app_train[['TARGET','DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH']/365
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20,70,num=11))
age_data.head(10)

In [None]:
# 연령대별 TARGET비율
age_groups = age_data.groupby('YEARS_BINNED').mean()
age_groups

In [None]:
plt.figure(figsize=(8,6))
plt.bar(age_groups.index.astype(str), 100*age_groups['TARGET'])

plt.xticks(rotation=75)
plt.xlabel('Age group years')
plt.ylabel('failure to repay')
plt.title("failure to repay by age group")
# 연령대가 어릴수록 상환을 못하는 경우가 많다!

In [None]:
# EXT_SOURCE_1,2,3

ext_data = app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3', 'DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(ext_data_corrs, cmap=plt.cm.RdYlBu_r, vmin=-0.25,annot=True,vmax=0.6)
# EXT_SOURCE_1과 DAYS_BIRTH의 연관성이 높다.

In [None]:
# TARGET에 따른 EXT_SOURCE_1,2,3컬럼 시각화
plt.figure(figsize=(10,12))

for i, source in enumerate(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']):
    plt.subplot(3,1,i+1)
    sns.kdeplot(app_train.loc[app_train['TARGET']==0, source], label='target==0')
    sns.kdeplot(app_train.loc[app_train['TARGET']==1, source], label='target==1')
    
    plt.title('distribution of %s by Traget Value' %source)
    plt.xlabel('%s' %source)
    plt.ylabel('Density')

plt.tight_layout(h_pad=2.5)
# EXT_SOURCE_1,3이 TARGET에 따른 모양이 다르다. 즉, TARGET에 영향을 받는 것 같다.

# bureau데이터를 가지고 모델 만들기
bureau.csv: 고객이 이 곳에서 대출하기 전에 다른 금융 기관에서 대출했던 내역에 관한 데이터

In [None]:
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau.head()

In [None]:
bureau.shape

In [None]:
bureau.info()

In [None]:
# SK_ID_CURR를 기준으로 SK_ID_BUREAU값이 몇 번 나왔는지 보기
bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().head()

In [None]:
# 컬럼명 변경 더 직관적으로.. SK_ID_BUREAU -> previous_loan_counts
previous_loan_counts = bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns={'SK_ID_BUREAU':'previous_loan_counts'})
previous_loan_counts.head()

In [None]:
train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
train = train.merge(previous_loan_counts, on='SK_ID_CURR', how='left')
train['previous_loan_counts'] = train['previous_loan_counts'].fillna(0)
print(train['previous_loan_counts'].isnull().sum())

In [None]:
train.head()

### seaborn의 kdeplot이용하여 해당 컬럼의 밀도 보기

##### .iloc
integer position을 통해 값을 찾을 수 있다. 라벨X
##### .loc
라벨을 통해, integer positionX

In [None]:
def kde_target(var_name, df):
    """Plot the density of ``var_name`` separately for each TARGET class.

    Draws one KDE curve for rows with ``TARGET == 0`` and one for rows with
    ``TARGET == 1``, then prints the correlation between ``var_name`` and
    ``TARGET`` together with the per-class median of ``var_name``.

    Parameters
    ----------
    var_name : str
        Column of ``df`` to visualise.
    df : pandas.DataFrame
        Frame containing ``var_name`` and a ``TARGET`` column.
    """
    # Values of the variable for each TARGET class.
    values_target_0 = df.loc[df['TARGET'] == 0, var_name]
    values_target_1 = df.loc[df['TARGET'] == 1, var_name]

    target_corr = df['TARGET'].corr(df[var_name])
    median_target_0 = values_target_0.median()
    median_target_1 = values_target_1.median()

    plt.figure(figsize=(12,6))
    for values, curve_label in ((values_target_0, 'TARGET==0'),
                                (values_target_1, 'TARGET==1')):
        sns.kdeplot(values, label=curve_label)

    plt.xlabel(var_name)
    plt.ylabel('density')
    plt.title('%s distibution'%var_name)
    plt.legend()

    print('The correlation between %s and the TARGET is %0.4f' %(var_name, target_corr))
    print('Median value for loan that was not repaid = %0.4f' %median_target_1)
    print('Median value for loan that was repaid = %0.4f' %median_target_0)

In [None]:
kde_target('EXT_SOURCE_3', train)

In [None]:
kde_target('previous_loan_counts', train)
# 특별한 것은 찾지 못함..

### Numeric 데이터 보기: Aggregation Numeric Columns

In [None]:
# Aggregate the numeric bureau columns per client ('SK_ID_CURR');
# 'SK_ID_BUREAU' is dropped first because it is only an identifier.
# - Only numeric columns are kept: 'mean' is undefined for object columns and
#   raises on recent pandas versions.
# - The group key is left as the index and restored with reset_index(); adding
#   as_index=False as well yields an extra 'index' column on pandas versions
#   that honour it for list aggregations, which breaks the column renaming
#   done in the cells below.
# NOTE(review): assumes 'SK_ID_CURR' is a numeric column so that it survives
# select_dtypes('number') - confirm with bureau.info().
bureau_agg = (
    bureau.drop(columns=['SK_ID_BUREAU'])
    .select_dtypes('number')
    .groupby('SK_ID_CURR')
    .agg(['count', 'mean', 'max', 'min', 'sum'])
    .reset_index()
)
bureau_agg.head()

In [None]:
# 행이 두줄로 되어있다. -> levels[0]/ levels[1]
columns = ['SK_ID_CURR']
for var in bureau_agg.columns.levels[0]:
    print(var)
# max, min등이 나올 수 있는 numeric type columns만 나옴

In [None]:
# 이중컬럼은 불편하기 때문에 '기존 컬럼+level1'이라는 이름으로 컬럼명 추가
for var in bureau_agg.columns.levels[0]:
    if var != 'SK_ID_CURR':
        for stat in bureau_agg.columns.levels[1][:-1]:
            columns.append('bureau_%s_%s' %(var, stat))

In [None]:
bureau_agg.columns

In [None]:
bureau_agg.columns = columns

In [None]:
bureau_agg.head() # 새로운 컬럼 세팅 완료

In [None]:
# train에 합쳐주기
train = train.merge(bureau_agg, on='SK_ID_CURR', how='left')
train.head()

In [None]:
train.shape

In [None]:
# 새롭게 만들어진 컬럼으로 corr값 보기
new_corrs = []
for col in columns:
    corr = train['TARGET'].corr(train[col])
    
    new_corrs.append((col, corr))

In [None]:
new_corrs[:5]

In [None]:
# 연관성이 높은 순서대로 정렬
new_corrs = sorted(new_corrs, key = lambda x : abs(x[1]), reverse=True)
new_corrs[:5]

In [None]:
# 연관성이 가장 높은 'bureau_DAYS_CREDIT_mean'시각화
kde_target('bureau_DAYS_CREDIT_mean', train)

### Function for Numeric Aggregations
위에서 한 과정을 함수로 만들기: 새로운 컬럼 추가

In [None]:
def agg_numeric(df, group_var, df_name):
    """Aggregate the numeric columns of ``df`` for each value of ``group_var``.

    Every column whose name contains ``'SK_ID'`` is dropped, except
    ``group_var`` itself. For each remaining numeric column the count, mean,
    max, min and sum are computed per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate; must contain ``group_var``.
    group_var : str
        Column to group by.
    df_name : str
        Prefix used for the generated column names.

    Returns
    -------
    pandas.DataFrame
        One row per group with ``group_var`` as the first column, followed by
        columns named ``'<df_name>_<column>_<stat>'``.
    """
    # Drop all identifier columns other than the grouping key in one step,
    # wherever they appear in the frame.
    id_columns = [col for col in df.columns if col != group_var and 'SK_ID' in col]
    df = df.drop(columns=id_columns)

    # Work on a copy of the numeric part and make sure the grouping key is
    # present even when it is not numeric itself.
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[group_var] = df[group_var]

    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Flatten the two-level header using the actual (column, stat) pairs, so
    # each name is guaranteed to describe the values stored under it.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    return agg.reset_index()

In [None]:
bureau_agg_new = agg_numeric(bureau.drop(columns = ['SK_ID_BUREAU']), group_var='SK_ID_CURR', df_name='bureau')
bureau_agg_new.head()

In [None]:
bureau_agg.head()
# 비교해보면 함수로 앞서 한 똑같은 과정을 잘 만든 것을 확인할 수 있다. 

### Correlation Function
위에서한 과정을 함수로 만들기 : corr구하기

In [None]:
def target_corrs(df):
    """Rank the columns of ``df`` by their correlation with ``TARGET``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``TARGET`` column. Every other column is
        correlated against it with ``Series.corr``.

    Returns
    -------
    list of tuple
        ``(column_name, correlation)`` pairs for every column except
        ``TARGET``, sorted by absolute correlation in descending order.
    """
    corrs = []

    for col in df.columns:
        if col != 'TARGET':
            # Collect into the result list (not into the float itself).
            corrs.append((col, df['TARGET'].corr(df[col])))

    # Sort once, after all columns have been processed.
    return sorted(corrs, key=lambda x: abs(x[1]), reverse=True)

### categorical 데이터 보기

In [None]:
# 만약 1번 SK_ID_CURR유저가 loan_type이 3개의 home과 1개의 credit이 있다...이런 것들 카운팅해주는 것
categorical = pd.get_dummies(bureau.select_dtypes('object'))
categorical['SK_ID_CURR'] = bureau['SK_ID_CURR']
categorical.head()

In [None]:
categorical_grouped = categorical.groupby('SK_ID_CURR').agg(['sum','mean'])
categorical_grouped.head()

In [None]:
# 여기서도 열이 levels[0], levels[1]로 나뉨
categorical_grouped.columns.levels[0][:10]

In [None]:
categorical_grouped.columns.levels[1]

In [None]:
# 이중컬럼 대신 새로운 컬럼 만들기
group_var = 'SK_ID_CURR'
columns = []

for var in categorical_grouped.columns.levels[0]:
    
    if var != group_var:
        for stat in ['count','count_norm']:
            columns.append('%s_%s' %(var,stat))

            
categorical_grouped.columns = columns
categorical_grouped.head()

In [None]:
# numerical과는 다르게 index값이 SK_ID_CURR로 들어가있어 right_index와 left_on을 해준다.
train = train.merge(categorical_grouped, left_on='SK_ID_CURR', right_index=True, how='left')
train.head()

In [None]:
# 함수로 만들어주기
def count_categorical(df, group_var, df_name):
    """Count the categories of every object column for each ``group_var``.

    The object columns of ``df`` are one-hot encoded, then summed and
    averaged per group: the sum is the number of occurrences of a category
    within the group and the mean is that count divided by the group size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``group_var`` and the object columns to count.
    group_var : str
        Column to group by.
    df_name : str
        Prefix used for the generated column names.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``group_var`` with, for every one-hot column, the columns
        ``'<df_name>_<dummy>_count'`` and ``'<df_name>_<dummy>_count_norm'``.
    """
    categorical = pd.get_dummies(df.select_dtypes('object'))
    categorical[group_var] = df[group_var]

    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])

    # Name each column from its own (dummy, stat) pair instead of from the
    # header levels, so a label can never be attached to the wrong values.
    stat_names = {'sum': 'count', 'mean': 'count_norm'}
    categorical.columns = [
        '%s_%s_%s' % (df_name, var, stat_names[stat])
        for var, stat in categorical.columns
    ]

    return categorical

In [None]:
bureau_counts = count_categorical(bureau, group_var='SK_ID_CURR', df_name='bureau')
bureau_counts.head()

### bureau_balance.csv 활용
분할상환 데이터

In [None]:
bureau_balance = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')
bureau_balance.head()

In [None]:
bureau_balance.shape

In [None]:
# 앞서 만든 함수로 bureau_balance에 categorical데이터를 나누어 새로운 컬럼으로 추가 
bureau_balance_counts = count_categorical(bureau_balance, group_var='SK_ID_BUREAU', df_name='bureau_balance')
bureau_balance_counts.head()

In [None]:
bureau_balance_counts.shape

In [None]:
# numeric데이터도 앞서 만든 함수로 처리
bureau_balance_agg = agg_numeric(bureau_balance, group_var='SK_ID_BUREAU', df_name='bureau_balance')
bureau_balance_agg.head()

In [None]:
bureau_balance_agg.shape

In [None]:
# bureau_balance_counts와 bureau_balance_agg합치기
bureau_by_loan = bureau_balance_agg.merge(bureau_balance_counts, right_index=True, left_on='SK_ID_BUREAU', how='outer')
bureau_by_loan.head() # 16+6=22개의 컬럼

In [None]:
bureau_by_loan = bureau_by_loan.merge(bureau[['SK_ID_BUREAU','SK_ID_CURR']], on='SK_ID_BUREAU', how='left')
bureau_by_loan.head()

In [None]:
bureau_balance_client = agg_numeric(bureau_by_loan.drop(columns=['SK_ID_BUREAU']), group_var='SK_ID_CURR', df_name='client')
bureau_balance_client.head()

### 다시 정리해본다.
1. bureau, bureau_balance 데이터 다시 불러서 정리하기
2. 위의 내용 train에 합쳐주기
3. missing value 정리해주기
4. test data
5. 결과예측

In [None]:
train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau_balance = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')

In [None]:
bureau_counts = count_categorical(bureau, group_var='SK_ID_CURR', df_name='bureau')
bureau_counts.head()

In [None]:
bureau_agg = agg_numeric(bureau.drop(columns=['SK_ID_BUREAU']), group_var='SK_ID_CURR', df_name='bureau')
bureau_agg.head()

In [None]:
bureau_balance_counts

## train에 합쳐주기

In [None]:
original_features = list(train.columns)
print(len(original_features))

In [None]:
train = train.merge(bureau_counts, on='SK_ID_CURR', how='left')
train.head()

In [None]:
train = train.merge(bureau_agg, on='SK_ID_CURR', how='left')
train.head()

In [None]:
train = train.merge(bureau_balance_client, on='SK_ID_CURR', how='left')
train.head()

In [None]:
new_features = list(train.columns)
print(len(new_features))

### Missing Values 처리

In [None]:
def missing_values_table(df):
    """Report the missing values of ``df`` per column.

    Note: this redefines the function of the same name declared earlier in
    the notebook; from this cell on the percentage column is called
    ``'% of Total Values'`` and the values are rounded to one decimal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, restricted to columns with missing values,
        with the columns ``'Missing Values'`` and ``'% of Total Values'``,
        sorted by percentage in descending order and rounded to one decimal.
    """
    # Absolute and relative number of nulls per column, side by side.
    null_counts = df.isnull().sum()
    null_percent = 100 * df.isnull().sum() / len(df)
    summary = pd.concat([null_counts, null_percent], axis=1)
    summary = summary.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})

    # Keep only the columns with missing values, highest percentage first.
    has_missing = summary.iloc[:,1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
        "There are " + str(summary.shape[0]) +
          " columns that have missing values.")

    return summary

In [None]:
missing_train = missing_values_table(train)
missing_train.head(10)

In [None]:
missing_train_vars = list(missing_train.index[missing_train['% of Total Values'] > 90])
len(missing_train_vars)

### Test data 

In [None]:
# Read in the test dataframe
test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

# Merge with the value counts of bureau
test = test.merge(bureau_counts, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of bureau
test = test.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')

# Merge with the per-client bureau_balance stats, exactly as done for train.
# Without this merge the inner align of train and test below would drop every
# 'client_*' feature from the training data.
test = test.merge(bureau_balance_client, on = 'SK_ID_CURR', how = 'left')

In [None]:
print('Shape of Testing Data: ', test.shape)

In [None]:
train_labels = train['TARGET']

# Align the dataframes, this will remove the 'TARGET' column
train, test = train.align(test, join = 'inner', axis = 1)

train['TARGET'] = train_labels

In [None]:
print('Training Data Shape: ', train.shape)
print('Testing Data Shape: ', test.shape)

In [None]:
missing_test = missing_values_table(test)
missing_test.head(10)

In [None]:
missing_test_vars = list(missing_test.index[missing_test['% of Total Values'] > 90])
len(missing_test_vars)

In [None]:
missing_columns = list(set(missing_test_vars + missing_train_vars))

In [None]:
# Drop the missing columns
train = train.drop(columns = missing_columns)
test = test.drop(columns = missing_columns)
print(train.shape, test.shape)

In [None]:
apps = pd.concat([train,test])
print(apps.shape)

In [None]:
# 데이터 레이블 인코딩
object_columns = apps.dtypes[apps.dtypes=='object'].index.tolist()

for column in object_columns:
    apps[column] = pd.factorize(apps[column])[0]

In [None]:
# Rows coming from the training file carry a TARGET value; rows coming from
# the test file have TARGET missing after the concat. Explicit notnull() /
# isnull() masks are used instead of negating a boolean Series with unary '-'.
apps_train = apps[apps['TARGET'].notnull()]
apps_test = apps[apps['TARGET'].isnull()]
apps.shape, apps_train.shape, apps_test.shape

In [None]:
from sklearn.model_selection import train_test_split

# Features and labels are both taken from apps_train so that they are
# guaranteed to describe the same rows; the labels must not come from
# app_train, which is a separate frame built in the EDA part of the notebook.
ftr_app = apps_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
target_app = apps_train['TARGET']

train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
train_x.shape, valid_x.shape

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Gradient-boosted classifier: up to 1000 trees with learning rate 0.02,
# 32 leaves per tree, depth capped at 12 and a 0.8 row subsample setting.
# NOTE(review): `silent` is believed to be deprecated/removed in newer
# LightGBM releases - confirm against the installed version.
clf = LGBMClassifier(
        n_jobs=-1,
        n_estimators=1000,
        learning_rate=0.02,
        num_leaves=32,
        subsample=0.8,
        max_depth=12,
        silent=-1,
        verbose=-1
        )

# Fit on the training split and evaluate AUC on both the training and the
# validation split (in that order in eval_set).
# NOTE(review): passing `verbose` and `early_stopping_rounds` to fit() is
# believed to be unsupported from LightGBM 4.0 on, where the callbacks
# lightgbm.log_evaluation / lightgbm.early_stopping are used instead - confirm
# against the installed version before a fresh Run All.
clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
        early_stopping_rounds= 100)

In [None]:
from lightgbm import plot_importance

plot_importance(clf, figsize=(16, 32))

In [None]:
preds = clf.predict_proba(apps_test.drop(['SK_ID_CURR', 'TARGET'], axis=1))[:, 1 ]

In [None]:
# Build the submission from the rows that were actually scored (apps_test),
# so each prediction is paired with the SK_ID_CURR of its own row rather than
# relying on the EDA-stage app_test frame having the same row order.
submission = apps_test[['SK_ID_CURR']].copy()
submission['TARGET'] = preds
submission.to_csv('apps_baseline.csv', index=False)