Xiao Song 宋骁


references:

https://www.kaggle.com/rikdifos/eda-vintage-analysis  

https://www.kaggle.com/rikdifos/credit-card-approval-prediction-using-ml/

https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment

[王汉生：再论正负样本分布不均衡问题](https://mp.weixin.qq.com/s/y-IEltRsmNdbZGyO2hBrwA)

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import category_encoders as ce
from scikitplot.metrics import plot_roc_curve
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, balanced_accuracy_score, roc_curve, auc, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb

# Raw Kaggle tables; they are consumed later by `feature_eng`.
application_record = pd.read_csv('../input/credit-card-approval-prediction/application_record.csv')
credit_record = pd.read_csv('../input/credit-card-approval-prediction/credit_record.csv')

# Pre-processed table pickled by the companion notebook.
data = pd.read_pickle('../input/credit-card-approval-prediction-using-ml/data.pkl')

# Cast the identifier and flag columns to int64 in a single pass.
data = data.astype({name: 'int64' for name in ('ID', 'Gender', 'Car', 'Reality', 'FLAG_MOBIL')})
print('data has {} rows and {} columns'.format(data.shape[0], data.shape[1]))
data

In [None]:
plt.rcParams['figure.facecolor'] = 'white'

In [None]:
# The label is highly unbalanced: the positive class makes up 422 / (422 + 24712) = 0.0168 of the samples.
data['target'].value_counts()

In [None]:
def split_dt(dt,col_list):
    '''Split `dt` into stratified train and test sets.

    The features are the `col_list` columns of `dt`; the labels are the
    `target` column cast to int. 25% of the rows are held out as the test
    set, stratified on the labels, with a fixed random seed (1024).
    Prints the class proportions of the test labels.

    Returns (x_train, x_test, y_train, y_test).
    '''
    labels = dt['target'].astype('int')
    features = dt[col_list]

    parts = train_test_split(features, labels,
                             stratify=labels,
                             test_size=0.25,
                             random_state=1024)
    x_train, x_test, y_train, y_test = parts

    # Show that stratification kept the class ratio in the held-out set.
    print(pd.Series(y_test).value_counts(normalize=True))
    return x_train, x_test, y_train, y_test

In [None]:
# Feature columns selected from the pre-processed `data` table
# (presumably already-binned / dummy-encoded columns -- verify against the
# notebook that produced data.pkl). Everything else in `data` is ignored.
col_list = ['Gender','Reality','ChldNo_1', 'ChldNo_2More','wkphone',
              'gp_Age_high', 'gp_Age_highest', 'gp_Age_low',
       'gp_Age_lowest','gp_worktm_high', 'gp_worktm_highest',
       'gp_worktm_low', 'gp_worktm_medium','occyp_hightecwk', 
              'occyp_officewk','famsizegp_1', 'famsizegp_3more',
       'houtp_Co-op apartment', 'houtp_Municipal apartment',
       'houtp_Office apartment', 'houtp_Rented apartment',
       'houtp_With parents','edutp_Higher education',
       'edutp_Incomplete higher', 'edutp_Lower secondary','famtp_Civil marriage',
       'famtp_Separated','famtp_Single / not married','famtp_Widow']
# Stratified 75/25 split; these four names are reused by the cells below.
x_train, x_test, y_train, y_test = split_dt(data, col_list)

In [None]:
x_train

In [None]:
# First, I won't use any data augmentation method and will calculate the accuracy score. However, this accuracy is meaningless: when a classifier always guesses the largest class, the accuracy is overestimated. As a result, I use balanced_accuracy_score.

def classifier_metrics(x_train, y_train, x_test, y_true=None):
    '''Fit an XGBoost classifier and print evaluation metrics on the test set.

    Parameters
    ----------
    x_train, y_train : training features and labels (converted to numpy arrays).
    x_test : test features (converted to a numpy array).
    y_true : true labels for `x_test`. When omitted, the notebook-level
        variable `y_test` is used, which is what this function has always
        read implicitly; passing it explicitly removes that hidden coupling.

    Prints accuracy, balanced accuracy, ROC AUC, log loss, precision, recall
    and F1 on the test set, then draws the ROC curve. Returns nothing.
    '''
    if y_true is None:
        # Backward-compatible fallback to the global defined by split_dt's caller.
        y_true = y_test

    # Arrays drop the column names, so train and test must share column order.
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test = np.array(x_test)
    model = XGBClassifier(max_depth=12,
                          n_estimators=250,
                          min_child_weight=8, 
                          subsample=0.8, 
                          colsample_bytree=0.8,
                          learning_rate=0.02,    
                          seed=42)

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # Full probability matrix is kept for the ROC plot; column 1 is used
    # as the score of the positive class (labels are 0/1).
    y_pred_proba0 = model.predict_proba(x_test)
    y_pred_proba = y_pred_proba0[:, 1] 
    print('test set accuracy score is {:.5}'.format(accuracy_score(y_true, y_pred)))
    print('test set balanced accuracy score is {:.5}'.format(balanced_accuracy_score(y_true, y_pred)))
    print('test set roc_auc_score is {:.5}'.format(roc_auc_score(y_true, y_pred_proba)))
    print('test set log_loss is {:.5}'.format(log_loss(y_true, y_pred_proba)))
    print('test set precision_score is {:.5}'.format(precision_score(y_true, y_pred)))
    print('test set recall_score is {:.5}'.format(recall_score(y_true, y_pred)))
    print('test set f1_score is {:.5}'.format(f1_score(y_true, y_pred)))
    plot_roc_curve(y_true, y_pred_proba0)
    plt.show()
    
classifier_metrics(x_train, y_train, x_test)

The accuracy score is 0.98, but don't be misled by the high score: it is only because 98.3% of the labels belong to the majority (negative) class. `balanced_accuracy_score` shows that, once the class imbalance is accounted for, the real accuracy is 0.5, which equals random guessing. Next we try to use `SMOTE` to oversample our data. 

On the other h

In [None]:
# Oversample the minority class of the training split only (the test split
# stays untouched). `fit_resample` is the supported name; the old
# `fit_sample` alias is no longer available in recent imbalanced-learn.
x_bal, y_bal = SMOTE().fit_resample(x_train, y_train)
pd.Series(y_bal).value_counts()

In [None]:
classifier_metrics(x_bal, y_bal, x_test)

Now experiment with Jiwei Liu's data augmentation method to see whether the ROC AUC improves.

In [None]:
def data_augment(x, y, t, col):
    '''Oversample the positive class with column-wise shuffled copies.

    Parameters
    ----------
    x : DataFrame of features.
    y : Series of labels sharing the index of `x`; rows with y > 0 are positives.
    t : number of augmented copies of the positive rows to append.
    col : iterable of column names of `x` to shuffle independently.

    For each of the `t` rounds every column in `col` is permuted among the
    positive rows (shuffles accumulate from round to round) and the result is
    appended to the data together with the positive labels. The inputs are
    not modified. Prints the resulting label counts.

    Returns (x, y) with t * (number of positives) extra rows.
    '''
    mask = y > 0
    x1 = x[mask].copy()  # positive samples
    y1 = y[mask].copy()  # their labels (all positive, so shuffling is moot)

    # Collect the pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and appending inside the loop is quadratic in t.
    x_parts = [x]
    y_parts = [y]
    for _ in range(t):  # augment t times
        for j in col:
            # Assign a permuted copy instead of shuffling `.values` in place;
            # the in-place form depends on `.values` being a writable view.
            x1[j] = np.random.permutation(x1[j].to_numpy())
        x_parts.append(x1.copy())
        y_parts.append(y1)

    x = pd.concat(x_parts)
    y = pd.concat(y_parts)
    print(pd.Series(y).value_counts())
    return x,y

In [None]:
x_aug, y_aug = data_augment(x_train, y_train, 40, x_train.columns)

In [None]:
classifier_metrics(x_aug, y_aug, x_test)

It seems that the effect is not as good as that of SMOTE, which may be because we augmented data that was already one-hot encoded. If the data is augmented first and then one-hot encoded, the situation may be different.

In [None]:
def feature_eng(data, record):
    '''Build the un-encoded modelling table from the raw application and credit tables.

    Parameters
    ----------
    data : application table; must contain ID, DAYS_BIRTH, DAYS_EMPLOYED,
        FLAG_MOBIL and the raw columns renamed below.
    record : credit history table with one row per ID and month; must contain
        ID and STATUS.

    Steps
    -----
    1. target = 1 for every ID that has at least one STATUS of '2'-'5',
       else 0. Only IDs present in both tables are kept (inner join).
    2. age and work_years are derived by floor-dividing the negated day
       counts by 365.
    3. Raw columns are renamed to short names; rare categories of
       income_type, occupation and edu_type are merged.
    4. Exact duplicate rows and the raw day / FLAG_MOBIL columns are dropped.
    5. Negative work_years are treated as missing and replaced by the
       rounded mean of the remaining values.

    Neither input frame is modified. Prints the shape of the result.

    Returns the new DataFrame.
    '''
    # Per-ID label computed on a derived Series, so `record` is not mutated
    # and no chained assignment is needed.
    overdue = record['STATUS'].isin(['2', '3', '4', '5'])
    target = (overdue.groupby(record['ID']).any()
                     .astype('int64')
                     .rename('target')
                     .reset_index())

    new_data = data.copy()
    # Unary minus binds tighter than //: this is (-days) // 365.
    new_data['age'] = -(new_data['DAYS_BIRTH']) // 365
    new_data['work_years'] = -(new_data['DAYS_EMPLOYED']) // 365
    new_data = pd.merge(new_data, target, how = 'inner', on = 'ID')
    new_data = new_data.rename(columns={'CODE_GENDER':'gender','FLAG_OWN_CAR':'car','FLAG_OWN_REALTY':'reality',
                         'CNT_CHILDREN':'child_no','AMT_INCOME_TOTAL':'income',
                         'NAME_EDUCATION_TYPE':'edu_type','NAME_FAMILY_STATUS':'famliy_type',
                         'NAME_HOUSING_TYPE':'house_type','FLAG_EMAIL':'email',
                         'NAME_INCOME_TYPE':'income_type','FLAG_WORK_PHONE':'work_phone',
                         'FLAG_PHONE':'phone','CNT_FAM_MEMBERS':'famliy_size',
                        'OCCUPATION_TYPE':'occupation'
                        })

    # Merge rare income types into 'State servant'.
    new_data.loc[new_data['income_type'].isin(['Pensioner', 'Student']), 'income_type'] = 'State servant'

    # Collapse detailed occupations into three coarse groups; missing
    # occupations stay missing.
    occupation_groups = {
        'Laborwk': ['Cleaning staff', 'Cooking staff', 'Drivers', 'Laborers',
                    'Low-skill Laborers', 'Security staff', 'Waiters/barmen staff'],
        'officewk': ['Accountants', 'Core staff', 'HR staff', 'Medicine staff',
                     'Private service staff', 'Realty agents', 'Sales staff', 'Secretaries'],
        'hightecwk': ['Managers', 'High skill tech staff', 'IT staff'],
    }
    for group, members in occupation_groups.items():
        new_data.loc[new_data['occupation'].isin(members), 'occupation'] = group

    new_data.loc[new_data['edu_type']=='Academic degree','edu_type'] = 'Higher education'

    new_data = new_data.drop_duplicates(keep='first')
    new_data = new_data.drop(['DAYS_BIRTH','DAYS_EMPLOYED','FLAG_MOBIL'], axis=1)

    # Negative work_years (positive DAYS_EMPLOYED) are treated as missing,
    # then imputed. Assigning the result back avoids chained in-place fillna.
    work_years = new_data['work_years'].mask(new_data['work_years'] < 0)
    new_data['work_years'] = work_years.fillna(round(work_years.mean()))
    print('data has {} rows and {} columns'.format(new_data.shape[0], new_data.shape[1]))
    return new_data

# Build the un-encoded feature table from the raw CSV tables, persist it to
# the working directory, and show the number of distinct values per column.
dt = feature_eng(application_record, credit_record)
dt.to_pickle('data_no_onehot.pkl')
dt.nunique()

In [None]:
dt.columns

In [None]:
# Raw (not yet encoded) feature columns of `dt`; ID and target are excluded.
col_list = ['gender', 'car', 'reality', 'child_no', 'income', 'income_type',
           'edu_type', 'famliy_type', 'house_type', 'work_phone', 'phone', 'email',
           'occupation', 'famliy_size', 'age', 'work_years']

# Re-split on the raw table; this overwrites the earlier x_train/x_test/y_train/y_test.
x_train, x_test, y_train, y_test = split_dt(dt, col_list)

In [None]:
def OneHotEncoder_transfrom(data, col, fit_on=None):
    '''One-hot encode the columns `col` of `data`.

    Parameters
    ----------
    data : DataFrame to transform.
    col : list of column names to one-hot encode.
    fit_on : optional DataFrame the encoder is fitted on. Defaults to `data`
        itself. Pass the training frame when encoding a test frame so both
        are encoded by the same fitted mapping and end up with the same
        columns in the same order.

    Returns the transformed DataFrame.
    '''
    reference = data if fit_on is None else fit_on
    enc = ce.OneHotEncoder(cols = col).fit(reference)
    return enc.transform(data)

cat_cols = ['gender', 'car', 'reality', 'income_type', 'edu_type', 'famliy_type', 'house_type', 'work_phone', 'phone', 'email','occupation']
x_org = OneHotEncoder_transfrom(x_train, cat_cols)
# Fit on the training split: an encoder fitted on the test split alone may
# assign categories to different indicator columns, and classifier_metrics
# converts both frames to bare arrays, so any mismatch would go unnoticed.
x_test_org = OneHotEncoder_transfrom(x_test, cat_cols, fit_on=x_train)

In [None]:
classifier_metrics(x_org, y_train, x_test_org)

In [None]:
# Augment the raw training rows first, then one-hot encode.
x_aug, y_aug = data_augment(x_train, y_train, 40, x_train.columns)
# Fit a single encoder on the augmented training data and reuse it for the
# test split, so both frames share the same indicator columns in the same
# order (two independently fitted encoders do not guarantee that).
aug_encoder = ce.OneHotEncoder(cols = cat_cols).fit(x_aug)
x_aug = aug_encoder.transform(x_aug)
x_aug_test = aug_encoder.transform(x_test)
classifier_metrics(x_aug, y_aug, x_aug_test)

In [None]:
def auc_curve(y,prob):
    '''Plot the ROC curve of scores `prob` against true labels `y`.

    The area under the curve is shown in the legend and the chance diagonal
    is drawn as a dashed line. The figure is displayed; nothing is returned.
    '''
    # False-positive and true-positive rates; the thresholds are not needed.
    false_pos_rate, true_pos_rate, _ = roc_curve(y, prob)
    # Area under the ROC curve.
    area = auc(false_pos_rate, true_pos_rate)
    line_width = 2

    plt.figure(figsize=(6, 4.5))
    # ROC curve: false-positive rate on the x axis, true-positive rate on the y axis.
    plt.plot(false_pos_rate, true_pos_rate,
             color='darkorange',
             lw=line_width,
             label='ROC curve (area = %0.3f)' % area)
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    plt.show()

In [None]:
def run_lgb(x_train, y_train,x_test, y_test, n_folds, cat, threshold=0.413,
            shuffle=True, random_state=1024):
    '''Train LightGBM with K-fold CV and evaluate the fold-averaged prediction.

    Parameters
    ----------
    x_train, y_train : training DataFrame and label Series (indexed by position).
    x_test, y_test : held-out DataFrame and labels; `x_test` must contain all
        columns of `x_train`.
    n_folds : number of CV folds; one model is trained per fold.
    cat : list of categorical feature names passed to `lgb.Dataset`.
    threshold : probability cut-off for the positive class (default 0.413,
        the value previously hard-coded).
    shuffle, random_state : fold shuffling. Shuffling is on by default because
        the augmented training data has all synthetic positives appended at
        the end; contiguous folds would then contain a single class and make
        the validation AUC used for early stopping meaningless.

    Prints test-set metrics, plots the ROC curve, and returns the thresholded
    0/1 predictions for `x_test`.
    '''
    features = list(x_train.columns)

    params = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 16,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 20,
    'learning_rate': 0.1,
    'verbose': 0,
    'nthread': 4,
    }

    # Test-set probabilities, accumulated as the mean over the fold models.
    y_pred_proba = np.zeros(len(x_test))

    # LightGBM 4 dropped the early_stopping_rounds / verbose_eval keywords of
    # lgb.train in favour of callbacks; older releases lack log_evaluation.
    use_callbacks = hasattr(lgb, 'log_evaluation')

    # random_state may only be given to KFold together with shuffle=True.
    folds = KFold(n_folds, shuffle=shuffle,
                  random_state=random_state if shuffle else None)

    for fold, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train), start=1):
        print('Training fold {}'.format(fold))
        train_set = lgb.Dataset(x_train.iloc[trn_idx][features], 
                                y_train.iloc[trn_idx], 
                                categorical_feature = cat)
        
        val_set = lgb.Dataset(x_train.iloc[val_idx][features], 
                              y_train.iloc[val_idx], 
                              categorical_feature = cat)

        if use_callbacks:
            model = lgb.train(params, train_set,
                              num_boost_round = 300,
                              valid_sets = [train_set, val_set],
                              callbacks = [lgb.early_stopping(30),
                                           lgb.log_evaluation(50)])
        else:
            model = lgb.train(params, train_set,
                              num_boost_round = 300,
                              early_stopping_rounds = 30,
                              valid_sets = [train_set, val_set],
                              verbose_eval = 50)

        y_pred_proba += model.predict(x_test[features], 
                                    num_iteration = model.best_iteration)  / n_folds 

        print('-' * 50)
        print('\n')

    # Threshold once, on the complete fold average.
    y_pred = np.where(y_pred_proba > threshold, 1, 0)

    print('test set accuracy score is {:.5}'.format(accuracy_score(y_test, y_pred)))
    print('test set balanced accuracy score is {:.5}'.format(balanced_accuracy_score(y_test, y_pred)))
    print('test set roc_auc_score is {:.5}'.format(roc_auc_score(y_test, y_pred_proba)))
    print('test set log_loss is {:.5}'.format(log_loss(y_test, y_pred_proba)))
    print('test set precision_score is {:.5}'.format(precision_score(y_test, y_pred)))
    print('test set recall_score is {:.5}'.format(recall_score(y_test, y_pred)))
    print('test set f1_score is {:.5}'.format(f1_score(y_test, y_pred)))
    auc_curve(y_test, y_pred_proba)
    plt.show()
    return y_pred
 
_ = run_lgb(x_aug, y_aug, x_aug_test, y_test, 5, [])

In [None]:
x_aug, y_aug = data_augment(x_train, y_train, 40, x_train.columns)

In [None]:
cat_cols = ['gender', 'car', 'reality', 'income_type', 'edu_type', 'famliy_type', 'house_type', 'work_phone', 'phone', 'email','occupation']

def OrdinalEncoder_transfrom(data, col, fit_on=None):
    '''Ordinal-encode the columns `col` of `data`.

    Parameters
    ----------
    data : DataFrame to transform.
    col : list of column names to encode as integer codes.
    fit_on : optional DataFrame the encoder is fitted on. Defaults to `data`
        itself. Pass the training frame when encoding a test frame so that a
        given category receives the same integer code in both.

    Returns the transformed DataFrame.
    '''
    reference = data if fit_on is None else fit_on
    enc = ce.OrdinalEncoder(cols = col).fit(reference)
    return enc.transform(data)

# Keep a handle on the un-encoded training frame: the test split must be
# encoded with the mapping learned from it, not with a mapping of its own.
x_aug_raw = x_aug
x_aug = OrdinalEncoder_transfrom(x_aug_raw, cat_cols)
x_test = OrdinalEncoder_transfrom(x_test, cat_cols, fit_on=x_aug_raw)

In [None]:
_ = run_lgb(x_aug, y_aug, x_test, y_test, 5, cat_cols)