In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

这个notebook尝试对输入特征进行特征工程

# 1 读入数据
## 1.1 概览

In [None]:
# Train and test CSVs use their first column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/kaggle/input/GiveMeSomeCredit/cs-training.csv',
        '/kaggle/input/GiveMeSomeCredit/cs-test.csv',
    )
)
# The sample submission is read with the default index.
sample = pd.read_csv('/kaggle/input/GiveMeSomeCredit/sampleEntry.csv')

train.shape, test.shape, sample.shape

In [None]:
# Shorten the column names; the same 11 names are applied to both frames.
short_names = [
    'SeriousDlqin2yrs', 'revolve', 'age', '30-59', 'DebtRatio', 'MonthlyIncome',
    'NofCredit', '90+', 'NofRealLoan', '60-89', 'NofDependents',
]
train.columns = short_names
test.columns = short_names
train.describe()

## 1.2 离群点检测

In [None]:
from collections import Counter

# Outlier detection 
# 任何超过n个属性离群的样本点将被筛选出来

def detect_outliers(df,n,features):
    """
    Return the index labels of rows that are Tukey outliers in more than
    ``n`` of the given columns.

    A value is an outlier for a column when it lies below Q1 - 1.5*IQR or
    above Q3 + 1.5*IQR, where Q1/Q3 are that column's 25th/75th percentiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan.
    n : int
        A row is returned only if it is an outlier in strictly more than
        ``n`` of ``features``.
    features : iterable of str
        Numeric column names to check.

    Returns
    -------
    list
        Index labels of the selected rows, ordered by first detection.
    """
    outlier_counts = Counter()

    for col in features:
        # NaN-aware quartiles: np.percentile returns NaN as soon as the column
        # contains a missing value, which makes both comparisons below False
        # and hides every outlier of that column.
        q1, q3 = np.nanpercentile(df[col].to_numpy(dtype=float), [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        # Missing values compare False on both sides, so they are never flagged.
        is_outlier = (df[col] < q1 - outlier_step) | (df[col] > q3 + outlier_step)
        outlier_counts.update(df[is_outlier].index)

    # Keep rows flagged in more than n columns.
    return [label for label, hits in outlier_counts.items() if hits > n]

# Collect training rows flagged as Tukey outliers in more than 2 of the three
# listed columns, i.e. in all of revolve, DebtRatio and MonthlyIncome.
Outliers_to_drop = detect_outliers(train,2,["revolve","DebtRatio","MonthlyIncome"])

In [None]:
# Display the training rows selected by the outlier detection step.
train.loc[Outliers_to_drop]

In [None]:
# Remove the flagged rows, then renumber the remaining rows from 0.
train = (
    train
    .drop(index=Outliers_to_drop)
    .reset_index(drop=True)
)

## 1.3 切分Xy， 联合处理test和train

In [None]:
# Column 0 is the target; every remaining column is a feature.
y_train = train.iloc[:, 0]
X_train, X_test = (frame.iloc[:, 1:] for frame in (train, test))
X_train.shape, y_train.shape, X_test.shape

In [None]:
# Stack train features on top of test features under a fresh 0..N-1 index.
X_all = pd.concat([X_train, X_test], axis=0, ignore_index=True)
X_all

## 1.4 处理缺失
有缺失值的属性是
* 收入 MonthlyIncome 
* 赡养人数 NofDependents（原列名 NumberOfDependents）

In [None]:
# Print dtypes and non-null counts, then show the per-column missing-value totals.
X_all.info()
X_all.isna().sum()

* 通过 KNN 回归预测来补全缺失的 MonthlyIncome，其余属性作为输入特征
* 但赡养人数 NofDependents 也有缺失，先去掉含缺失值的行，得到所有属性均已知的训练集


In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Build all_known: the rows where both imputation targets are observed. It is
# the shared source for the training sets of the two imputers below.
# The masks must be combined element-wise with `&`; Python's `and` applied to
# two lists simply evaluates to the second list, which would silently drop the
# NofDependents condition.
all_known_index = (X_all.NofDependents.notnull() & X_all.MonthlyIncome.notnull()).values
all_known = X_all[all_known_index]

# Per-column missing-value counts of the filtered frame.
all_known.isnull().sum()

In [None]:
# Training set for the income imputer: fully known rows, MonthlyIncome as target.
income_features = all_known.drop(columns='MonthlyIncome')
X_train_income = income_features.values
y_train_income = all_known['MonthlyIncome'].values

# Rows whose income has to be predicted.
income_unknown = X_all[X_all.MonthlyIncome.isnull()]
# KNN cannot take NaN inputs, so remaining gaps are filled with the mean of NofDependents.
dependents_mean = X_all['NofDependents'].mean()
X_test_income = (
    income_unknown
    .drop(columns='MonthlyIncome')
    .fillna(dependents_mean)
    .values
)

In [None]:
# Shapes of the income-imputation training features, training targets and rows to predict.
X_train_income.shape, y_train_income.shape, X_test_income.shape

In [None]:
# Fit a default KNN regressor on the known incomes and predict the missing ones.
knn = KNeighborsRegressor().fit(X_train_income, y_train_income)
y_pred_income = knn.predict(X_test_income)
# Range of the predicted incomes as a quick sanity check.
y_pred_income.min(), y_pred_income.max()

In [None]:
# Write the predicted incomes into the rows where MonthlyIncome is missing.
income_missing = X_all.MonthlyIncome.isnull()
X_all.loc[income_missing, 'MonthlyIncome'] = y_pred_income
X_all.isna().sum()

如法炮制填充 NofDependents

In [None]:
# Training set for the dependents imputer: fully known rows, NofDependents as target.
depend_features = all_known.drop(columns='NofDependents')
X_train_depend = depend_features.values
y_train_depend = all_known['NofDependents'].values

# Rows whose NofDependents has to be predicted.
depend_missing = X_all.NofDependents.isnull()
depend_unknown = X_all[depend_missing]
X_test_depend = depend_unknown.drop(columns='NofDependents').values

# Fit a default KNN regressor and predict the missing values.
knn = KNeighborsRegressor().fit(X_train_depend, y_train_depend)
y_pred_depend = knn.predict(X_test_depend)

# Write the predictions back and re-check the missing-value counts.
X_all.loc[depend_missing, 'NofDependents'] = y_pred_depend
X_all.isna().sum()

In [None]:
# Copy the imputed columns from the stacked frame back into the train/test frames.
# X_all holds the X_train rows followed by the X_test rows under a fresh 0..N-1
# index, so it must be split by position (iloc). A label slice such as
# `.loc[-k:]` selects every row, and index-aligned assignment would then write
# training rows into X_test, whose index labels come from the test CSV.
imputed_cols = ['MonthlyIncome','NofDependents']
n_train = X_train.shape[0]
assert X_all.shape[0] == n_train + X_test.shape[0], "X_all must be X_train stacked on X_test"

# Use explicit copies so the column assignment does not target a slice of train/test.
X_train = X_train.copy()
X_test = X_test.copy()

# .to_numpy() discards X_all's index, so values are assigned by position, not by label.
X_train[imputed_cols] = X_all.iloc[:n_train][imputed_cols].to_numpy()
train[imputed_cols] = X_train[imputed_cols]

X_test[imputed_cols] = X_all.iloc[n_train:][imputed_cols].to_numpy()
test[imputed_cols] = X_test[imputed_cols]

In [None]:
# Shapes of the train and test feature frames after the imputed columns were written back.
X_train.shape, X_test.shape

In [None]:
# Per-column missing-value counts for the train and test feature frames.
X_train.isnull().sum(), X_test.isnull().sum()

# 2 特征观察

In [None]:
# Print column dtypes and non-null counts of the training frame.
train.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Correlation matrix of the training frame (target included), drawn as an annotated heatmap.
corr_matrix = train.corr()
g = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")

* 几个负债日期相关性较大
* 年龄 几个负债时间属性 对结果影响较大
* NofCredit 和 NofRealLoan 本身相关性大

In [None]:
# Drop the columns judged redundant or irrelevant from both feature frames.
dropped_features = ['revolve', '90+', '60-89']
X_train = X_train.drop(columns=dropped_features)
X_test = X_test.drop(columns=dropped_features)

In [None]:
# Correlation heatmap of the remaining training features.
feature_corr = X_train.corr()
g = sns.heatmap(feature_corr, annot=True, fmt=".2f", cmap="coolwarm")

# 3 上模型

In [None]:
from sklearn.metrics import roc_auc_score
# Collects [model name, validation AUC] rows for the model comparison table.
form = []

In [None]:
# split for validation
from sklearn.model_selection import train_test_split

# Hold out a validation part of the training data; the seed fixes the split.
X_learn, X_valid, y_learn, y_valid = train_test_split(
    X_train,
    y_train,
    random_state=0,
)
tuple(part.shape for part in (X_learn, X_valid, y_learn, y_valid))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Grid-search the number of trees for a random forest, scored by ROC AUC.
# The forest is seeded so that re-running the notebook selects the same
# configuration; an unseeded forest gives different scores on every run.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid={
        'n_estimators':[300,500,800]
    },
    scoring='roc_auc',
    verbose=3
)

grid.fit(X_learn, y_learn)
# Print every entry of the cross-validation report.
for result in grid.cv_results_:
    print(result, grid.cv_results_[result])
grid.best_params_['n_estimators']


In [None]:
# Refit the forest with the selected number of trees and score it on the
# held-out validation part. Seeded so the reported AUC is reproducible.
clf = RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'], random_state=0)
clf.fit(X_learn, y_learn)
y_pred = clf.predict_proba(X_valid)[:,1]
score = roc_auc_score(y_valid, y_pred)
print(score)

# Record the result so the comparison table covers every model that was tried.
form.append(['RandomForestClassifier_{}'.format(grid.best_params_['n_estimators']),score])

In [None]:
import lightgbm as lgb

# Grid-search the number of boosting rounds for LightGBM, scored by ROC AUC.
lgb_param_grid = {'n_estimators': range(40,60)}
lgb_grid = GridSearchCV(
    estimator=lgb.LGBMClassifier(),
    param_grid=lgb_param_grid,
    scoring='roc_auc',
    verbose=1,
)

lgb_grid.fit(X_learn, y_learn)
# Print every entry of the cross-validation report.
for key, values in lgb_grid.cv_results_.items():
    print(key, values)
lgb_grid.best_params_['n_estimators']


In [None]:
# Refit LightGBM with the selected number of rounds and score it on the validation part.
best_n = lgb_grid.best_params_['n_estimators']
lgb_clf = lgb.LGBMClassifier(n_estimators=best_n).fit(X_learn, y_learn)
y_pred = lgb_clf.predict_proba(X_valid)[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

# Record the result for the comparison table.
form.append(['LGBMClassifier_{}'.format(best_n), score])


In [None]:
# Order the recorded results by AUC, best first, and show them as a table.
form.sort(key=lambda row: row[1], reverse=True)
fm = pd.DataFrame(data=form, columns=['Model', 'AUC'])
fm

# 4 针对最优模型调参

In [None]:
# Joint search over the number of rounds and the learning rate for LightGBM.
tuning_grid = {
    'n_estimators': range(50,60),
    'learning_rate': [0.05,0.1,0.15,0.2,0.25,0.3],
}
grid = GridSearchCV(
    estimator=lgb.LGBMClassifier(),
    param_grid=tuning_grid,
    scoring='roc_auc',
    verbose=1,
)
grid.fit(X_learn,y_learn)
best = grid.best_params_
best['n_estimators'], best['learning_rate']

# 5 生成结果 制作文件

In [None]:
# Train the tuned LightGBM model on the full training set and predict
# the positive-class probability for every test row.
best_params = grid.best_params_
clf = lgb.LGBMClassifier(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
).fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:, 1]

In [None]:
# Reload the sample submission as the template for the output file.
# The absolute path matches the one used when the data was first loaded, so the
# cell no longer depends on the notebook's current working directory.
sample = pd.read_csv('/kaggle/input/GiveMeSomeCredit/sampleEntry.csv')
sample.head()

In [None]:
# Put the predictions into the template, write the submission file, and read it
# back to confirm what was written.
submit_path = './submit.csv'
sample['Probability'] = y_pred
sample.to_csv(submit_path, index=False)
reload = pd.read_csv(submit_path)
reload