In [None]:
# 导入包
import numpy as np 
import pandas as pd 
import os
# Kaggle boilerplate: print the full path of every file under the input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)
import seaborn as sns 
import matplotlib.pyplot as plt

# Load the German credit dataset; the first CSV column becomes the row index.
dat = pd.read_csv(
    filepath_or_buffer='../input/german-credit-data-with-risk/german_credit_data.csv',
    index_col=0,
)

In [None]:
# 安装评分卡模型算法包
!pip install scorecardpy

In [None]:
# 导入评分卡模型算法包
import scorecardpy as sc

In [None]:
# Preview the first five rows of the raw data.
dat.head(n=5)

In [None]:
# Run scorecardpy's variable filter with "Risk" as the target column.
# NOTE(review): the original comment states this step also recodes the target
# to 0/1 — confirm against the scorecardpy documentation.
dt_s = sc.var_filter(dt=dat, y='Risk')

In [None]:
# Preview the first five rows of the filtered data.
dt_s.head(n=5)

In [None]:
# Separate the filtered data into predictors (X) and the target (y).
# y is kept as a one-column DataFrame, not a Series.
X = dt_s.drop(columns='Risk')
y = dt_s[['Risk']]

In [None]:
# Split the filtered data into train and test partitions on the "Risk" target.
# NOTE(review): the original comment states a 7:3 ratio; no ratio is passed here,
# so this relies on split_df's default — confirm.
train, test = sc.split_df(
    dt=dt_s,
    y='Risk',
).values()

In [None]:
# Show the (rows, columns) shape of each partition, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# WOE binning.
# Fit the bins on the training partition only: fitting on the full dataset
# (dt_s) would let the held-out test rows and their labels influence the bin
# boundaries and WOE values, which inflates the test-set KS/ROC reported later.
bins = sc.woebin(train, y="Risk")

In [None]:
# Plot the binning result of every variable.
sc.woebin_plot(bins=bins)

In [None]:
# Apply the fitted bins to both partitions, train first and then test.
train_woe, test_woe = (sc.woebin_ply(part, bins) for part in (train, test))

In [None]:
# Split the WOE-transformed train/test frames into predictors (X) and target (y).
y_train = train_woe.loc[:, 'Risk']
X_train = train_woe.loc[:, train_woe.columns != 'Risk']
y_test = test_woe.loc[:, 'Risk']
# Select the test features by the training feature names. The original indexed
# test_woe with a boolean mask built from train_woe.columns, which silently picks
# the wrong columns if the two frames ever differ in column order. Selecting by
# name also guarantees the model sees features in the order it was fitted on,
# and raises KeyError if a training feature is missing from the test frame.
X_test = test_woe.loc[:, X_train.columns]

In [None]:
# Fit an L1-regularised logistic regression on the WOE-transformed training data.
from sklearn.linear_model import LogisticRegression
# The 'saga' solver shuffles the data, so without a fixed random_state the fitted
# coefficients (and the scorecard built from them) change from run to run.
# The seed value itself is arbitrary.
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1, random_state=42)
lr.fit(X_train, y_train)
# Show the fitted coefficients and intercept.
print(lr.coef_)
print(lr.intercept_)

In [None]:
# Predicted probability of the positive class (second predict_proba column)
# for the train and test partitions.
# NOTE(review): the original comment calls this the probability of a "bad"
# sample — this holds only if "bad" is encoded as 1; confirm.
train_pred, test_pred = (
    lr.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

In [None]:
# Evaluate the model on each partition with scorecardpy's perf_eva.
# NOTE(review): the original comment says this shows KS and ROC — confirm
# against perf_eva's default plot types.
train_perf = sc.perf_eva(label=y_train, pred=train_pred, title="train")
test_perf = sc.perf_eva(label=y_test, pred=test_pred, title="test")

In [None]:
# Build the scorecard from the bins and the fitted model.
card = sc.scorecard(bins, lr, X_train.columns)
# Score the train partition, the test partition and the full raw dataset with
# identical options; only_total_score=False is passed so the result is not
# restricted to the total score.
train_score, test_score, score = (
    sc.scorecard_ply(
        frame,
        card,
        only_total_score=False,
        print_step=0,
        replace_blank_na=True,
        var_kp=None,
    )
    for frame in (train, test, dat)
)

In [None]:
# Print a heading, then display the scorecard as the cell output.
print('评分卡:')
card

In [None]:
# Display the scores computed for the full dataset.
score

In [None]:
# Model stability metric (PSI), comparing the train and test score
# distributions together with their labels.
sc.perf_psi(
    score=dict(train=train_score, test=test_score),
    label=dict(train=y_train, test=y_test),
)