In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
plt.xkcd()

In [None]:
print(os.listdir("../input/")) # list every file available in the input directory

In [None]:
# Read the training and test application tables from the input directory.
train_applic,test_applic=(
    pd.read_csv(csv_path)
    for csv_path in ("../input/application_train.csv","../input/application_test.csv")
)

In [None]:
train_applic.head()

In [None]:
test_applic.head()

In [None]:
train_applic.shape

In [None]:
test_applic.shape

先对比一下训练数据和测试数据的 shape。

### 数据可视化分析探究 Exploratory data analysis 

In [None]:
train_applic["TARGET"].value_counts() # count how many rows fall into each target class

In [None]:
train_applic["TARGET"].plot.hist() # histogram showing the frequency of each target value

In [None]:
train_applic["TARGET"].astype(int).plot.hist() # same frequency histogram, with the target cast to int first

In [None]:
train_applic["TARGET"].astype(int).value_counts()

从可视化的结果来看，目标类别的数量非常不均衡（imbalanced）。处理不均衡数据
有两种方法：
- 1. cost function based approaches
- 2. sampling based approaches
   - oversampling
   - undersampling
   - hybrid (SMOTE)


> ## 处理数据的流程
- 读取数据
- 数据可视化（分类问题看看目标数量是否不均衡）
- 分析目标
- 检查特征是否有缺失值
- 填充缺失值

In [None]:
#定义函数，用来查找数据是否存在缺失值
def find_missing_value(df):
    """Summarise the missing values of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that contains at least one missing value, indexed
        by column name. Column ``0`` holds the number of missing values and
        column ``1`` the percentage of rows that are missing; rows are sorted
        by percentage in descending order. The table is empty when nothing is
        missing, including when ``df`` has no rows at all.
    """
    missing_value=df.isnull().sum()  # number of missing values per column
    # Percentage of rows that are missing; for a zero-row frame this is 0/0 = NaN.
    missing_value_percent=missing_value*100/len(df)
    # Put counts (column 0) and percentages (column 1) side by side.
    missing_value_table=pd.concat([missing_value,missing_value_percent],axis=1)
    # Filter on the count rather than the percentage: a NaN percentage compares
    # != 0, which would wrongly report every column of an empty frame as missing.
    has_missing=missing_value_table.iloc[:,0]!=0
    missing_value_table=missing_value_table[has_missing].sort_values(by=[1],ascending=False)

    return missing_value_table

In [None]:
missing_values=find_missing_value(train_applic)
missing_values.head(20)

- 将训练数据集中的缺失值都找出来了。
- 找到缺失值后，需要将其填补。
    - 填补缺失值的方式： sklearn.preprocessing.Imputer(missing_values='NaN', strategy="mean", axis=0)，其中 strategy 可取 mean / median / most_frequent

## 训练数据中，特征的种类调查


In [None]:
train_applic.dtypes.value_counts() # how many columns there are of each dtype

object 代表的是类别种类，属于字符串形式。需要特殊处理才能被模型处理。
float int 代表数值型，好处理。

In [None]:
train_applic.select_dtypes("object").apply(pd.Series.nunique,axis=0)
# number of distinct categories in each object column (nunique returns the count of distinct values)

## 根据类别进行编码。
- 编码方法：
    - label encoding 适合只有两个类别的特征
    - one-hot encoding 适合多类别的特征，但是会增加维度，常与 PCA 一起使用来降维

In [None]:
# Label-encode every object column that has at most two distinct values.
# Reminder: Series.nunique() returns the number of distinct values,
#           Series.unique() returns the distinct values themselves.
olecon=LabelEncoder()
label_count=0

for col in train_applic:
    # Skip numeric columns and categoricals with more than two levels.
    if train_applic[col].dtype!="object":
        continue
    if train_applic[col].nunique(dropna=False)>2:
        continue
    # Fit on the training column only, then apply the same mapping to both frames.
    train_applic[col]=olecon.fit(train_applic[col]).transform(train_applic[col])
    test_applic[col]=olecon.transform(test_applic[col])
    label_count+=1
print(label_count)
        

In [None]:
# One-hot encoding: pd.get_dummies turns the remaining categorical columns
# into numeric indicator columns, for the train and test frames alike.
train_applic,test_applic=(pd.get_dummies(frame) for frame in (train_applic,test_applic))

In [None]:
train_applic.head(5)

In [None]:
train_applic.dtypes.value_counts()

In [None]:
train_applic.shape

In [None]:
test_applic.shape

# 经过编码之后，训练集和测试集的列数不再一致，因此需要将两者的列对齐。

In [None]:
train_labels=train_applic["TARGET"]

In [None]:
# Keep only the columns present in both frames (inner join on the column axis).
# NOTE(review): this presumably drops TARGET from train_applic as well, which is
# why it is saved in train_labels beforehand and restored afterwards — confirm.
train_applic,test_applic=train_applic.align(test_applic,join="inner",axis=1)


In [None]:
train_applic.shape

In [None]:
train_applic.head(2)

In [None]:
test_applic.shape

In [None]:
test_applic.head(2)

In [None]:
# Put back the target column that was saved in train_labels before the alignment.
train_applic["TARGET"]=train_labels

In [None]:
train_applic.head(2)

In [None]:
print(train_applic['DAYS_BIRTH'].head(),train_applic['DAYS_EMPLOYED'].head())  # DAYS_BIRTH values are negative

In [None]:
print(train_applic['DAYS_ID_PUBLISH'].head(),train_applic['DAYS_REGISTRATION'].head())  # NOTE(review): the original comment here repeated "birthdays are negative"; these are other DAYS_* columns — presumably also negative offsets, verify from the output

In [None]:
(train_applic["DAYS_BIRTH"]/-365).describe()

In [None]:
train_applic["DAYS_EMPLOYED"].head()

In [None]:
train_applic["DAYS_EMPLOYED"].describe() # the maximum corresponds to more than 100 years of employment, i.e. an implausible value

In [None]:
# Histogram of DAYS_EMPLOYED on the training set (axis label typo "emplyment" fixed).
train_applic['DAYS_EMPLOYED'].plot.hist(title="工作时间")
plt.xlabel("days employment")

In [None]:
# Separate the rows whose DAYS_EMPLOYED equals 365243 from the rest and plot
# the remainder: without that value the distribution looks far more sensible,
# which suggests the value is an anomaly.
anomalous_mask=train_applic["DAYS_EMPLOYED"]==365243
max_days_emp=train_applic[anomalous_mask]
exclu_max_days_emp=train_applic[~anomalous_mask]
exclu_max_days_emp["DAYS_EMPLOYED"].plot.hist()

In [None]:
# Treat the anomalous value 365243 as missing in the training set.
# The result is assigned back instead of calling replace(inplace=True) on the
# column selection: under pandas Copy-on-Write that in-place call only changes a
# temporary object and leaves train_applic untouched.
train_applic['DAYS_EMPLOYED']=train_applic['DAYS_EMPLOYED'].replace({365243:np.nan})

In [None]:
train_applic['DAYS_EMPLOYED'].plot.hist()

In [None]:
test_applic['DAYS_EMPLOYED'].plot.hist()

In [None]:
# Treat the anomalous value 365243 as missing in the test set.
# The result is assigned back instead of calling replace(inplace=True) on the
# column selection: under pandas Copy-on-Write that in-place call only changes a
# temporary object and leaves test_applic untouched.
test_applic['DAYS_EMPLOYED']=test_applic['DAYS_EMPLOYED'].replace({365243:np.nan})

In [None]:
test_applic['DAYS_EMPLOYED'].plot.hist()

# 处理完标签和异常值之后，现在来进行相关性探索

In [None]:
train_applic.shape

In [None]:
test_applic.shape

In [None]:
# train_applic.isnull().sum().sort_values(ascending=False)

In [None]:
# test_applic.isnull().sum().sort_values(ascending=False)

In [None]:
# Correlation of every column with TARGET, largest (most positive) first.
correlations=(
    train_applic
    .corr()
    .loc[:,'TARGET']
    .sort_values(ascending=False)
)

In [None]:
correlations.head(15)

和目标相关性很大的几个变量：
-  days-birth
- days-employed
- region-rating-client
- incoming 
..

In [None]:
# Convert DAYS_BIRTH to a positive number of days.
# The same transformation is applied to the test set: otherwise the test frame
# keeps negative values while the scaler and models are fitted on positive
# ones, so this feature would mean something different at prediction time.
train_applic['DAYS_BIRTH']=abs(train_applic['DAYS_BIRTH'])
test_applic['DAYS_BIRTH']=abs(test_applic['DAYS_BIRTH'])

In [None]:
train_applic['DAYS_BIRTH'].corr(train_applic['TARGET'])

In [None]:
train_applic['TARGET'].corr(train_applic['DAYS_BIRTH'])

年龄与目标呈负相关：年龄越小，目标越接近 1，说明年龄越小的客户越有可能无法按时偿还贷款。

In [None]:
train_applic[['TARGET','DAYS_BIRTH']].head()

In [None]:
extra_data=train_applic[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']]
extra_data_corrs=extra_data.corr()

In [None]:
extra_data_corrs  #相关性矩阵

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(extra_data_corrs)
plt.title("correlation heatmap")

# 特征工程
- 多项式特征
- 专业知识构成特征

In [None]:
# Inputs for the polynomial expansion: the three external scores and DAYS_BIRTH.
# The training selection additionally carries TARGET as its last column.
poly_features_test=test_applic[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]
poly_features_train=train_applic[list(poly_features_test.columns)+['TARGET']]

In [None]:
poly_features_train.head()

In [None]:
poly_features_test.head()

In [None]:
# sklearn.preprocessing.Imputer was removed from scikit-learn (0.22);
# sklearn.impute.SimpleImputer is its replacement. Prefer the new class and
# fall back to the old one on installations that predate it.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
from sklearn.preprocessing import PolynomialFeatures

imputer=Imputer(strategy='median')

# Set the target aside so it is neither imputed nor expanded.
poly_traget=poly_features_train['TARGET']
poly_features_train=poly_features_train.drop(columns=['TARGET'])

# Learn the medians on the training data only and reuse them for the test
# data; re-fitting on the test set would impute it with different statistics.
poly_features_train=imputer.fit_transform(poly_features_train)
poly_features_test=imputer.transform(poly_features_test)

poly_transformer = PolynomialFeatures(degree=2)

In [None]:
# Fit the polynomial transformer on the training matrix, then expand both the
# training and the test matrix with that same fitted transformer.
poly_transformer.fit(poly_features_train)
poly_features_train=poly_transformer.transform(poly_features_train)
poly_features_test=poly_transformer.transform(poly_features_test)


In [None]:
poly_features_train.shape

In [None]:
poly_features_test.shape

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
poly_names_fn=(poly_transformer.get_feature_names_out
               if hasattr(poly_transformer,'get_feature_names_out')
               else poly_transformer.get_feature_names)
list(poly_names_fn(input_features=['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']))[:16]

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
poly_names_fn=(poly_transformer.get_feature_names_out
               if hasattr(poly_transformer,'get_feature_names_out')
               else poly_transformer.get_feature_names)

# Wrap the expanded training matrix in a DataFrame with readable column names
# and re-attach the target that was set aside before imputation.
poly_features_train=pd.DataFrame(poly_features_train,columns=poly_names_fn(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']))
poly_features_train['TARGET']=poly_traget

# Correlation of each polynomial feature with the target, ascending.
poly_corrs=poly_features_train.corr()['TARGET'].sort_values()

In [None]:
poly_corrs

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
poly_names_fn=(poly_transformer.get_feature_names_out
               if hasattr(poly_transformer,'get_feature_names_out')
               else poly_transformer.get_feature_names)

# Wrap the expanded test matrix in a DataFrame with the same column names as the training one.
poly_features_test=pd.DataFrame(poly_features_test,columns=poly_names_fn(['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']))

In [None]:
poly_features_train.shape

In [None]:
poly_features_test.shape

In [None]:
# Attach the applicant id (the assignment aligns on the row index), then
# left-merge the polynomial features onto the training frame by SK_ID_CURR.
# NOTE(review): poly_features_train repeats EXT_SOURCE_1..3, DAYS_BIRTH and
# TARGET, which train_applic also has, so the merge presumably yields
# duplicated _x/_y copies of those columns — confirm, and consider dropping
# the overlap before merging.
poly_features_train['SK_ID_CURR']=train_applic['SK_ID_CURR']
app_poly_features_train=train_applic.merge(poly_features_train,on="SK_ID_CURR",how='left')


In [None]:
# Attach the applicant id (the assignment aligns on the row index), then
# left-merge the polynomial features onto the test frame by SK_ID_CURR.
# NOTE(review): poly_features_test repeats EXT_SOURCE_1..3 and DAYS_BIRTH,
# which test_applic also has, so the merge presumably yields duplicated _x/_y
# copies of those columns — confirm.
poly_features_test['SK_ID_CURR']=test_applic['SK_ID_CURR']
app_poly_features_test=test_applic.merge(poly_features_test,on="SK_ID_CURR",how='left')



In [None]:
app_poly_features_train.shape

In [None]:
app_poly_features_test.shape

In [None]:
# Keep only the columns present in both merged frames (inner join on the column axis).
app_poly_features_train,app_poly_features_test=app_poly_features_train.align(app_poly_features_test,join='inner',axis=1)

In [None]:
app_poly_features_train.shape

In [None]:
app_poly_features_test.shape

In [None]:
app_poly_features_train.head()

In [None]:
app_poly_features_test.head()

In [None]:
train_applic_poly=app_poly_features_train
test_applic_poly=app_poly_features_test

In [None]:
# Re-attach the target (saved in poly_traget) to the aligned training frame.
train_applic_poly['TARGET']=poly_traget

In [None]:
train_applic_poly.head()

# 最后检查数据

In [None]:
# train_applic_poly.isnull().sum()

In [None]:
# train_applic_poly.dtypes.value_counts()

In [None]:
# test_applic_poly.shape

In [None]:
# test_applic_poly.dtypes.value_counts()

In [None]:
# train_applic_poly.isnull().sum().sort_values(ascending=False)

In [None]:
train_applic_poly.head()

## 特别说明：
    1.所有的字符特征都进行了编码
    2.但是不是所有的null都进行了填充
     3.要是用 imputer 把所有的 null 填满，会得到 ndarray；把 ndarray 转回 DataFrame 时列名会丢失，需要重新指定。
     4.用fillna()更方便

In [None]:
# train_applic_poly=imputer.fit_transform(train_applic_poly)
# test_applic_poly=imputer.fit_transform(test_applic_poly)

In [None]:
# train_applic_poly.head()


In [None]:
# test_applic_poly.shape

In [None]:
# Fill every remaining missing value in the training frame with the median of its column.
train_applic_poly.fillna(train_applic_poly.median(),inplace=True)

In [None]:
train_applic_poly.head()

In [None]:
# Fill the missing values of the test frame with the TRAINING medians, so both
# frames are imputed with the same statistics (the scaler below is likewise
# fitted on the training data only). fillna matches the medians by column
# name, so the extra TARGET entry is simply ignored.
test_applic_poly.fillna(train_applic_poly.median(),inplace=True)

In [None]:
train_applic_poly.shape

In [None]:
test_applic_poly.shape

## 使用模型
    1.逻辑回归
    2.随机森林
    3.决策树
    4.支持向量机

    1.逻辑回归

In [None]:
    from sklearn.preprocessing import MinMaxScaler
    # sklearn.preprocessing.Imputer was removed from scikit-learn (0.22), so the
    # combined import fails on current versions. Keep the name Imputer bound,
    # preferring its replacement SimpleImputer.
    try:
        from sklearn.impute import SimpleImputer as Imputer
    except ImportError:
        from sklearn.preprocessing import Imputer

In [None]:
# train=train_applic_poly.copy()
# test=test_applic_poly.copy()

In [None]:
# Scaling notes:
#   1. only the features are scaled
#   2. the target is never scaled
test=test_applic_poly.copy()
taeget=train_applic_poly['TARGET']
train=train_applic_poly.drop(columns='TARGET')

In [None]:
train.shape

In [None]:
test.shape

In [None]:
# Min-max scaling to [0, 1]: the scaler is fitted on the training features
# only and then applied to both the training and the test features.
scaler=MinMaxScaler(feature_range=(0,1)).fit(train)
train,test=scaler.transform(train),scaler.transform(test)



In [None]:
train.shape

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline with C=0.0001.
log_reg=LogisticRegression(C=0.0001)
log_reg.fit(train,train_labels)  # original note: for the binary classifier the true label is not part of the feature matrix; it is passed separately as train_labels

In [None]:
# log_reg_pred=log_reg.predict_proba(test)

In [None]:
# Keep only the second column of predict_proba (probability of class 1).
log_reg_pred=log_reg.predict_proba(test)[:,1]

# note: train and test have both become numpy arrays by this point

In [None]:
# Build the submission frame in one step. assign() returns a new DataFrame, so
# no column is written onto a slice of test_applic_poly (the pattern that
# triggers SettingWithCopyWarning).
submit=test_applic_poly[['SK_ID_CURR']].assign(TARGET=log_reg_pred)

In [None]:
submit.to_csv('log_reg_baseline.csv',index=False)

# 随机森林

In [None]:
from sklearn.ensemble import RandomForestClassifier

random_forest=RandomForestClassifier(n_estimators=100,random_state=50,verbose=1,n_jobs=-1)


In [None]:
random_forest.fit(train,train_labels)

In [None]:
train_labels

In [None]:
# Class-1 probabilities predicted by the random forest for the test set.
predictions=random_forest.predict_proba(test)[:,1]

# Pair every feature name (all columns except TARGET) with its importance.
features=train_applic_poly.columns.drop('TARGET').tolist()
important_feature_values=random_forest.feature_importances_
feature_importances=pd.DataFrame({'feature':features,'importance':important_feature_values})

In [None]:
# Build the submission frame in one step. assign() returns a new DataFrame, so
# no column is written onto a slice of test_applic_poly (the pattern that
# triggers SettingWithCopyWarning).
submit=test_applic_poly[['SK_ID_CURR']].assign(TARGET=predictions)
submit.to_csv('random_forest_baseline.csv',index=False)

In [None]:
feature_importances.sort_values(by='importance',ascending=False)

In [None]:
# Plot only the 15 most important features, labelled by feature name.
# Plotting the whole frame draws one bar per feature keyed by row number,
# which makes the chart unreadable.
feature_importances.sort_values(by='importance',ascending=False).head(15).plot.bar(x='feature',y='importance',legend=False)