## HR analysis
### 标注是left，离散值，所以属于分类问题
基于此前对 HR.csv 的特征探索分析，这里直接进行建模。

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as ss
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# 验证数据的准确率，召回率，F值
from sklearn.metrics import accuracy_score, recall_score, f1_score
%matplotlib inline

### 特征处理

In [2]:
def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False, wa=False,
                     pl=False, dep=False, sal=False, lower_d=False, ld_n=1, csv_path="HR.csv"):
    """Load the HR table, clean it, and scale/encode every feature column.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl : bool
        Scaler choice for ``satisfaction_level``, ``last_evaluation``,
        ``number_project``, ``average_monthly_hours``, ``time_spend_company``,
        ``Work_accident`` and ``promotion_last_5years`` (in that order).
        ``False`` applies ``MinMaxScaler``; ``True`` applies ``StandardScaler``.
    dep, sal : bool
        Encoding choice for ``department`` and ``salary``.
        ``False`` maps the column to integer codes (``salary`` through the fixed
        low/medium/high ranking, ``department`` through ``LabelEncoder``) and
        then min-max scales the codes; ``True`` one-hot encodes the column with
        ``pd.get_dummies``.
    lower_d : bool
        When ``True`` the processed features are reduced with PCA.
    ld_n : int
        ``n_components`` passed to PCA when ``lower_d`` is ``True``.
    csv_path : str
        Location of the CSV file to read. Defaults to ``"HR.csv"``.

    Returns
    -------
    (features, label)
        ``features`` is a ``DataFrame`` of processed columns, or a NumPy array
        of shape ``(n_rows, ld_n)`` when ``lower_d`` is ``True``.
        ``label`` is the ``left`` column as a ``Series``.

    Notes
    -----
    Every scaler (and the optional PCA) is fitted on all rows that survive
    cleaning, so callers that split the data afterwards are using statistics
    that include their held-out rows.
    """
    # Ordinal ranking for salary; any value not listed here falls back to 0.
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}

    df = pd.read_csv(csv_path)

    # Clean: drop rows with missing key scores, out-of-range satisfaction
    # values and the invalid 'nme' salary marker.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # One combined mask instead of df[mask_a][mask_b]: in the chained form the
    # second mask is computed on the unfiltered frame and pandas has to
    # reindex it, which raises a UserWarning.
    keep_rows = (df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')
    df = df.loc[keep_rows]

    # Separate the target from the features.
    label = df['left']
    df = df.drop('left', axis=1)

    # Numeric columns: pick the scaler according to the matching flag.
    scaler_lst = [sl, le, npr, amh, tsc, wa, pl]
    column_lst = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                  'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for column, use_standard in zip(column_lst, scaler_lst):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        # Scalers need a 2-D input; flatten the result back to one value per row.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # Categorical columns, processed in the original order: salary, then department.
    for column, one_hot in (('salary', sal), ('department', dep)):
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            codes = [salary_rank.get(s, 0) for s in df['salary'].values]
        else:
            # LabelEncoder expects a 1-D array; handing it an (n, 1) column
            # vector triggers sklearn's DataConversionWarning.
            codes = LabelEncoder().fit_transform(df[column].values)
        # Bring the integer codes into [0, 1] like the other features.
        df[column] = MinMaxScaler().fit_transform(np.asarray(codes).reshape(-1, 1)).ravel()

    if lower_d:
        # Dimensionality reduction requested: return the PCA projection.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

### 建模

In [3]:
def hr_modeling(features, label, random_state=None):
    """Split the data into train/validation/test parts and print their sizes.

    Parameters
    ----------
    features : DataFrame or array-like
        Feature matrix. Both return shapes of ``hr_preprocessing`` are
        accepted (a DataFrame, or the NumPy array produced when PCA is on).
    label : Series or array-like
        Target values aligned row-by-row with ``features``.
    random_state : int or None
        Seed forwarded to both ``train_test_split`` calls. ``None`` (default)
        keeps the original unseeded behaviour; pass an int for a repeatable
        split.
    """
    from sklearn.model_selection import train_test_split
    # np.asarray works for DataFrame/Series and plain arrays alike, whereas
    # ``.values`` raised AttributeError on the ndarray from the PCA path.
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% of all rows as the validation set (Y_* are the labels).
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2, random_state=random_state)
    # Carve 25% of the remainder off as the test set -> 60/20/20 overall.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25, random_state=random_state)
    print(len(X_train), len(X_validation), len(X_test))
# Preprocess with the default options, then hand the result to hr_modeling.
features, label = hr_preprocessing()
hr_modeling(features, label)

8999 3000 3000


  y = column_or_1d(y, warn=True)


训练集，验证集，测试集长度比为 3：1：1，符合预期

In [4]:
def hr_modeling(features, label, random_state=None, n_neighbors=3):
    """Fit a KNN classifier and print its validation accuracy, recall and F1.

    Parameters
    ----------
    features : DataFrame or array-like
        Feature matrix. Both return shapes of ``hr_preprocessing`` are
        accepted (a DataFrame, or the NumPy array produced when PCA is on).
    label : Series or array-like
        Target values aligned row-by-row with ``features``.
    random_state : int or None
        Seed forwarded to both ``train_test_split`` calls. ``None`` (default)
        keeps the original unseeded behaviour.
    n_neighbors : int
        Number of neighbours used by the KNN classifier. The default of 3 is
        the value the notebook author reported as giving the best result.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    # np.asarray works for DataFrame/Series and plain arrays alike, whereas
    # ``.values`` raised AttributeError on the ndarray from the PCA path.
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% of all rows as the validation set (Y_* are the labels).
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2, random_state=random_state)
    # Carve 25% of the remainder off as the test set -> 60/20/20 overall.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25, random_state=random_state)
    knn_clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_clf.fit(X_train, Y_train)
    Y_predict = knn_clf.predict(X_validation)
    # Compare the validation labels with the predictions.
    print("ACC:", accuracy_score(Y_validation, Y_predict))
    print("REC:", recall_score(Y_validation, Y_predict))
    print("F-Score:", f1_score(Y_validation, Y_predict))
# Preprocess with the default options, then hand the result to hr_modeling.
features, label = hr_preprocessing()
hr_modeling(features, label)

ACC: 0.9516666666666667
REC: 0.9127423822714681
F-Score: 0.9008885850991114


  y = column_or_1d(y, warn=True)


### 接下来多尝试几个分类器

In [5]:
def hr_modeling(features, label, random_state=None):
    """Fit each registered classifier and print ACC/REC/F on every data split.

    For each model the metrics are printed three times, preceded by an index
    line: ``0`` = training set, ``1`` = validation set, ``2`` = test set.

    Parameters
    ----------
    features : DataFrame or array-like
        Feature matrix. Both return shapes of ``hr_preprocessing`` are
        accepted (a DataFrame, or the NumPy array produced when PCA is on).
    label : Series or array-like
        Target values aligned row-by-row with ``features``.
    random_state : int or None
        Seed forwarded to both ``train_test_split`` calls. ``None`` (default)
        keeps the original unseeded behaviour.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    # np.asarray works for DataFrame/Series and plain arrays alike, whereas
    # ``.values`` raised AttributeError on the ndarray from the PCA path.
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% of all rows as the validation set (Y_* are the labels).
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2, random_state=random_state)
    # Carve 25% of the remainder off as the test set -> 60/20/20 overall.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25, random_state=random_state)
    models = []
    models.append(('KNN', KNeighborsClassifier(n_neighbors=3)))
    # Built once: the evaluation partitions are the same for every classifier.
    xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        for i, (X_part, Y_part) in enumerate(xy_lst):
            Y_predict = clf.predict(X_part)
            print(i)
            print(clf_name, ' ACC:', accuracy_score(Y_part, Y_predict))
            print(clf_name, ' REC:', recall_score(Y_part, Y_predict))
            print(clf_name, ' F:', f1_score(Y_part, Y_predict))

# Preprocess with the default options, then hand the result to hr_modeling.
features, label = hr_preprocessing()
hr_modeling(features, label)

  y = column_or_1d(y, warn=True)


0
KNN  ACC: 0.9738859873319258
KNN  REC: 0.9598893499308437
KNN  F: 0.9465787679017958
1
KNN  ACC: 0.954
KNN  REC: 0.9337175792507204
KNN  F: 0.9037656903765692
2
KNN  ACC: 0.9563333333333334
KNN  REC: 0.9519774011299436
KNN  F: 0.9114266396213658


0为训练集，1为验证集，2为测试集分别对应的准确率，召回率，F值。接下来我们把所有尝试的分类器写到一个函数里

In [6]:
def hr_modeling(features, label, random_state=None):
    """Fit a panel of classifiers and print ACC/REC/F on every data split.

    For each model the metrics are printed three times, preceded by an index
    line: ``0`` = training set, ``1`` = validation set, ``2`` = test set.

    Parameters
    ----------
    features : DataFrame or array-like
        Feature matrix. Both return shapes of ``hr_preprocessing`` are
        accepted (a DataFrame, or the NumPy array produced when PCA is on).
    label : Series or array-like
        Target values aligned row-by-row with ``features``.
    random_state : int or None
        Seed forwarded to both ``train_test_split`` calls only; the
        classifiers themselves are constructed exactly as before. ``None``
        (default) keeps the original unseeded behaviour.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier  # k-nearest neighbours
    from sklearn.naive_bayes import GaussianNB, BernoulliNB  # Gaussian / Bernoulli naive Bayes (author's note: better suited to discrete values, tried here anyway)
    from sklearn.tree import DecisionTreeClassifier  # decision tree
    from sklearn.svm import SVC  # support vector machine
    from sklearn.ensemble import RandomForestClassifier  # random forest
    from sklearn.ensemble import AdaBoostClassifier  # boosting ensemble
    from sklearn.linear_model import LogisticRegression  # logistic regression (author's note: this data is not linearly separable, so results are not expected to be good)
    # np.asarray works for DataFrame/Series and plain arrays alike, whereas
    # ``.values`` raised AttributeError on the ndarray from the PCA path.
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% of all rows as the validation set (Y_* are the labels).
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2, random_state=random_state)
    # Carve 25% of the remainder off as the test set -> 60/20/20 overall.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25, random_state=random_state)
    models = []
    models.append(('KNN', KNeighborsClassifier(n_neighbors=3)))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('DecisionTree', DecisionTreeClassifier()))
    models.append(('SVM', SVC(C=100)))  # C is the penalty strength; author's note: larger values are stricter and take longer to train
    models.append(('RandomForest', RandomForestClassifier()))
    models.append(('AdaBoost', AdaBoostClassifier()))
    models.append(('LogisticRegression', LogisticRegression(C=1000, solver='sag')))
    # Built once: the evaluation partitions are the same for every classifier.
    xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        for i, (X_part, Y_part) in enumerate(xy_lst):
            Y_predict = clf.predict(X_part)
            print(i)
            print(clf_name, ' ACC:', accuracy_score(Y_part, Y_predict))
            print(clf_name, ' REC:', recall_score(Y_part, Y_predict))
            print(clf_name, ' F:', f1_score(Y_part, Y_predict))

# Preprocess with the default options, then hand the result to hr_modeling.
features, label = hr_preprocessing()
hr_modeling(features, label)

  y = column_or_1d(y, warn=True)


0
KNN  ACC: 0.9747749749972219
KNN  REC: 0.9561567164179104
KNN  F: 0.9475387104229258
1
KNN  ACC: 0.953
KNN  REC: 0.9208333333333333
KNN  F: 0.9038854805725972
2
KNN  ACC: 0.9513333333333334
KNN  REC: 0.9151343705799151
KNN  F: 0.898611111111111
0
GaussianNB  ACC: 0.7968663184798311
GaussianNB  REC: 0.7252798507462687
GaussianNB  F: 0.6298096395301741
1
GaussianNB  ACC: 0.8116666666666666
GaussianNB  REC: 0.7486111111111111
GaussianNB  F: 0.6561168594035302
2
GaussianNB  ACC: 0.8013333333333333
GaussianNB  REC: 0.743988684582744
GaussianNB  F: 0.6383495145631067
0
BernoulliNB  ACC: 0.8386487387487499
BernoulliNB  REC: 0.4608208955223881
BernoulliNB  F: 0.5764294049008168
1
BernoulliNB  ACC: 0.846
BernoulliNB  REC: 0.47638888888888886
BernoulliNB  F: 0.5975609756097561
2
BernoulliNB  ACC: 0.8463333333333334
BernoulliNB  REC: 0.4837340876944837
BernoulliNB  F: 0.5973799126637555
0
DecisionTree  ACC: 1.0
DecisionTree  REC: 1.0
DecisionTree  F: 1.0
1
DecisionTree  ACC: 0.975
DecisionTree 



0
SVM  ACC: 0.9523280364484943
SVM  REC: 0.902518656716418
SVM  F: 0.9002093510118633
1
SVM  ACC: 0.9513333333333334
SVM  REC: 0.9
SVM  F: 0.8987517337031901
2
SVM  ACC: 0.9586666666666667
SVM  REC: 0.9137199434229137
SVM  F: 0.9124293785310734
0
RandomForest  ACC: 0.9978886542949217
RandomForest  REC: 0.9916044776119403
RandomForest  F: 0.9955513931163662
1
RandomForest  ACC: 0.988
RandomForest  REC: 0.9583333333333334
RandomForest  F: 0.9745762711864409
2
RandomForest  ACC: 0.9853333333333333
RandomForest  REC: 0.9462517680339463
RandomForest  F: 0.9681620839363243




0
AdaBoost  ACC: 0.962884764973886
AdaBoost  REC: 0.9183768656716418
AdaBoost  F: 0.9218164794007491
1
AdaBoost  ACC: 0.9633333333333334
AdaBoost  REC: 0.9083333333333333
AdaBoost  F: 0.922425952045134
2
AdaBoost  ACC: 0.9596666666666667
AdaBoost  REC: 0.9151343705799151
AdaBoost  F: 0.914487632508834
0
LogisticRegression  ACC: 0.7918657628625403
LogisticRegression  REC: 0.3516791044776119
LogisticRegression  F: 0.44602188701567574
1
LogisticRegression  ACC: 0.7886666666666666
LogisticRegression  REC: 0.3527777777777778
LogisticRegression  F: 0.44483362521891423
2
LogisticRegression  ACC: 0.7826666666666666
LogisticRegression  REC: 0.3437057991513437
LogisticRegression  F: 0.42706502636203864


从结果看出，这样的数据情况下，随机森林和决策树是表现比较好的，然后是KNN。

剩下的就是对选定模型调参优化了