In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

In [2]:
# Ordinal code assigned to each salary level; the bare name echoes it as the cell output.
d = {"low": 0, "medium": 1, "high": 2}
d

{'low': 0, 'medium': 1, 'high': 2}

In [4]:
#sl:satisfaction_level---False:MinMaxScaler;True:StandardScaler
#le:last_evaluation---False:MinMaxScaler;True:StandardScaler
#npr:number_project---False:MinMaxScaler;True:StandardScaler
#amh:average_monthly_hours--False:MinMaxScaler;True:StandardScaler
#tsc:time_spend_company--False:MinMaxScaler;True:StandardScaler
#wa:Work_accident--False:MinMaxScaler;True:StandardScaler
#pl5:promotion_last_5years--False:MinMaxScaler;True:StandardScaler
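#dp:department--False:LabelEncoding (then MinMax-scaled);True:OneHotEncoding
#slr:salary--False:ordinal mapping low/medium/high -> 0/1/2 (then MinMax-scaled);True:OneHotEncoding
#lower_d:True--reduce the encoded features with PCA;ld_n:number of PCA components to keep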
def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False, wa=False,
                     pl5=False, dp=False, slr=False, lower_d=False, ld_n=1,
                     data_path="./data/HR.csv"):
    """Load the HR CSV, clean it, and turn every feature into a numeric column.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler choice for, respectively, satisfaction_level, last_evaluation,
        number_project, average_monthly_hours, time_spend_company,
        Work_accident and promotion_last_5years.
        False -> MinMaxScaler, True -> StandardScaler.
    dp : bool
        Encoding of ``department``. False -> LabelEncoder followed by
        MinMaxScaler; True -> one-hot columns via ``pd.get_dummies``.
    slr : bool
        Encoding of ``salary``. False -> ``map_salary`` followed by
        MinMaxScaler; True -> one-hot columns via ``pd.get_dummies``.
    lower_d : bool
        When True the encoded features are projected with PCA before returning.
    ld_n : int
        ``n_components`` passed to PCA (only used when ``lower_d`` is True).
    data_path : str
        Location of the CSV file to read.

    Returns
    -------
    tuple
        ``(features, label)``. ``features`` is a DataFrame, or the ndarray
        produced by PCA when ``lower_d`` is True. ``label`` is the ``left``
        column of the cleaned data.
    """
    # 1. Clean the data.
    df = pd.read_csv(data_path)
    df = df.dropna(subset=["satisfaction_level", "last_evaluation"])
    # Build one mask on the current frame. Chaining df[mask_a][mask_b] applies a
    # mask computed on the unfiltered frame to the filtered one, which pandas
    # has to reindex (and warns about).
    keep = (df["satisfaction_level"] <= 1) & (df["salary"] != "nme")
    df = df[keep]

    # 2. Separate the label from the features.
    label = df["left"]
    df = df.drop("left", axis=1)

    # 3. Feature selection: all remaining columns are kept.
    # 4. Feature processing: scale the numeric columns.
    numeric_columns = ["satisfaction_level", "last_evaluation", "number_project",
                       "average_monthly_hours", "time_spend_company", "Work_accident",
                       "promotion_last_5years"]
    standardize_flags = [sl, le, npr, amh, tsc, wa, pl5]
    for column, standardize in zip(numeric_columns, standardize_flags):
        # A False flag means min-max normalisation, True means standardisation.
        scaler = StandardScaler() if standardize else MinMaxScaler()
        # df[[column]] is already the 2-D input the scalers expect.
        df[column] = scaler.fit_transform(df[[column]].values).ravel()

    # The two categorical features (salary first, then department).
    for column, one_hot in (("salary", slr), ("department", dp)):
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == "salary":
            # Salary uses the explicit low/medium/high mapping.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Bring the integer codes onto the same [0, 1] scale as the other features.
        df[column] = MinMaxScaler().fit_transform(df[[column]].values).ravel()

    if lower_d:
        # NOTE: this branch returns an ndarray, not a DataFrame.
        return PCA(n_components=ld_n).fit_transform(df.values), label

    return df, label

# Ordinal code assigned to each salary level.
d = {"low": 0, "medium": 1, "high": 2}


def map_salary(s):
    """Return the ordinal code of salary level ``s``; unknown levels map to 0."""
    if s in d:
        return d[s]
    return 0

def hr_modeling(features, label, random_state=None):
    """Fit the candidate classifiers and print their scores on each data split.

    The data is split 80/20 into a working set and a validation set, and the
    working set is split again 75/25 into train and test, giving roughly
    60% train, 20% validation and 20% test.

    Parameters
    ----------
    features : DataFrame or array-like
        Feature matrix. Both the DataFrame and the PCA ndarray returned by
        ``hr_preprocessing`` are accepted.
    label : Series or array-like
        Binary target, one entry per row of ``features``.
    random_state : int or None
        Forwarded to both ``train_test_split`` calls. The default ``None``
        keeps the splits random; pass an int for reproducible splits.

    Returns
    -------
    None
        Accuracy, recall and F1 are printed for every model and split.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score, recall_score, f1_score

    # np.asarray works for DataFrame/Series and for plain ndarrays alike,
    # whereas `.values` only exists on the pandas objects.
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2, random_state=random_state)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25, random_state=random_state)

    models = []
    models.append(("KNN", KNeighborsClassifier(n_neighbors=3)))

    # Printed index: 0 = train, 1 = validation, 2 = test.
    xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]

    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        for i, (X_part, Y_part) in enumerate(xy_lst):
            Y_pred = clf.predict(X_part)
            print(i)
            print(clf_name,"-ACC:", accuracy_score(Y_part, Y_pred))
            print(clf_name,"-REC:", recall_score(Y_part, Y_pred))
            print(clf_name,"-F1:", f1_score(Y_part, Y_pred))

    # Persistence note: a fitted classifier can be saved with
    # joblib.dump(clf, "knn_clf") and restored with joblib.load("knn_clf")
    # (requires `import joblib`).

def main():
    """Preprocess the HR data with the default options and evaluate the models."""
    hr_modeling(*hr_preprocessing())


main()



0
KNN -ACC: 0.9767751972441382
KNN -REC: 0.9652092148566056
KNN -F1: 0.9515643105446119
1
KNN -ACC: 0.948
KNN -REC: 0.9108910891089109
KNN -F1: 0.8919667590027701
2
KNN -ACC: 0.9553333333333334
KNN -REC: 0.9308005427408412
KNN -F1: 0.9110225763612217
