# 1、读取数据

In [2]:
import pandas as pd

data = pd.read_csv(filepath_or_buffer="./Predict Hair Fall.csv")

# 查看形状
print(data.shape)
# 查看数据集的总结性信息
print(data.info())

(999, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Id                        999 non-null    int64 
 1   Genetics                  999 non-null    object
 2   Hormonal Changes          999 non-null    object
 3   Medical Conditions        999 non-null    object
 4   Medications & Treatments  999 non-null    object
 5   Nutritional Deficiencies  999 non-null    object
 6   Stress                    999 non-null    object
 7   Age                       999 non-null    int64 
 8   Poor Hair Care Habits     999 non-null    object
 9   Environmental Factors     999 non-null    object
 10  Smoking                   999 non-null    object
 11  Weight Loss               999 non-null    object
 12  Hair Loss                 999 non-null    int64 
dtypes: int64(3), object(10)
memory usage: 101.6+ KB
None


# 2、清洗数据

In [4]:
# 处理重复值
print(data.duplicated(subset=["Id"]).sum())
data.drop_duplicates(subset=["Id"], inplace=True)

4


In [5]:
# 处理缺失值
print(data.isna().sum())
data.dropna(how="any", inplace=True)

Id                          0
Genetics                    0
Hormonal Changes            0
Medical Conditions          0
Medications & Treatments    0
Nutritional Deficiencies    0
Stress                      0
Age                         0
Poor Hair Care Habits       0
Environmental Factors       0
Smoking                     0
Weight Loss                 0
Hair Loss                   0
dtype: int64


In [6]:
# 处理异常值
print(((data["Age"] < 0) | (data["Age"] > 150)).sum())
data = data[~(data["Age"] < 0) | (data["Age"] > 150)]

0


In [7]:
# 处理无用列
data.drop(columns=["Id"], inplace=True)

In [8]:
# 查看清洗之后的数据集总结性信息
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 995 entries, 0 to 998
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Genetics                  995 non-null    object
 1   Hormonal Changes          995 non-null    object
 2   Medical Conditions        995 non-null    object
 3   Medications & Treatments  995 non-null    object
 4   Nutritional Deficiencies  995 non-null    object
 5   Stress                    995 non-null    object
 6   Age                       995 non-null    int64 
 7   Poor Hair Care Habits     995 non-null    object
 8   Environmental Factors     995 non-null    object
 9   Smoking                   995 non-null    object
 10  Weight Loss               995 non-null    object
 11  Hair Loss                 995 non-null    int64 
dtypes: int64(2), object(10)
memory usage: 101.1+ KB
None


# 3、编码

In [10]:
# 查看编码前的数据内容
data

Unnamed: 0,Genetics,Hormonal Changes,Medical Conditions,Medications & Treatments,Nutritional Deficiencies,Stress,Age,Poor Hair Care Habits,Environmental Factors,Smoking,Weight Loss,Hair Loss
0,Yes,No,No Data,No Data,Magnesiumdeficiency,Moderate,19,Yes,Yes,No,No,0
1,No,No,Eczema,Antibiotics,Magnesiumdeficiency,High,43,Yes,Yes,No,No,0
2,No,No,Dermatosis,AntifungalCream,Proteindeficiency,Moderate,26,Yes,Yes,No,Yes,0
3,Yes,Yes,Ringworm,Antibiotics,BiotinDeficiency,Moderate,46,Yes,Yes,No,No,0
4,No,No,Psoriasis,Accutane,Irondeficiency,Moderate,30,No,Yes,Yes,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...
994,Yes,No,SeborrheicDermatitis,Rogaine,VitaminADeficiency,Low,33,Yes,Yes,Yes,Yes,1
995,Yes,Yes,No Data,Accutane,Proteindeficiency,Low,47,No,No,No,Yes,0
996,No,Yes,AndrogeneticAlopecia,Antidepressants,Proteindeficiency,Moderate,20,Yes,No,Yes,Yes,1
997,No,Yes,Dermatitis,Immunomodulators,BiotinDeficiency,Moderate,32,Yes,Yes,Yes,Yes,1


In [11]:
"""
    通过上面数据内容的简单查看，可得知“Age”和“Hair Loss”两个字段是离散值，不需要做编码
    其他字段（包括值为Yes、No的）均可以进行编码，方便后面进行科学计算
    这些需要编码的字段列表为：["Genetics", "Hormonal Changes", "Medical Conditions", "Medications & Treatments", "Nutritional Deficiencies", "Stress", "Poor Hair Care Habits", 
                            "Environmental Factors", "Smoking", "Weight Loss"]
"""

# 构建需要进行编码的字段列表
need_code_column_list = ["Genetics", "Hormonal Changes", "Medical Conditions", "Medications & Treatments", "Nutritional Deficiencies", "Stress", "Poor Hair Care Habits", 
               "Environmental Factors", "Smoking", "Weight Loss"]
column_code_dicts = {}

# 用循环的方式来进行编码，提高代码的复用性
for need_code_column in need_code_column_list:
    print(f"""当前编码的字段为：{need_code_column}, 该字段的选项值为：{data[need_code_column].unique()}""")
    column_code_dicts[need_code_column] = {value: idx for idx, value in enumerate(data[need_code_column].unique())}
    data[need_code_column] = data[need_code_column].apply(func=lambda ele: column_code_dicts[need_code_column][ele])

当前编码的字段为：Genetics, 该字段的选项值为：['Yes' 'No']
当前编码的字段为：Hormonal Changes, 该字段的选项值为：['No' 'Yes']
当前编码的字段为：Medical Conditions, 该字段的选项值为：['No Data' 'Eczema' 'Dermatosis' 'Ringworm' 'Psoriasis' 'AlopeciaAreata'
 'ScalpInfection' 'SeborrheicDermatitis' 'Dermatitis' 'ThyroidProblems'
 'AndrogeneticAlopecia']
当前编码的字段为：Medications & Treatments, 该字段的选项值为：['No Data' 'Antibiotics' 'AntifungalCream' 'Accutane' 'Chemotherapy'
 'Steroids' 'Rogaine' 'BloodPressureMedication' 'Immunomodulators'
 'Antidepressants' 'HeartMedication']
当前编码的字段为：Nutritional Deficiencies, 该字段的选项值为：['Magnesiumdeficiency' 'Proteindeficiency' 'BiotinDeficiency'
 'Irondeficiency' 'Seleniumdeficiency' 'Omega-3fattyacids'
 'ZincDeficiency' 'VitaminADeficiency' 'VitaminDDeficiency' 'No Data'
 'VitaminEdeficiency']
当前编码的字段为：Stress, 该字段的选项值为：['Moderate' 'High' 'Low']
当前编码的字段为：Poor Hair Care Habits, 该字段的选项值为：['Yes' 'No']
当前编码的字段为：Environmental Factors, 该字段的选项值为：['Yes' 'No']
当前编码的字段为：Smoking, 该字段的选项值为：['No' 'Yes']
当前编码的字段为：Weight Loss, 该字段的

In [12]:
# 查看编码后的数据内容
data

Unnamed: 0,Genetics,Hormonal Changes,Medical Conditions,Medications & Treatments,Nutritional Deficiencies,Stress,Age,Poor Hair Care Habits,Environmental Factors,Smoking,Weight Loss,Hair Loss
0,0,0,0,0,0,0,19,0,0,0,0,0
1,1,0,1,1,0,1,43,0,0,0,0,0
2,1,0,2,2,1,0,26,0,0,0,1,0
3,0,1,3,1,2,0,46,0,0,0,0,0
4,1,0,4,3,3,0,30,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
994,0,0,7,6,7,2,33,0,0,1,1,1
995,0,1,0,3,1,2,47,1,1,0,1,0
996,1,1,10,9,1,0,20,0,1,1,1,1
997,1,1,8,8,2,0,32,0,0,1,1,1


# 4、切分

In [14]:
y = data["Hair Loss"].to_numpy()
X = data.drop(columns=["Hair Loss"]).to_numpy()
 
# 查看X、y的行数和列数
print(X.shape, y.shape)
 
# 引入train_test_split，将X、y切分为训练集和测试集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

(995, 11) (995,)


# 5、标准化

In [16]:
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0)
X_train = (X_train - mu) / sigma
X_test = (X_test - mu) / sigma

# 6、数据保存

In [18]:
import joblib
# 创建state_dict列表，存放上面做过编码的字典数据以及训练集的mu、sigma值
state_dict = [column_code_dicts, mu, sigma]
# 创建data列表，存放训练集和测试集数据
data = [X_train, y_train, X_test, y_test]
# 使用dump方法，将state_dict和data存入all_data.joblib文件中
joblib.dump(value=[state_dict, data], filename="all_data.joblib")

['all_data.joblib']

# 7、加载数据

In [20]:
import joblib
# 之前是将state_dict和data保存至all_data.joblib文件中，现在是从all_data.joblib文件中取出state_dict和data
[state_dict, data] = joblib.load(filename="all_data.joblib")
column_code_dicts, mu, sigma = state_dict
X_train, y_train, X_test, y_test = data

# 8、算法评估

## 8.1 KNN算法

In [23]:
from sklearn.neighbors import KNeighborsClassifier
 
# KNN算法的准确率和邻居数有关
# 这里我们先初始化邻居数、准确率以及模型，后面再通过条件判断做对比，找到准确率最佳时的邻居数，以及对应的模型与准确率
best_n_neighbor = 0
best_acc = 0
best_model = None
 
# 邻居数的取值是有讲究的
# （1）不易过小，过小则会对异常值非常敏感
# （2）不易过大，过大则会使模型过于复杂，难以泛化到未见过的数据
# 大多数情况下，n_neighbors 的取值范围在 3 到 30 之间。这个范围内的值通常足以捕捉数据的局部特性，同时避免过度拟合
for n_neighbor in range(3, 31, 1):
    knn = KNeighborsClassifier(n_neighbors=n_neighbor)
    knn.fit(X=X_train, y=y_train)
    y_pred = knn.predict(X=X_test)
    acc = (y_pred == y_test).mean()
    # 逐个比较不同邻居数下的准确率，直到得到最大的准确率，并获取对应的邻居数、准确率以及模型实例
    if acc > best_acc:
        best_n_neighbor = n_neighbor
        best_acc = acc
        best_model = knn
        print(f"找到了一个更好的模型: {best_n_neighbor}, {best_acc}")
# 保存循环判断所得的最佳模型
joblib.dump(value=[best_n_neighbor, best_acc, best_model], filename="best_knn.joblib")

找到了一个更好的模型: 3, 0.5376884422110553
找到了一个更好的模型: 4, 0.5477386934673367


['best_knn.joblib']

## 8.2贝叶斯算法 

In [25]:
from sklearn.naive_bayes import GaussianNB

# 初始化
best_acc = 0
best_model = None

gnb = GaussianNB()
gnb.fit(X=X_train, y=y_train)
y_pred = gnb.predict(X=X_test)
acc = (y_pred == y_test).mean()
print(acc)
 
# 高斯贝叶斯算法没有其他影响准确率的参数，所以最佳准确率和模型即为直接算出的准确率和模型，不需要做比较判断
best_acc = acc
best_model = gnb
 
# 保存模型
joblib.dump(value=[_, best_acc, best_model], filename="best_gnb.joblib")

0.5025125628140703


['best_gnb.joblib']

## 8.3 决策树算法

In [27]:
from sklearn.tree import DecisionTreeClassifier
 
# 决策树算法的准确率和衡量数据集混乱程度的指标有关（一般是基尼系数和熵这两种指标）
criterions = ["gini", "entropy"]
# 这里我们先初始化衡量数据集混乱程度的指标、准确率以及模型，后面再通过条件判断做对比，找到准确率最佳时的指标，以及对应的模型与准确率
best_criterion = None
best_acc = 0
best_model = None
 
for criterion in criterions:
    dtc = DecisionTreeClassifier(criterion=criterion)
    dtc.fit(X=X_train, y=y_train)
    y_pred = dtc.predict(X=X_test)
    acc = (y_pred == y_test).mean()
    # 逐个比较不同指标下的准确率，直到得到最大的准确率，并获取对应的邻居数、准确率以及模型实例
    if acc > best_acc:
        best_criterion = criterion
        best_acc = acc
        best_model = dtc
        print(f"找到了一个更好的模型: {best_criterion}, {best_acc}")
# 保存循环判断所得的最佳模型
joblib.dump(value=[best_criterion, best_acc, best_model], filename="best_dtc.joblib")

找到了一个更好的模型: gini, 0.5778894472361809
找到了一个更好的模型: entropy, 0.5979899497487438


['best_dtc.joblib']

## 8.4 Kmeans算法

In [29]:
from sklearn.cluster import KMeans

# 初始化
best_acc = 0
best_model = None

km = KMeans(n_clusters=2, random_state=0)
# 由于KMeans是无监督学习算法，所以训练时只需要传入X，通过特征来寻找标签
km.fit(X=X_train)
y_pred = gnb.predict(X=X_test)
acc = (y_pred == y_test).mean()
print(acc)
 
# Kmeans没有其他影响准确率的参数，所以最佳准确率和模型即为直接算出的准确率和模型，不需要做比较判断
best_acc = acc
best_model = km
 
# 保存模型
joblib.dump(value=[_, best_acc, best_model], filename="best_km.joblib")

0.5025125628140703




['best_km.joblib']

## 8.5 逻辑回归算法

In [31]:
from sklearn.linear_model import LogisticRegression

# 初始化
best_params = None
best_acc = 0
best_model = None

lg = LogisticRegression(max_iter=1000)
lg.fit(X=X_train, y=y_train)

y_pred = lg.predict(X=X_test)
acc = (y_pred == y_test).mean()
print(f"预测的准确率为:{acc}")

# 计算权重和偏置项
w = lg.coef_
b = lg.intercept_

# 逻辑回归算法没有其他影响准确率的参数，所以最佳准确率和模型即为直接算出的准确率和模型，不需要做比较判断
best_params = [w, b]
best_acc = acc
best_model = lg

# 保存模型
joblib.dump(value=[best_params, best_acc, best_model], filename="best_lg.joblib")

预测的准确率为:0.48743718592964824


['best_lg.joblib']

## 8.6 支持向量机算法

In [33]:
from sklearn.svm import SVC

# 初始化
best_kernel = None
best_acc = 0
best_model = None

# kernel可选项列表
kernels = ["linear", "poly", "rbf", "sigmoid"]

for kernel in kernels:
    svc = SVC(kernel=kernel, C=1.0, random_state=0)
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X=X_test)
    acc = (y_pred == y_test).mean()
    # 逐个比较不同指标下的准确率，直到得到最大的准确率，并获取对应的邻居数、准确率以及模型实例
    if acc > best_acc:
        best_kernel = kernel
        best_acc = acc
        best_model = svc
        print(f"找到了一个更好的模型: {best_kernel}, {best_acc}")

# 保存模型
joblib.dump(value=[best_kernel, best_acc, best_model], filename="best_svc.joblib")

找到了一个更好的模型: linear, 0.49246231155778897
找到了一个更好的模型: poly, 0.49748743718592964


['best_svc.joblib']

## 8.7 随机森林算法

In [35]:
from sklearn.ensemble import RandomForestClassifier

# 初始化
best_n_estimators = None
best_acc = 0
best_model = None

n_estimators_list = [100, 1000, 10000]

for n_estimators in n_estimators_list:
    rfc = RandomForestClassifier(n_estimators= n_estimators, random_state=42)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X=X_test)
    acc = (y_pred == y_test).mean()
    # 逐个比较不同指标下的准确率，直到得到最大的准确率，并获取对应的邻居数、准确率以及模型实例
    if acc > best_acc:
        best_n_estimators = n_estimators
        best_acc = acc
        best_model = rfc
        print(f"找到了一个更好的模型: {best_n_estimators}, {best_acc}")

# 保存模型
joblib.dump(value=[best_n_estimators, best_acc, best_model], filename="best_rfc.joblib")

找到了一个更好的模型: 100, 0.49748743718592964


['best_rfc.joblib']

## 8.8 XGBoost算法

In [37]:
from xgboost import XGBClassifier

# 初始化
best_n_estimators = None
best_acc = 0
best_model = None

n_estimators_list = [100, 1000, 10000]

for n_estimators in n_estimators_list:
    xgbc = XGBClassifier(n_estimators=n_estimators, learning_rate=0.1, random_state=42)
    xgbc.fit(X_train, y_train)
    y_pred = xgbc.predict(X=X_test)
    acc = (y_pred == y_test).mean()
    # 逐个比较不同指标下的准确率，直到得到最大的准确率，并获取对应的邻居数、准确率以及模型实例
    if acc > best_acc:
        best_n_estimators = n_estimators
        best_acc = acc
        best_model = xgbc
        print(f"找到了一个更好的模型: {best_n_estimators}, {best_acc}")

# 保存模型
joblib.dump(value=[best_n_estimators, best_acc, best_model], filename="best_xgbc.joblib")

找到了一个更好的模型: 100, 0.48743718592964824
找到了一个更好的模型: 1000, 0.49246231155778897


['best_xgbc.joblib']

## 8.9 lightgbm算法

In [39]:
# 引入lightgbm算法的实现类
from lightgbm import LGBMClassifier

# 初始化
best_n_estimators = None
best_acc = 0
best_model = None

n_estimators_list = [100, 1000, 10000]

for n_estimators in n_estimators_list:
    lgbmc = LGBMClassifier(n_estimators=n_estimators, learning_rate=0.1, random_state=42)
    lgbmc.fit(X_train, y_train)
    y_pred = lgbmc.predict(X=X_test)
    acc = (y_pred == y_test).mean()
    # 逐个比较不同指标下的准确率，直到得到最大的准确率，并获取对应的邻居数、准确率以及模型实例
    if acc > best_acc:
        best_n_estimators = n_estimators
        best_acc = acc
        best_model = lgbmc
        print(f"找到了一个更好的模型: {best_n_estimators}, {best_acc}")
print(f"找到了一个最好的模型: {best_n_estimators}, {best_acc}")
# 保存模型
joblib.dump(value=[best_n_estimators, best_acc, best_model], filename="best_lgbmc.joblib")

[LightGBM] [Info] Number of positive: 386, number of negative: 410
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 796, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.484925 -> initscore=-0.060320
[LightGBM] [Info] Start training from score -0.060320
找到了一个更好的模型: 100, 0.47738693467336685
[LightGBM] [Info] Number of positive: 386, number of negative: 410
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 796, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.484925 -> i

['best_lgbmc.joblib']

# 9、特征的重要性

In [41]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X=X_train, y=y_train)

# 获取各特征属性的重要性，并赋值给feature_importances_
feature_importances_ = dtc.feature_importances_
# 对重要性进行从小到大的排序
feature_importances_.sort()
# 获取最重要的5个特征对应的索引
selected_feature_idxes = feature_importances_.argsort()[::-1][:5]
 
# 分别从训练集和测试集中根据对应的索引取出最重要的5个特征对应的数据集
X_train1 = X_train[:, selected_feature_idxes]
X_test1 = X_test[:, selected_feature_idxes]
# 根据最重要的5个特征数据集，进行模型训练，求得准确率（发现准确率确实有一些变化）
dtc1 = DecisionTreeClassifier()
dtc1.fit(X=X_train1, y=y_train)
y_pred1 = dtc1.predict(X=X_test1)
acc1 = (y_pred1 == y_test).mean()
print(f"选择最重要的5个特征之后，预测的准确率为:{acc1}")
dtc1 = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
# 接下来，复用上面代码，根据最重要的1、2、3、4、5、6、...、11个特征对应的数据集进行遍历计算，看看哪个的准确率最高，从而判断哪几个特征是最重要的
best_acc = 0
best_num_features = 0
best_selected_feature_idxes = []
for num_features in range(1, 12, 1):
    selected_feature_idxes = feature_importances_.argsort()[::-1][:num_features]
    X_train1 = X_train[:, selected_feature_idxes]
    X_test1 = X_test[:, selected_feature_idxes]
    dtc1.fit(X=X_train1, y=y_train)
    y_pred1 = dtc1.predict(X=X_test1)
    acc1 = (y_pred1 == y_test).mean()
    if acc1 > best_acc:
        best_acc = acc1
        best_num_features = num_features
        best_selected_feature_idxes = selected_feature_idxes
print(f"选择最重要的{best_num_features}个特征之后，可得最大的准确率为:{best_acc}")
 
# 引入pandas库，提供一些数据分析的方法
import pandas as pd
# 读取《信用卡客户流失数据集.csv》文件中的数据
data = pd.read_csv(filepath_or_buffer="./Predict Hair Fall.csv")
# 通过keys方法获得最重要的特征字段名称
print(f"这{best_num_features}个特征对应的特征字段以及其重要性从打大小排序为:{data.keys()[best_selected_feature_idxes]}")

选择最重要的5个特征之后，预测的准确率为:0.4723618090452261
选择最重要的5个特征之后，可得最大的准确率为:0.5226130653266332
这5个特征对应的特征字段以及其重要性从打大小排序为:Index(['Smoking', 'Environmental Factors', 'Poor Hair Care Habits', 'Age',
       'Stress'],
      dtype='object')
