# 使用交叉验证快速选择模型


In [2]:
# 加载数据集
import pandas as pd

### Code start ### (≈ 2 lines of code)
# Read the raw abalone CSV with pandas' default header handling, so the
# first line of the file is used as the column labels.
df = pd.read_csv(filepath_or_buffer="challenge-6-abalone.csv")
# Preview the first five rows.
df.head(n=5)
### Code end ###

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [3]:
# The dataset is malformed and has to be repaired.
# Show the last five rows to inspect the problem.
df.tail(n=5)

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
4172,M,0.59,0.44,0.135,0.966,0.439,0.2145,0.2605,10
4173,M,0.6,0.475,0.205,1.176,0.5255,0.2875,0.308,9
4174,F,0.625,0.485,0.15,1.0945,0.531,0.261,0.296,10
4175,M,0.71,0.555,0.195,1.9485,0.9455,0.3765,0.495,12
4176,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings


In [4]:
# Repair the dataset: the real header sits in the last row, while the first
# data record was consumed as the column labels when the CSV was read.

# The last row holds the true column names.
columns_name = df.iloc[-1].values

# The current column labels are really the first data record.
new_line = df.columns.values

# Drop the misplaced header row from the bottom of the frame.
df = df.iloc[:-1]

# Install the real column names.
df.columns = columns_name

# Put the recovered first record back on top and renumber the index.
df = pd.concat([
    pd.DataFrame([new_line], columns=columns_name),
    df
], ignore_index=True)

# A text header row was mixed into the data, so the columns hold strings
# rather than numbers. Convert every column except the categorical 'Sex'
# column to a numeric dtype, so later numeric work does not depend on
# implicit string-to-float coercion. pd.to_numeric raises ValueError if a
# value cannot be parsed, which surfaces a bad repair immediately.
df = df.apply(
    lambda column: column if column.name == 'Sex' else pd.to_numeric(column)
)

# Show the first and last two rows to confirm the repair.
pd.concat([df.head(2), df.tail(2)])

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
4175,F,0.625,0.485,0.15,1.0945,0.531,0.261,0.296,10
4176,M,0.71,0.555,0.195,1.9485,0.9455,0.3765,0.495,12


In [5]:
# Exercise 23.1
# Bucket the target (Rings) into 3 ordered classes and encode the Sex column
# as integers.
df['Rings'] = pd.to_numeric(df['Rings'])
# pd.cut uses right-closed intervals by default:
# (0, 10] -> 'small', (10, 20] -> 'middle', (20, 30] -> 'large'.
df['Rings'] = pd.cut(
    df['Rings'],
    bins=[0, 10, 20, 30],
    labels=['small', 'middle', 'large'],
)
# Series.map does the M/F/I -> 0/1/2 lookup without the downcasting
# FutureWarning that Series.replace emits for a str -> int mapping.
# NOTE: unlike replace, a value outside M/F/I becomes NaN instead of being
# left unchanged.
df['Sex'] = df['Sex'].map({'M': 0, 'F': 1, 'I': 2})

print(df.iloc[[3, 6, 12, 83]]["Rings"].values)
df.head()

['small', 'middle', 'middle', 'large']
Categories (3, object): ['small' < 'middle' < 'large']


  df['Sex'] = df.Sex.replace({'M':0, 'F':1, 'I':2})


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,middle
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,small
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,small
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,small
4,2,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,small


In [8]:
# Splitting the data into K equally sized folds
#
# sklearn.model_selection.KFold(n_splits, shuffle=False, random_state=None)
#     n_splits: the K of K-fold, i.e. how many folds to create (minimum 2;
#               the default is 5 since scikit-learn 0.22, it was 3 before)
#     shuffle: False by default; when True the samples are shuffled before
#              being split
#     random_state: seed of the random number generator
from sklearn.model_selection import KFold

kf = KFold(10, shuffle=True, random_state=50)
# Run as-is to see how many samples land in each train/test split
for train_index, test_index in kf.split(X=df):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))

TRAIN: 3759 TEST: 418
TRAIN: 3759 TEST: 418
TRAIN: 3759 TEST: 418
TRAIN: 3759 TEST: 418
TRAIN: 3759 TEST: 418
TRAIN: 3759 TEST: 418
TRAIN: 3759 TEST: 418
TRAIN: 3760 TEST: 417
TRAIN: 3760 TEST: 417
TRAIN: 3760 TEST: 417


In [9]:
# Running cross-validation
#
# sklearn.model_selection.cross_val_score(estimator, X, y=None, ..., cv=None, ...)
#     estimator: the model to evaluate
#     X: array of features
#     y: array of target values
#     cv: number of folds K
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# 'Rings' is the target; the first eight columns are used as features.
target = df['Rings']
features = df.iloc[:, :8]

model = KNeighborsClassifier()
cross_val_score(estimator=model, X=features, y=target, cv=10)

array([0.75598086, 0.72009569, 0.77990431, 0.72966507, 0.73205742,
       0.74401914, 0.76315789, 0.74580336, 0.75779376, 0.73860911])

In [11]:
# Exercise 23.2 
# 使用 10 折交叉验证方法，测试鲍鱼数据集在
# 逻辑回归、K 近邻、支持向量机、人工神经网络、决策树、随机森林、Adaboost
# 默认参数下的表现，并对每个模型的 10 折交叉验证结果取平均值
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

### 代码结束 ###

"""各分类模型 10 折交叉验证函数
"""
def classifiers():
    """Cross-validate several default-parameter classifiers.

    Every model is scored with 10-fold cross-validation on the module-level
    ``features`` and ``target``.

    Parameters: none

    Returns:
    scores -- list holding the mean 10-fold cross-validation score of each
              model, in this order: logistic regression, K-nearest
              neighbours, support vector machine, multilayer perceptron,
              decision tree, random forest, AdaBoost
    """
    ### Code start ### (> 10 lines of code)
    # All estimators are created up front with their default parameters.
    candidates = (
        LogisticRegression(),
        KNeighborsClassifier(),
        SVC(),
        MLPClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
    )

    # One averaged score per estimator, kept in the order listed above.
    scores = [
        np.mean(cross_val_score(estimator, X=features, y=target, cv=10))
        for estimator in candidates
    ]
    ### Code end ###

    return scores

classifiers()



[np.float64(0.7596278957695087),
 np.float64(0.746708661778711),
 np.float64(0.7574759331290948),
 np.float64(0.768248941516643),
 np.float64(0.6830143540669857),
 np.float64(0.7637034869711885),
 np.float64(0.7357044507934323)]