In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [None]:
data = pd.read_csv('./iris.data')

In [None]:
data

In [None]:
data.hist(figsize=(16,14))

In [None]:
sns.pairplot(data,hue='class')

In [None]:
data.plot(kind='box',subplots=True,layout=(4,4),sharex=False,sharey=False,figsize=(16,14))

In [None]:
data['class']

In [None]:
# 定义类别到数值的映射字典
category_to_numeric = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
data['NumericCategory'] = data['class'].map(category_to_numeric)
data1 = data.drop(columns='class')##删除某一列

In [None]:
data1

In [None]:
corr = data.iloc[:,0:4].corr()
corr

In [None]:
plt.subplots(figsize=(10,5))
sns.heatmap(corr,annot=False)

# Feature Engening

In [None]:
x = data.iloc[:,0:4]
y = data['class']
x.describe()

In [None]:
select_top_4=SelectKBest(score_func=f_classif,k=2) ##建立一个函数，使用anova方法获得F值最高的2个特征

In [None]:
fit = select_top_4.fit(x,y)##使用fit方法，利用x和y数据，学习4个最好的特征
features=fit.transform(x)#返回学习到的4个最佳特征
used_index=fit.get_support(indices=True).tolist()#返回相关性排名前4的列
column_names=np.array(x.columns)#这三行是利用index信息，获得相关性排名前四的列
used_column=column_names[used_index]
print(used_column)

In [88]:
x_features= pd.DataFrame(data=features,columns=used_column)
x_features.head()

Unnamed: 0,petal length,petal width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2


# Normalization
归一化有助于加快梯度下降求解最优的速度以及提高精度

In [92]:
rescaledx = StandardScaler().fit_transform(x_features) ## 通过sklearn的preprocessing数据预处理中standardscale特征缩放标准化特征信息
x = pd.DataFrame(data=rescaledx,columns=x_features.columns) ## 构建新特征dataframe
x.head()

Unnamed: 0,petal length,petal width
0,-1.341272,-1.312977
1,-1.341272,-1.312977
2,-1.398138,-1.312977
3,-1.284407,-1.312977
4,-1.341272,-1.312977


# 机器学习-构建二分类算法模型

In [94]:
#构建训练模型  传统机器学习算法
models=[]
models.append(('LR',LogisticRegression(max_iter=1000)))##逻辑回归
models.append(('NB',GaussianNB()))                     ##高斯朴素贝叶斯算法
models.append(('KNN',KNeighborsClassifier()))          ##K近邻分类
models.append(('DT',DecisionTreeClassifier()))         ##决策树分类
models.append(('SVM',SVC()))                           ##支持向量机分类

k-fold CV,也就是下面用到的函数KFold,是把原始的数据分为K个子集，每次将其中一个子集当作测试集，其余的作为训练集。深度学习一般数据量太大，不采用kfold的方法

In [95]:
results = []
names = []
for name,model in models:
    kfold = KFold(n_splits=10)
    cv_result=cross_val_score(model,x,y,cv=kfold,scoring='accuracy')##分成10份，分别应用模型学习，并计算准确性
    names.append(name)
    results.append(cv_result)

for i in range(len(names)):
    print(names[i],results[i].mean())                               ##10次结果的平均值

LR 0.9533333333333334
NB 0.9533333333333334
KNN 0.96
DT 0.9200000000000002
SVM 0.9533333333333334


# Without Feature engineering

In [98]:
x = data.iloc[:,0:4]
y = data['class']
x.describe()

Unnamed: 0,sepal length,sepal width,petal length,petal width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [99]:
results = []
names = []
for name,model in models:
    kfold = KFold(n_splits=10)
    cv_result=cross_val_score(model,x,y,cv=kfold,scoring='accuracy')##分成10份，分别应用模型学习，并计算准确性
    names.append(name)
    results.append(cv_result)

for i in range(len(names)):
    print(names[i],results[i].mean())                               ##10次结果的平均值

LR 0.9466666666666667
NB 0.9466666666666667
KNN 0.9333333333333333
DT 0.9333333333333333
SVM 0.9333333333333333
