In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.svm import SVC,LinearSVC
from sklearn import metrics

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [None]:
# Read the breast-cancer CSV into a DataFrame; the path is relative to the
# notebook's working directory.
DATA_PATH = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(DATA_PATH)

data.info()   # prints the per-column dtype / non-null summary
data.head()   # last expression of the cell: rendered as its output

特征名后缀的含义：mean 代表平均值，se 代表标准误差（standard error），worst 代表“最差值”（即最大的三个测量值的平均值）。因此后 30 个特征实际上是 10 个基础特征（radius、texture、perimeter、area、smoothness、compactness、concavity、concave points、symmetry 和 fractal_dimension）各自的平均值、标准误差和最差值。

查看数据的基本情况：可以看到各字段数据没有缺失

In [None]:
# List the column labels of the loaded frame.
data.columns

“id”只是样本编号，没有实际意义，可以去掉

In [None]:
# Remove the sample identifier column; it is not used as a feature.
# Rebinding `data` (rather than inplace=True) with errors='ignore' keeps this
# cell idempotent: running it a second time is a no-op instead of a KeyError.
data = data.drop(columns='id', errors='ignore')

“diagnosis”字段的取值即分类结果为 B（良性）或 M（恶性），可以分别用 0 和 1 来替代

In [None]:
# Encode the target column: 'M' -> 1, 'B' -> 0.
DIAGNOSIS_CODES = {'M': 1, 'B': 0}

# Only map while the column still holds the raw labels. Series.map turns every
# value that is not a key of the dict into NaN, so re-running an unguarded map
# on the already-encoded 0/1 column would wipe out the target.
if not pd.api.types.is_numeric_dtype(data['diagnosis']):
    data['diagnosis'] = data['diagnosis'].map(DIAGNOSIS_CODES)

# Fail loudly here if any label was outside DIAGNOSIS_CODES.
assert data['diagnosis'].notna().all(), "unexpected value in 'diagnosis' column"

In [None]:
# Split the feature columns into three groups by their name suffix.
# Matching on the full '_mean' / '_se' / '_worst' suffix (instead of testing
# whether the letters appear anywhere in the name) prevents a column that
# merely contains e.g. "se" from being put into the wrong group.
features_mean = [col for col in data.columns if col.endswith('_mean')]
features_se = [col for col in data.columns if col.endswith('_se')]
features_worst = [col for col in data.columns if col.endswith('_worst')]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.countplot(data['diagnosis'],label = 'Count')

对三类特征分组（_mean、_se、_worst）：
对三组特征分别可视化相关系数（热力图）：

In [None]:
# Correlation matrix of the columns listed in features_mean, drawn as an
# annotated heatmap on an explicitly created 10x10 inch axes.
fig, ax = plt.subplots(figsize=(10, 10))
mean_corr = data[features_mean].corr()
sns.heatmap(mean_corr, annot=True, ax=ax)
plt.show()

In [None]:
# Correlation matrix of the columns listed in features_se, drawn as an
# annotated heatmap on an explicitly created 10x10 inch axes.
fig, ax = plt.subplots(figsize=(10, 10))
se_corr = data[features_se].corr()
sns.heatmap(se_corr, annot=True, ax=ax)
plt.show()

In [None]:
# Correlation matrix of the columns listed in features_worst, drawn as an
# annotated heatmap on an explicitly created 10x10 inch axes.
fig, ax = plt.subplots(figsize=(10, 10))
worst_corr = data[features_worst].corr()
sns.heatmap(worst_corr, annot=True, ax=ax)
plt.show()

特征选择：
通过热力图可以看出，这三组变量的模式/关系实际上是很接近的，我们就选取第一组mean的特征进行分析，根据相关系数从10个特征中又可以筛选出其中没那么相关的6个特征，以实现维度规约。

In [None]:
# The six *_mean columns kept as model inputs after the correlation review.
features_remain = [
    'radius_mean',
    'texture_mean',
    'smoothness_mean',
    'compactness_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]

提取特征和目标变量（diagnosis 已在前面编码为 0/1），并按 7:3 划分训练集和测试集

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it every run draws a different split, so the scores
# printed further down cannot be reproduced after a kernel restart.
RANDOM_STATE = 42

# Hold out 30% of the rows as the test set. Stratifying on the label keeps the
# class proportions of the full data in both parts.
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=data['diagnosis'],
)

# Model inputs are the selected feature columns; the target is 'diagnosis'.
train_X = train[features_remain]
train_y = train['diagnosis']
test_X = test[features_remain]
test_y = test['diagnosis']

模型训练和评估：数据已按 7:3 分为训练数据和测试数据，训练前先对特征做标准化

In [None]:
from sklearn.preprocessing import StandardScaler

# Z-score standardisation. The scaler is fitted on the training features only
# and the same fitted transform is then applied to both splits, so no
# statistics from the test set are used.
ss = StandardScaler()
ss.fit(train_X)
train_X = ss.transform(train_X)
test_X = ss.transform(test_X)

In [None]:
from sklearn import svm

# SVM with a linear kernel: fit on the training split, predict the test split.
# `metrics` is already imported in the first cell.
model1 = svm.SVC(kernel='linear')
model1.fit(train_X, train_y)
prediction = model1.predict(test_X)

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments reversed the classification report swaps
# precision and recall and counts support from the predictions.
# The prediction computed above is reused instead of predicting a second time.
print('Accuracy:', metrics.accuracy_score(test_y, prediction))
print(metrics.classification_report(test_y, prediction))

In [None]:
from sklearn import svm

# SVM with a sigmoid kernel: fit on the training split, predict the test split.
# `metrics` is already imported in the first cell.
model2 = svm.SVC(kernel='sigmoid')
model2.fit(train_X, train_y)
prediction = model2.predict(test_X)

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments reversed the classification report swaps
# precision and recall and counts support from the predictions.
# The prediction computed above is reused instead of predicting a second time.
print('Accuracy:', metrics.accuracy_score(test_y, prediction))
print(metrics.classification_report(test_y, prediction))

In [None]:
from sklearn import svm

# SVM with an RBF kernel and C=1: fit on the training split, predict the test
# split. `metrics` is already imported in the first cell.
# Note: the name `model3` is also assigned by the C=0.1 and C=10 cells, so it
# always refers to whichever of them ran last.
model3 = svm.SVC(kernel='rbf', C=1)
model3.fit(train_X, train_y)
prediction = model3.predict(test_X)

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments reversed the classification report swaps
# precision and recall and counts support from the predictions.
# The prediction computed above is reused instead of predicting a second time.
print('Accuracy:', metrics.accuracy_score(test_y, prediction))
print(metrics.classification_report(test_y, prediction))

In [None]:
from sklearn import svm

# SVM with an RBF kernel and C=0.1: fit on the training split, predict the
# test split. `metrics` is already imported in the first cell.
# Note: the name `model3` is also assigned by the C=1 and C=10 cells, so it
# always refers to whichever of them ran last.
model3 = svm.SVC(kernel='rbf', C=0.1)
model3.fit(train_X, train_y)
prediction = model3.predict(test_X)

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments reversed the classification report swaps
# precision and recall and counts support from the predictions.
# The prediction computed above is reused instead of predicting a second time.
print('Accuracy:', metrics.accuracy_score(test_y, prediction))
print(metrics.classification_report(test_y, prediction))

In [None]:
from sklearn import svm

# SVM with an RBF kernel and C=10: fit on the training split, predict the test
# split. `metrics` is already imported in the first cell.
# Note: the name `model3` is also assigned by the C=1 and C=0.1 cells, so it
# always refers to whichever of them ran last.
model3 = svm.SVC(kernel='rbf', C=10)
model3.fit(train_X, train_y)
prediction = model3.predict(test_X)

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments reversed the classification report swaps
# precision and recall and counts support from the predictions.
# The prediction computed above is reused instead of predicting a second time.
print('Accuracy:', metrics.accuracy_score(test_y, prediction))
print(metrics.classification_report(test_y, prediction))