# SVM classification with the Cancer dataset
- 2020.05.15

## 1. Import packages

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

## 2. Load dataset

In [8]:
# Read the cancer data from a CSV file in the working directory
dataset = pd.read_csv('cancer.csv')

# Preview the first five rows
dataset.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,benign
0,18.0,10.4,123.0,1000.0,0.118,0.278,0.3,0.147,0.242,0.0787,...,17.3,185.0,2020.0,0.162,0.666,0.712,0.265,0.46,0.119,0.0
1,20.6,17.8,133.0,1330.0,0.0847,0.0786,0.0869,0.0702,0.181,0.0567,...,23.4,159.0,1960.0,0.124,0.187,0.242,0.186,0.275,0.089,0.0
2,19.7,21.3,130.0,1200.0,0.11,0.16,0.197,0.128,0.207,0.06,...,25.5,153.0,1710.0,0.144,0.424,0.45,0.243,0.361,0.0876,0.0
3,11.4,20.4,77.6,386.0,0.142,0.284,0.241,0.105,0.26,0.0974,...,26.5,98.9,568.0,0.21,0.866,0.687,0.258,0.664,0.173,0.0
4,20.3,14.3,135.0,1300.0,0.1,0.133,0.198,0.104,0.181,0.0588,...,16.7,152.0,1580.0,0.137,0.205,0.4,0.163,0.236,0.0768,0.0


In [9]:
# (rows, columns) of the loaded DataFrame
data_shape = dataset.shape
print("The shape of data :", data_shape)

The shape of data : (569, 31)


## 3. Make Dataset

In [3]:
# The first N_FEATURES columns are the inputs; the column right after them is the label.
N_FEATURES = 30
X = dataset.iloc[:, :N_FEATURES]
Y = dataset.iloc[:, N_FEATURES]

In [5]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,18.0,10.4,123.0,1000.0,0.118,0.278,0.3,0.147,0.242,0.0787,...,25.4,17.3,185.0,2020.0,0.162,0.666,0.712,0.265,0.46,0.119
1,20.6,17.8,133.0,1330.0,0.0847,0.0786,0.0869,0.0702,0.181,0.0567,...,25.0,23.4,159.0,1960.0,0.124,0.187,0.242,0.186,0.275,0.089
2,19.7,21.3,130.0,1200.0,0.11,0.16,0.197,0.128,0.207,0.06,...,23.6,25.5,153.0,1710.0,0.144,0.424,0.45,0.243,0.361,0.0876
3,11.4,20.4,77.6,386.0,0.142,0.284,0.241,0.105,0.26,0.0974,...,14.9,26.5,98.9,568.0,0.21,0.866,0.687,0.258,0.664,0.173
4,20.3,14.3,135.0,1300.0,0.1,0.133,0.198,0.104,0.181,0.0588,...,22.5,16.7,152.0,1580.0,0.137,0.205,0.4,0.163,0.236,0.0768


In [6]:
# Preview the first five labels
Y.head(n=5)

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: benign, dtype: float64

In [10]:
# Features and labels should have the same number of rows
for caption, shape in (("The shape of X :", X.shape), ("The shape of Y :", Y.shape)):
    print(caption, shape)

The shape of X : (569, 30)
The shape of Y : (569,)


In [12]:
# Stratify on Y so both subsets keep the class proportions of the full data;
# random_state pins the shuffle so the split is the same on every run.
# test_size is not passed, so train_test_split's default is used.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    stratify=Y,
    random_state=0,
)

In [15]:
# Sanity check: sizes of the training and test label vectors
for labels in (y_train, y_test):
    print(labels.shape)

(426,)
(143,)


## 4. SVM

In [17]:
# Baseline classifier. C and gamma are written out at their default values
# because they are the two hyperparameters varied in the later cells.
svc = SVC(C=1.0, gamma='scale')
svc.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## 5. Accuracy Analysis

In [20]:
# Evaluate the baseline model: predicted test labels, then accuracy on each split
svc_test_pred = svc.predict(x_test)
svc_train_acc = svc.score(x_train, y_train)
svc_test_acc = svc.score(x_test, y_test)

print("prediction :", svc_test_pred)
print("train accuracy :", svc_train_acc)
print("test accuracy :", svc_test_acc)

prediction : [1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1.
 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0.
 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0.]
train accuracy : 0.9225352112676056
test accuracy : 0.916083916083916


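## 6. Underfit? (high bias)

The default model scores about 92% on both the training set and the test set. Accuracy that is similar and modest on both suggests the model may be underfitting, so the cells below change `C` and `gamma` to make it more flexible.

Note: the settings below are compared using test accuracy, so the final test score is an optimistic estimate. A separate validation split or cross-validation would give a fairer comparison.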

In [21]:
# Regularization strength is inversely proportional to C, so a much larger C
# lets the model follow the training data more closely (variance up).
svc1 = SVC(C=1000)
svc1.fit(x_train, y_train)

SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [22]:
# Evaluate the C=1000 model: predicted test labels, then accuracy on each split
svc1_test_pred = svc1.predict(x_test)
svc1_train_acc = svc1.score(x_train, y_train)
svc1_test_acc = svc1.score(x_test, y_test)

print("prediction :", svc1_test_pred)
print("train accuracy :", svc1_train_acc)
print("test accuracy :", svc1_test_acc)

prediction : [1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1.
 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0.
 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0.]
train accuracy : 0.9624413145539906
test accuracy : 0.9440559440559441


In [37]:
# Keep C=1000 and set the RBF kernel coefficient explicitly instead of using
# the 'scale' default. The original note marks this as a further "variance up" step.
svc2 = SVC(C=1000, gamma=1e-5)
svc2.fit(x_train, y_train)

SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1e-05, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [38]:
# Evaluate the C=1000, gamma=1e-5 model: predicted test labels, then accuracy on each split
svc2_test_pred = svc2.predict(x_test)
svc2_train_acc = svc2.score(x_train, y_train)
svc2_test_acc = svc2.score(x_test, y_test)

print("prediction :", svc2_test_pred)
print("train accuracy :", svc2_train_acc)
print("test accuracy :", svc2_test_acc)

prediction : [1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1.
 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0.
 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0.]
train accuracy : 0.9835680751173709
test accuracy : 0.958041958041958
