In [None]:
SVM case study

In [None]:
Target: 
-SVM model training
-Evaluation

In [None]:
Step1: Load the data

In [1]:
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset bundled with scikit-learn; its fields are accessed by key below.
data = load_breast_cancer()
print(data.keys()) # list the fields available in the loaded dataset

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [2]:
print(data['DESCR'])
# Print the description shipped with the sklearn dataset:
# 30 numeric attributes and 2 classes (malignant / benign).

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [3]:
print(data['target_names']) # 2 classes in total -> this is a binary classification problem

['malignant' 'benign']


In [4]:
print(data['feature_names']) # names of the input features (one per column of data['data'])

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [5]:
print(data['data'].shape) # (instances, features): 569 instances in total

(569, 30)


In [None]:
Step2: Data Visualization

In [6]:
import pandas as pd # pandas gives a labelled, tabular view of the data for inspection
import numpy as np
# Glue the label vector onto the feature matrix as one extra trailing column ...
table_values = np.column_stack((data['data'], data['target']))
# ... and name that extra column 'target', after the feature names.
table_columns = np.append(data['feature_names'], ['target'])
df_data = pd.DataFrame(table_values, columns=table_columns)
df_data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0.0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0.0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0.0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0.0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0.0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0.0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0.0


In [7]:
# Pairwise correlation between all columns (features and target).
# In the 'target' column, a larger absolute value means the feature varies more strongly with the label;
# values close to +/-1 between two features (e.g. mean radius vs. mean perimeter) indicate redundant features.
df_data.corr()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
mean radius,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,-0.311631,...,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066,-0.730029
mean texture,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,-0.076437,...,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205,-0.415185
mean perimeter,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,-0.261477,...,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019,-0.742636
mean area,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,-0.28311,...,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738,-0.708984
mean smoothness,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,0.584792,...,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316,-0.35856
mean compactness,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,0.565369,...,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382,-0.596534
mean concavity,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,0.336783,...,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493,-0.69636
mean concave points,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,0.166917,...,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661,-0.776614
mean symmetry,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,0.479921,...,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413,-0.330499
mean fractal dimension,-0.311631,-0.076437,-0.261477,-0.28311,0.584792,0.565369,0.336783,0.166917,0.479921,1.0,...,-0.051269,-0.205151,-0.231854,0.504942,0.458798,0.346234,0.175325,0.334019,0.767297,0.012838


In [None]:
Step3: Apply the SVM model

In [8]:
# Reorganize the data: feature matrix as a DataFrame with named columns (no target column)
x = pd.DataFrame(data['data'], columns = data['feature_names'])
x

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [9]:
# Labels: the 'target' column of df_data (stored there as floats, 0.0 / 1.0)
y = df_data['target']

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed (42) makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)
# Print the shape of each piece: train features, train labels, test features, test labels.
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

(455, 30)
(455,)
(114, 30)
(114,)


In [11]:
from sklearn.svm import SVC 
# Linear-kernel SVM with a very large penalty (C=1000): margin violations are penalized heavily,
# which approximates a hard-margin SVM (it is still a soft-margin model for any finite C).
svm_linear_clf = SVC(C=1000, kernel='linear')
svm_linear_clf.fit(x_train, y_train)

In [12]:
# RBF-kernel SVM with gamma fixed at 0.1 (C is not passed, so its default is used),
# trained on the unscaled features
svm_rbf_clf = SVC(kernel = 'rbf', gamma=0.1)
svm_rbf_clf.fit(x_train, y_train)

In [None]:
Step4: Evaluation

In [13]:
from sklearn.metrics import classification_report, confusion_matrix
# Predict the held-out test set with the linear SVM
svm_linear_clf_y_predict = svm_linear_clf.predict(x_test)
# Confusion matrix of true vs. predicted labels (stored in a variable; not displayed by this cell)
svm_linear_clf_cm = confusion_matrix(y_test, svm_linear_clf_y_predict)

In [14]:
# Per-class precision / recall / F1 of the linear SVM on the test set
print(classification_report(y_test, svm_linear_clf_y_predict))

              precision    recall  f1-score   support

         0.0       0.97      0.91      0.94        43
         1.0       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [15]:
# Same evaluation for the RBF SVM that was trained on the unscaled features
svm_rbf_clf_y_predict = svm_rbf_clf.predict(x_test)

# Confusion matrix is stored in a variable; it is not displayed by this cell
svm_rbf_clf_cm = confusion_matrix(y_test, svm_rbf_clf_y_predict)

# NOTE: in the recorded output class 0.0 has precision and recall 0.00 (it is never predicted),
# which is most likely what triggers the metric warnings printed after the report.
print(classification_report(y_test, svm_rbf_clf_y_predict))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        43
         1.0       0.62      1.00      0.77        71

    accuracy                           0.62       114
   macro avg       0.31      0.50      0.38       114
weighted avg       0.39      0.62      0.48       114



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
Step5: Improvement 

In [16]:
# The attributes live on very different scales (some average in the hundreds, others below 0.01),
# so rescaling the data is a sensible next step.
# Mean normalization: scaled_data = (x - x.mean()) / (x.max() - x.min())
# Per-feature mean, computed on the training split only.
x_mean = x_train.mean()
x_mean

mean radius                 14.117635
mean texture                19.185033
mean perimeter              91.882242
mean area                  654.377582
mean smoothness              0.095744
mean compactness             0.103619
mean concavity               0.088898
mean concave points          0.048280
mean symmetry                0.181099
mean fractal dimension       0.062757
radius error                 0.402016
texture error                1.202687
perimeter error              2.858253
area error                  40.071299
smoothness error             0.006989
compactness error            0.025635
concavity error              0.032824
concave points error         0.011894
symmetry error               0.020574
fractal dimension error      0.003820
worst radius                16.235103
worst texture               25.535692
worst perimeter            107.103121
worst area                 876.987033
worst smoothness             0.131532
worst compactness            0.252742
worst concav

In [17]:
# Per-feature maximum of the training split (used for the normalization range)
x_max = x_train.max()
x_max

mean radius                  28.11000
mean texture                 39.28000
mean perimeter              188.50000
mean area                  2501.00000
mean smoothness               0.16340
mean compactness              0.31140
mean concavity                0.42680
mean concave points           0.20120
mean symmetry                 0.30400
mean fractal dimension        0.09744
radius error                  2.87300
texture error                 4.88500
perimeter error              21.98000
area error                  542.20000
smoothness error              0.03113
compactness error             0.13540
concavity error               0.39600
concave points error          0.05279
symmetry error                0.06146
fractal dimension error       0.02984
worst radius                 36.04000
worst texture                49.54000
worst perimeter             251.20000
worst area                 4254.00000
worst smoothness              0.21840
worst compactness             0.93790
worst concav

In [18]:
# Per-feature minimum of the training split (used for the normalization range)
x_min = x_train.min()
x_min

mean radius                  7.691000
mean texture                 9.710000
mean perimeter              47.920000
mean area                  170.400000
mean smoothness              0.052630
mean compactness             0.019380
mean concavity               0.000000
mean concave points          0.000000
mean symmetry                0.116700
mean fractal dimension       0.049960
radius error                 0.111500
texture error                0.360200
perimeter error              0.757000
area error                   6.802000
smoothness error             0.001713
compactness error            0.002252
concavity error              0.000000
concave points error         0.000000
symmetry error               0.007882
fractal dimension error      0.000895
worst radius                 8.678000
worst texture               12.020000
worst perimeter             54.490000
worst area                 223.600000
worst smoothness             0.071170
worst compactness            0.027290
worst concav

In [19]:
# Mean-normalize the training features: center on the training mean, divide by the training range
x_train_scaled = (x_train - x_mean) / (x_max - x_min)
x_train_scaled

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
68,-0.249211,-0.062734,-0.235398,-0.173293,0.098005,0.129035,0.525075,-0.022514,0.160178,0.372857,...,-0.216545,-0.076911,-0.211495,-0.137030,0.113210,0.201797,0.780675,0.208996,0.260788,0.285115
181,0.341465,0.249745,0.361486,0.281740,0.165713,0.614960,0.374419,0.503579,0.311806,0.236378,...,0.381730,0.211735,0.352788,0.300718,0.119323,0.555296,0.322209,0.605216,0.235162,0.377519
63,-0.242159,-0.180082,-0.232481,-0.168831,-0.167320,-0.055165,-0.067990,-0.131610,0.282976,0.144761,...,-0.227509,-0.168062,-0.211037,-0.140653,-0.225308,-0.093280,-0.107743,-0.217568,0.074311,0.008750
248,-0.169824,0.204091,-0.169813,-0.131888,0.007457,-0.107114,-0.152550,-0.159691,0.045923,0.011231,...,-0.145644,0.257311,-0.148051,-0.104527,0.124756,-0.124029,-0.129469,-0.181520,0.099345,-0.020328
60,-0.193331,-0.145588,-0.194425,-0.146948,0.159393,-0.078794,-0.182892,-0.175844,0.497604,0.144129,...,-0.190597,-0.215504,-0.189330,-0.126138,-0.027387,-0.169207,-0.202008,-0.303753,0.128519,-0.031094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,-0.256116,-0.153704,-0.235398,-0.176082,0.018832,0.169443,-0.006650,-0.097216,0.048592,0.569571,...,-0.237633,-0.262945,-0.226441,-0.147029,-0.073573,-0.010039,-0.104788,-0.227911,-0.128331,0.207970
106,-0.121340,-0.028916,-0.118881,-0.103783,0.166615,-0.006573,-0.042639,-0.066749,-0.005332,0.051458,...,-0.113117,0.099262,-0.109771,-0.088152,0.253127,0.014560,0.010148,0.026178,-0.019519,0.060208
270,0.008441,-0.079981,-0.011255,-0.009344,-0.283958,-0.263233,-0.191303,-0.208896,-0.161766,-0.189485,...,-0.048429,-0.130216,-0.064375,-0.047734,-0.311500,-0.222249,-0.188446,-0.277843,-0.088118,-0.192166
435,-0.006741,0.014710,-0.005422,-0.023547,0.092588,0.033151,0.055534,0.081263,-0.075807,0.056513,...,0.029417,0.140307,0.034553,-0.001907,0.202186,0.114273,0.105675,0.235456,0.054007,0.183386


In [20]:
# Scale the test features with the statistics of the TRAINING split (x_mean, x_max, x_min),
# so no information from the test set is used to fit the transformation.
x_test_scaled = (x_test - x_mean) / (x_max - x_min)
x_test_scaled

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
204,-0.080691,-0.019785,-0.076769,-0.074006,0.035262,0.007468,-0.020731,-0.050049,0.060872,0.020498,...,-0.046236,-0.023872,-0.056190,-0.049396,0.075174,-0.016409,-0.005986,-0.043582,0.021482,0.030791
70,0.236170,0.071862,0.225621,0.204077,-0.051043,-0.002463,0.044756,0.155219,-0.122257,-0.171583,...,0.315214,0.027833,0.298901,0.245388,-0.083082,-0.021021,-0.004708,0.222398,-0.069786,-0.152406
131,0.065741,0.009975,0.069838,0.040557,0.121477,0.063971,0.135196,0.161979,0.064075,-0.101027,...,0.110551,0.012375,0.090473,0.069227,0.156679,-0.014651,0.083471,0.127896,-0.013409,-0.031179
431,-0.084119,-0.050897,-0.074066,-0.080056,0.087171,0.095818,-0.026917,-0.100844,0.000007,0.174036,...,-0.122619,-0.069981,-0.088928,-0.089616,0.091475,0.011155,-0.027392,-0.139114,-0.068800,0.082419
540,-0.126237,-0.160468,-0.122580,-0.107903,0.036977,0.028699,-0.050441,-0.111033,0.003744,0.106639,...,-0.145278,-0.156069,-0.143984,-0.104006,0.020158,-0.044961,-0.075794,-0.154647,-0.113547,-0.021430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,0.025582,-0.078966,0.016558,0.004987,-0.084265,-0.125469,-0.086640,-0.101242,-0.214622,-0.193908,...,0.008219,-0.002550,-0.005608,-0.011410,-0.117721,-0.050232,-0.024676,-0.123375,-0.088709,-0.151813
75,0.095615,0.015724,0.086910,0.070077,-0.036689,-0.066363,0.020599,0.089961,-0.006934,-0.186326,...,0.129190,-0.026005,0.110299,0.085851,0.125435,-0.052977,0.006634,0.129958,-0.050270,-0.169531
249,-0.127217,-0.143897,-0.128128,-0.106444,0.050158,-0.087457,-0.106884,-0.094383,0.038448,-0.022678,...,-0.131025,-0.115823,-0.133309,-0.095570,0.050043,-0.103822,-0.075235,-0.062207,-0.047511,-0.048981
238,0.005013,0.293032,0.004750,-0.013077,-0.122001,0.000961,0.050145,-0.020874,-0.250393,-0.030892,...,-0.017729,0.399902,-0.023401,-0.028034,-0.159153,-0.011137,0.025404,-0.109939,-0.200083,-0.050083


In [21]:
# Linear SVM (C=1000) retrained on the normalized features, then evaluated on the normalized test set.
# fit() returns the estimator itself, so construction and training can be chained.
scaled_svm_clf = SVC(kernel='linear', C=1000).fit(x_train_scaled, y_train)
predict1 = scaled_svm_clf.predict(x_test_scaled)
print(classification_report(y_test, predict1))

              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94        43
         1.0       0.97      0.96      0.96        71

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114



In [22]:
# RBF SVM (gamma=0.1) retrained on the normalized features
clf = SVC(gamma=0.1, kernel='rbf')
clf.fit(x_train_scaled, y_train)
# Evaluate on the normalized test set and print the per-class report
rbf_scaled_report = classification_report(y_test, clf.predict(x_test_scaled))
print(rbf_scaled_report)

              precision    recall  f1-score   support

         0.0       0.97      0.91      0.94        43
         1.0       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [None]:
Step6: Best parameters!
To choose the best hyperparameters, we run a cross-validated grid search (GridSearchCV) over a set of candidate values.

In [30]:
# Candidate hyperparameter values intended as the search space for the grid search.
# NOTE: per scikit-learn's SVC documentation, gamma is the kernel coefficient for 'rbf', 'poly'
# and 'sigmoid' only, so the 'linear' candidates do not depend on the gamma value.
params = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.1, 1, 10, 100, 1000], 'kernel': ['rbf', 'linear', 'poly', 'sigmoid']} 

In [32]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over the `params` grid with 10-fold cross-validation.
# `params` is the dictionary of candidate values defined in the preceding cell; it has to be
# passed under that name, because no variable called `param_grid` exists in this notebook.
# Candidates are scored by accuracy, a classification metric. For 0/1 labels it orders the
# candidates the same way as negative mean squared error (MSE = 1 - accuracy), but the
# reported score is directly interpretable.
grid = GridSearchCV(SVC(), param_grid=params, cv=10, scoring='accuracy')

In [33]:
# Run the cross-validated search on the normalized training data
grid.fit(x_train_scaled,y_train)

In [39]:
# Parameter combination selected by the search
print(grid.best_params_)


{'C': 100, 'gamma': 0.1, 'kernel': 'sigmoid'}


In [40]:
# The estimator configured with the selected parameters
print(grid.best_estimator_)

SVC(C=100, gamma=0.1, kernel='sigmoid')


In [41]:
# Predict the normalized test set through the fitted search object
grid_predictions = grid.predict(x_test_scaled)

In [42]:
# Per-class precision / recall / F1 of the grid-search predictions on the test set
print(classification_report(y_test,grid_predictions))

              precision    recall  f1-score   support

         0.0       0.98      0.95      0.96        43
         1.0       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

