In [2]:
import pandas as pd 
from sklearn.datasets import load_breast_cancer



In [3]:
# Load the breast-cancer dataset as pandas objects (as_frame=True).
cancer_data = load_breast_cancer(as_frame=True)

# `frame` already combines the feature columns with the `target` column.
# Working on a copy keeps `cancer_data.data` label-free, instead of adding
# `target` to the Bunch's own feature frame as a side effect.
cancer_df = cancer_data.frame.copy()

Exploratory data analysis

In [4]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [5]:
# Dimensions of the frame, printed as a (rows, columns) tuple.
n_rows, n_cols = cancer_df.shape
print((n_rows, n_cols))

(569, 31)


In [6]:
# Count of missing entries in each column (isna is the alias of isnull).
missing_per_column = cancer_df.isna().sum()
print(missing_per_column)

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [7]:
# Class balance of the label column: how many rows fall in each target class.
# Per the scikit-learn dataset description, 0 encodes malignant and 1 benign.
target_counts = cancer_df['target'].value_counts()
print(target_counts)

target
1    357
0    212
Name: count, dtype: int64


In [8]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
summary_stats = cancer_df.describe()
print(summary_stats)

       mean radius  mean texture  mean perimeter    mean area  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630          0.019380        0.000000             0.000000   
25%      

In [None]:
# Data preparation: separate the label from the features, then hold out a test set.
from sklearn.model_selection import train_test_split

y = cancer_df['target']
X = cancer_df.drop(columns=['target'])

# 15% of the rows go to the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=417,
)

In [10]:
# Baseline classifier: a linear SVM with hinge loss and an L2 penalty.
from sklearn.svm import LinearSVC

model = LinearSVC(
    C=10,
    loss='hinge',
    penalty='l2',
    random_state=417,
)
# fit() stays as the cell's last expression, as in the original cell.
model.fit(X_train, y_train)



In [None]:
# Evaluation 1: score the current `model` on the held-out test split.
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)

0.8255813953488372


In [14]:
# Fine-tuning: same C, penalty and seed as before, but with squared hinge loss
# and an explicit iteration cap of 3500. Rebinds `model` to the new estimator.
model = LinearSVC(
    C=10,
    loss='squared_hinge',
    max_iter=3500,
    penalty='l2',
    random_state=417,
)
model.fit(X_train, y_train)

In [15]:
# Evaluation 2: score the re-fitted `model` on the held-out test split.
# NOTE(review): this is the same X_test / y_test used to score the first
# configuration, so the test set also guided the hyperparameter choice;
# consider a separate validation split or cross-validation for tuning.
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)

0.9534883720930233
