### **Import Libraries**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### **Read Data**

In [15]:
# Load the dataset from the CSV file that sits next to the notebook.
csv_path = "breast-cancer-data.csv"
data = pd.read_csv(csv_path)

### **EDA**

In [16]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(699, 11)

In [17]:
# Column labels of the loaded frame.
data.columns

Index(['id', 'clump_thickness', 'size_uniformity', 'shape_uniformity',
       'marginal_adhesion', 'epithelial_size', 'bare_nucleoli',
       'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class'],
      dtype='object')

In [18]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,id,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [26]:
# Column dtypes and non-null counts.
# NOTE(review): the recorded output (683 entries, bare_nucleoli as int64)
# appears to come from running this cell (In [26]) after the cleaning cells
# In [23] and In [25] that follow it. On a fresh top-to-bottom run it would
# describe the unfiltered frame instead -- confirm and consider moving this
# cell below the cleaning steps.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   id                 683 non-null    int64
 1   clump_thickness    683 non-null    int64
 2   size_uniformity    683 non-null    int64
 3   shape_uniformity   683 non-null    int64
 4   marginal_adhesion  683 non-null    int64
 5   epithelial_size    683 non-null    int64
 6   bare_nucleoli      683 non-null    int64
 7   bland_chromatin    683 non-null    int64
 8   normal_nucleoli    683 non-null    int64
 9   mitoses            683 non-null    int64
 10  class              683 non-null    int64
dtypes: int64(11)
memory usage: 64.0 KB


In [23]:
# Keep only the rows whose bare_nucleoli entry parses as a number; entries
# that cannot be parsed are coerced to NaN and filtered out.
nucleoli_numeric = pd.to_numeric(data['bare_nucleoli'], errors='coerce')
# .copy() makes the filtered result an independent frame, so later column
# assignments on `data` do not raise SettingWithCopyWarning.
data = data[nucleoli_numeric.notnull()].copy()

In [25]:
# Cast bare_nucleoli to integers. Building a new frame with .assign avoids
# writing a column into what may be a filtered slice of the original frame
# (the SettingWithCopyWarning case).
data = data.assign(bare_nucleoli=data['bare_nucleoli'].astype(int))

In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
data.describe()

Unnamed: 0,id,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,1076720.0,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,620644.0,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,63375.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,877617.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,1171795.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,1238705.0,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


### **Preprocessing**

In [30]:
# The nine feature columns used as model inputs; `id` and the `class` target
# are left out.
feature_columns = [
    'clump_thickness', 'size_uniformity', 'shape_uniformity',
    'marginal_adhesion', 'epithelial_size', 'bare_nucleoli',
    'bland_chromatin', 'normal_nucleoli', 'mitoses',
]
X = data[feature_columns].to_numpy()
# Select the target by name rather than by position, so that adding or
# reordering columns cannot silently change what is being predicted.
y = data['class'].to_numpy()

In [31]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature matrix: learn the scaling statistics from every row
# of X, then apply them to X.
std_scaler = StandardScaler()
std_scaler.fit(X)
scaled_X = std_scaler.transform(X)

### **Model Building**

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [34]:
# Split the unscaled features first, then fit the scaler on the training rows
# only. Fitting the scaler on the full matrix before splitting lets the test
# rows' mean/variance leak into the training features, which makes the
# reported test scores optimistic.
# The split keeps the same test_size, random_state and stratification as
# before, so the same rows land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2, stratify=y
)
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

In [36]:
# Support vector classifier with a linear kernel and C=0.3.
# NOTE(review): per the scikit-learn docs, gamma is the coefficient for the
# rbf/poly/sigmoid kernels, so it looks unused with kernel='linear' -- confirm.
svc_settings = {'kernel': 'linear', 'gamma': 'auto', 'C': 0.3}
model = SVC(**svc_settings)
model.fit(X_train, y_train)

SVC(C=0.3, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### **Prediction and Evaluation**

In [37]:
# Predict class labels for the held-out test rows.
prediction = model.predict(X_test)

In [38]:
# Per-class precision / recall / F1 on the test split, printed as plain text.
report_text = classification_report(y_test, prediction)
print(report_text)

              precision    recall  f1-score   support

           2       0.98      0.98      0.98        89
           4       0.96      0.96      0.96        48

    accuracy                           0.97       137
   macro avg       0.97      0.97      0.97       137
weighted avg       0.97      0.97      0.97       137



In [39]:
# Score the fitted model on the test split; the recorded value agrees with
# the accuracy row of the classification report.
model.score(X_test, y_test)

0.9708029197080292

In [40]:
# Confusion matrix comparing the true test labels (first argument) with the
# model's predictions (second argument).
confusion_matrix(y_test,prediction)

array([[87,  2],
       [ 2, 46]])