# Breast Cancer Wisconsin with SGD Logistic Regression and SVM

https://www.kaggle.com/datasets/priyanka841/breast-cancer-wisconsin?select=breast+cancer.csv

### Import Libraries

In [109]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor
from mlxtend.plotting import plot_decision_regions
from pandas_profiling import ProfileReport
from sklearn.metrics import *
from sklearn.svm import SVC

%matplotlib inline

### Read CSV file

In [59]:
cancer = pd.read_csv('breast cancer.csv')

### Descriptive analysis using ProfileReporter

In [60]:
profile = ProfileReport(cancer, title = 'cancer',correlations = None, interactions = None, html = {'style':{'full_width' : True}})
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



### Delete unnecessary variables

In [61]:
cancer.drop(columns=['Unnamed: 32','id'],axis=1,inplace=True)

### Transform Target Variable into boolean

In [62]:
cancer = pd.get_dummies(data=cancer, drop_first=True,columns=['diagnosis'])

### Define features and target variables

In [63]:
target = ['diagnosis_M']
features = [x for x in cancer.columns if x not in target]

### Normalize Data

In [64]:
cancer_norm = cancer.copy()

for i in features:    
    cancer_norm[i] = (cancer_norm[i] - cancer_norm[i].mean()) / cancer_norm[i].std()

### Specific check correlation with target¶

In [107]:
corrwith = pd.DataFrame({'corrwith':cancer_norm.corrwith(cancer_norm['diagnosis_M'])*100})
corrwith

Unnamed: 0,corrwith
radius_mean,73.002851
texture_mean,41.51853
perimeter_mean,74.263553
area_mean,70.898384
smoothness_mean,35.855997
compactness_mean,59.653368
concavity_mean,69.635971
concave points_mean,77.661384
symmetry_mean,33.049855
fractal_dimension_mean,-1.28376


### Check for Multicollinearity

In [66]:
vif_data = pd.DataFrame()
vif_data["feature"] = cancer_norm.columns

vif_data["VIF"] = [variance_inflation_factor(cancer_norm.values, i) for i in range(len(cancer_norm.columns))]
  
print(vif_data)

                    feature          VIF
0               radius_mean  3809.184240
1              texture_mean    11.886040
2            perimeter_mean  3788.134356
3                 area_mean   347.943847
4           smoothness_mean     8.194290
5          compactness_mean    50.764241
6            concavity_mean    70.832441
7       concave points_mean    60.077724
8             symmetry_mean     4.220698
9    fractal_dimension_mean    15.756978
10                radius_se    75.537838
11               texture_se     4.205496
12             perimeter_se    70.370498
13                  area_se    41.172282
14            smoothness_se     4.039730
15           compactness_se    15.366331
16             concavity_se    15.755193
17        concave points_se    11.542952
18              symmetry_se     5.176451
19     fractal_dimension_se     9.719850
20             radius_worst   803.743215
21            texture_worst    18.580056
22          perimeter_worst   405.058223
23              

### Delete variables with Multicollinearity

In [67]:
feature_clean_col = [x for x in features if x not in ['perimeter_mean','area_mean','perimeter_se','area_se',
                                                      'texture_worst','perimeter_worst','area_worst','compactness_mean',
                                                      'concavity_mean','concave points_mean','concavity_se',
                                                      'smoothness_worst','compactness_worst','concavity_worst',
                                                      'concave points_worst','symmetry_worst','fractal_dimension_worst',
                                                      'radius_worst','fractal_dimension_mean','compactness_se']]


### Check again for Multicollinearity

In [68]:
vif_data = pd.DataFrame()
vif_data["feature"] = cancer_norm[feature_clean_col].columns

vif_data["VIF"] = [variance_inflation_factor(cancer_norm[feature_clean_col].values, i) for i in range(len(cancer_norm[feature_clean_col].columns))]
  
print(vif_data)

                feature       VIF
0           radius_mean  3.142724
1          texture_mean  1.466896
2       smoothness_mean  1.746867
3         symmetry_mean  1.910700
4             radius_se  2.714998
5            texture_se  1.744077
6         smoothness_se  1.787206
7     concave points_se  2.425594
8           symmetry_se  1.775449
9  fractal_dimension_se  1.965055


### Split Train / Test

In [69]:
X_train, X_test, y_train, y_test = train_test_split(cancer[feature_clean_col], cancer[target], test_size = 0.3, random_state = 42 )

### Train SGD Model

In [99]:
sgd = SGDClassifier(loss="log_loss", alpha=0.01, max_iter=1000, random_state=42)
sgd.fit(X_train, y_train.values.ravel())

In [110]:
y_pred_sgd = sgd.predict(X_test)

In [114]:
print('SGDClassifier')
print('accuracy_score:  ', accuracy_score (y_pred= y_pred_sgd, y_true=y_test.values.ravel()))
print('precision_score: ', precision_score(y_pred= y_pred_sgd, y_true=y_test.values.ravel()))
print('recall:          ', recall_score   (y_pred= y_pred_sgd, y_true=y_test.values.ravel()))
print('F1:              ', f1_score       (y_pred= y_pred_sgd, y_true=y_test.values.ravel()))
print('AUC Score :      ', roc_auc_score  (y_score=y_pred_sgd, y_true=y_test.values.ravel()))

SGDClassifier
accuracy_score:   0.8947368421052632
precision_score:  1.0
recall:           0.7142857142857143
F1:               0.8333333333333333
AUC Score :       0.8571428571428572


### Train SVM model

In [113]:
SVM = SVC()
SVM.fit(X_train, y_train.values.ravel())
y_pred_svm= SVM.predict(X_test)

print('SVM')
print('accuracy_score:  ', accuracy_score (y_pred =y_pred_svm, y_true=y_test.values.ravel()))
print('precision_score: ', precision_score(y_pred =y_pred_svm, y_true=y_test.values.ravel()))
print('recall:          ', recall_score   (y_pred =y_pred_svm, y_true=y_test.values.ravel()))
print('F1:              ', f1_score       (y_pred =y_pred_svm, y_true=y_test.values.ravel()))
print('AUC Score :      ', roc_auc_score  (y_score=y_pred_svm, y_true=y_test.values.ravel()))

SVM
accuracy_score:   0.8947368421052632
precision_score:  0.9245283018867925
recall:           0.7777777777777778
F1:               0.8448275862068966
AUC Score :       0.8703703703703703
