# Breast Cancer Classification using Machine Learning
### 
## Import Dependencies


In [39]:
import numpy as np
import pandas as pd
import sklearn.datasets

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier


In [41]:
from matplotlib import pyplot as plt
# It allows matplotlib plots to be displayed directly in the notebook output, without the need to call plt.show() explicitly.
%matplotlib inline
from sklearn.metrics import accuracy_score


## Data Collection and preprocess

### Loard dataset from sklearn 

In [42]:
breast_cancer_dataset = sklearn.datasets.load_breast_cancer()

In [6]:
print(breast_cancer_dataset)

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 

In [43]:
df = pd.DataFrame(breast_cancer_dataset.data, columns= breast_cancer_dataset.feature_names)

In [44]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [54]:
# Add target Column to the dataframe
df['label']= breast_cancer_dataset.target

In [45]:
df.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


In [46]:
df.shape

(569, 30)

#### There are 569 rows and 31 columns in this dataset

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [51]:
df.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64

#### There are no missing values and datatype is float and last column is integer

In [48]:
# Statistical information about the dataset
df.describe(include='all')

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [55]:
# Check how many benign and malignant cases so checking distribution of target variable
df['label'].value_counts()


label
1    357
0    212
Name: count, dtype: int64

### 0 represents malignant so there are 212 cases belong to this
### 1 represents benign, 357 cases belong to this

In [56]:
df.groupby('label').mean()

Unnamed: 0_level_0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153
1,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442


### Seperate features

In [57]:
x = df.drop(columns='label', axis=1)
y=df['label']

In [18]:
print(x)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [58]:
print(y)

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: label, Length: 569, dtype: int64


In [59]:
# Split for training and testing
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=2)

In [60]:
print(x.shape,x_train.shape, x_test.shape,)

(569, 30) (455, 30) (114, 30)


In [62]:
# Normalization
sc = StandardScaler()
X_train_sc = sc.fit_transform(x_train)
X_test_sc = sc.fit_transform(x_test)

In [63]:
#model Training
from sklearn.model_selection import train_test_split, KFold,GridSearchCV
model = LogisticRegression(max_iter=1000)

#Tuning parameters
parameters = {'C':[0.001,0.01, 0.1, 0.2,0.3, 1,10,100,200]}
grid_search = GridSearchCV(model, parameters, cv = 10) # 10 Fold Cross Validation

# Model fitting
grid_search.fit(X_train_sc, y_train)


print("Best Score is ", grid_search.best_score_)
print("Best Estimator is ", grid_search.best_estimator_)
print("Best Parametes are", grid_search.best_params_)

Best Score is  0.980144927536232
Best Estimator is  LogisticRegression(C=0.1, max_iter=1000)
Best Parametes are {'C': 0.1}


In [64]:
model = LogisticRegression(C = 0.1)
model.fit(X_train_sc, y_train)
y_pred_lr = model.predict(X_test_sc)
accuracy_lr = accuracy_score(y_test,y_pred_lr)
print("Accuracy on Test Data:",accuracy_lr)

Accuracy on Test Data: 0.956140350877193


In [65]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       1.00      0.89      0.94        45
           1       0.93      1.00      0.97        69

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



In [66]:
confusion_matrix(y_test,y_pred_lr)
lr_cm = confusion_matrix(y_test, y_pred_lr)
lr_cm = pd.DataFrame(lr_cm, columns=['Benign', 'Malignant'], index=['Benign','Malignant'])
lr_cm

Unnamed: 0,Benign,Malignant
Benign,40,5
Malignant,0,69


### Model training
### Logistic Regression

In [67]:
model = LogisticRegression()

In [68]:
print(model)

LogisticRegression()


In [69]:
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Model Evaluation

### Accuracy Score

In [70]:
# Accuracy on training data
x_train_prediction = model.predict(x_train)
train_data_accuracy = accuracy_score(y_train,x_train_prediction)

In [71]:
print('accuracy on training data =',train_data_accuracy)

accuracy on training data = 0.9340659340659341


In [32]:
# Accuracy on testing data
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test,x_test_prediction)

In [72]:
print('accuracy on testing data =',test_data_accuracy)

accuracy on testing data = 0.9298245614035088


#### Here Logistic Regression performed the best with an accuracy of 92.98%, likely due to its ability to model linear relationships in the data. 

In [73]:
# Predict the model
input_data = (13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,0.1839,2.342,1.17,14.16,0.004352,
              0.004899,0.01343,0.01164,0.02671,0.001777,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169)
#change input_data into a numpy array
input_data_as_numpy_array = np.asarray(input_data)

#reshape the numpy array as we are predicted for one datapoint
input_reshaped =input_data_as_numpy_array.reshape(1,-1)

In [74]:
prediction = model.predict(input_reshaped)
print(prediction)

[1]




In [75]:
print(prediction)
if (prediction[0]==0):
    print("Case is Malignant")

else:
    print("Case is Benign")


[1]
Case is Benign


In [81]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

### DecisionTree Classifier

In [92]:
#model building
model = DecisionTreeClassifier()

In [85]:
# Tunning Paramaters
parameters = {'min_samples_split': [2,3,4,5,6,7,8,9,10], 
              'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
              'max_features': ['auto', 'sqrt', 'log2']}


In [84]:
# GridSearchCV
grid_search = GridSearchCV(model, parameters, cv=10) # For 10 Cross-Validation

grid_search.fit(X_train_sc, y_train) # Model Fitting
print("Best Score is ", grid_search.best_score_)
print("Best Estinator is ", grid_search.best_estimator_)
print("Best Parametes are", grid_search.best_params_)

Best Score is  0.9517874396135266
Best Estinator is  DecisionTreeClassifier(max_features='log2', min_samples_leaf=10,
                       min_samples_split=4)
Best Parametes are {'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 4}


810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\RAMEESHA\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\RAMEESHA\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\RAMEESHA\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\RAMEESHA\AppData\Local\Programs\Pytho

In [86]:
model = DecisionTreeClassifier(max_features = 'sqrt', min_samples_leaf=3, min_samples_split=5)
model.fit(X_train_sc, y_train)
y_pred_dt = model.predict(X_test_sc)
accuracy_dt = accuracy_score(y_test,y_pred_dt)
print("Accuracy on Test Data:",accuracy_dt)

Accuracy on Test Data: 0.9035087719298246


In [87]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88        45
           1       0.93      0.91      0.92        69

    accuracy                           0.90       114
   macro avg       0.90      0.90      0.90       114
weighted avg       0.90      0.90      0.90       114



In [88]:
confusion_matrix(y_test,y_pred_dt)
lr_cm = confusion_matrix(y_test, y_pred_dt)
lr_cm = pd.DataFrame(lr_cm, columns=['Benign', 'Malignant'], index=['Benign','Malignant'])
lr_cm

Unnamed: 0,Benign,Malignant
Benign,40,5
Malignant,6,63


## Random ForestClassifier

In [None]:

# Initialize RandomForestClassifier
model = RandomForestClassifier(random_state=42)

parameters = {
    'n_estimators': [100, 150, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'sqrt']
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=10, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train_sc, y_train)

# Get the best model and parameters
print("Best Score:", grid_search.best_score_)
print("Best Estimator:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)

# Evaluate on the test set
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_train_sc)

# Accuracy and classification report
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy on Test Data:", accuracy_rf)
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

# Confusion Matrix
rf_cm = confusion_matrix(y_test, y_pred_rf)
rf_cm_df = pd.DataFrame(rf_cm, columns=['Benign', 'Malignant'], index=['Benign', 'Malignant'])
print("Confusion Matrix:\n", rf_cm_df)


In [99]:
# Initialize SVM
from sklearn.svm import SVC

model = SVC(random_state=42)

parameters = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=10, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train_sc, y_train)

# Get the best model and parameters
print("Best Score:", grid_search.best_score_)
print("Best Estimator:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)

# Evaluate on the test set
best_svm_model = grid_search.best_estimator_
y_pred_svm = best_svm_model.predict(X_train_sc)

# Accuracy and classification report
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy on Test Data:", accuracy_svm)
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

# Confusion Matrix
svm_cm = confusion_matrix(y_test, y_pred_svm)
svm_cm_df = pd.DataFrame(svm_cm, columns=['Benign', 'Malignant'], index=['Benign', 'Malignant'])
print("Confusion Matrix:\n", svm_cm_df)


Best Score: 0.9801932367149758
Best Estimator: SVC(C=0.1, kernel='linear', random_state=42)
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


ValueError: Found input variables with inconsistent numbers of samples: [114, 455]

In [101]:
# Initialize k-NN
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

parameters = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=10, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train_sc, y_train)

# Get the best model and parameters
print("Best Score:", grid_search.best_score_)
print("Best Estimator:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)

# Evaluate on the test set
best_knn_model = grid_search.best_estimator_
y_pred_knn = best_knn_model.predict(X_train_sc)

# Accuracy and classification report
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy on Test Data:", accuracy_knn)
print("Classification Report:\n", classification_report(y_test, y_pred_knn))

# Confusion Matrix
knn_cm = confusion_matrix(y_test, y_pred_knn)
knn_cm_df = pd.DataFrame(knn_cm, columns=['Benign', 'Malignant'], index=['Benign', 'Malignant'])
print("Confusion Matrix:\n", knn_cm_df)


  _data = np.array(data, dtype=dtype, copy=copy,


Best Score: 0.9692753623188406
Best Estimator: KNeighborsClassifier(metric='manhattan', n_neighbors=3)
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'uniform'}


ValueError: Found input variables with inconsistent numbers of samples: [114, 455]