<a href="https://colab.research.google.com/github/rajeevinr/ML-projects/blob/main/Breast_Cancer_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Dependencies

In [42]:
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Processing

In [43]:
# loading the data from sklearn
breast_cancer_dataset = sklearn.datasets.load_breast_cancer()

In [44]:
print(breast_cancer_dataset)

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 

In [45]:
# loading the data to a data frame
data_frame = pd.DataFrame(breast_cancer_dataset,columns = breast_cancer_dataset.feature_names)

In [46]:
# print the first 5 rows of dataframe
data_frame.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension


In [47]:
# adding the target column to the data frame
data_frame["label"] = breast_cancer_dataset.target

In [48]:
# printing the last 5 rows of dataframe
data_frame.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
564,,,,,,,,,,,...,,,,,,,,,,0
565,,,,,,,,,,,...,,,,,,,,,,0
566,,,,,,,,,,,...,,,,,,,,,,0
567,,,,,,,,,,,...,,,,,,,,,,0
568,,,,,,,,,,,...,,,,,,,,,,1


In [49]:
# number of rows and columns in the dataset
data_frame.shape

(569, 31)

In [50]:
# getting some information about the dataset
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   mean radius              0 non-null      object
 1   mean texture             0 non-null      object
 2   mean perimeter           0 non-null      object
 3   mean area                0 non-null      object
 4   mean smoothness          0 non-null      object
 5   mean compactness         0 non-null      object
 6   mean concavity           0 non-null      object
 7   mean concave points      0 non-null      object
 8   mean symmetry            0 non-null      object
 9   mean fractal dimension   0 non-null      object
 10  radius error             0 non-null      object
 11  texture error            0 non-null      object
 12  perimeter error          0 non-null      object
 13  area error               0 non-null      object
 14  smoothness error         0 non-null      o

In [51]:
# checking for missing values
data_frame.isnull().sum()

mean radius                569
mean texture               569
mean perimeter             569
mean area                  569
mean smoothness            569
mean compactness           569
mean concavity             569
mean concave points        569
mean symmetry              569
mean fractal dimension     569
radius error               569
texture error              569
perimeter error            569
area error                 569
smoothness error           569
compactness error          569
concavity error            569
concave points error       569
symmetry error             569
fractal dimension error    569
worst radius               569
worst texture              569
worst perimeter            569
worst area                 569
worst smoothness           569
worst compactness          569
worst concavity            569
worst concave points       569
worst symmetry             569
worst fractal dimension    569
label                        0
dtype: int64

In [52]:
# statistical measures about the data
data_frame.describe()

Unnamed: 0,label
count,569.0
mean,0.627417
std,0.483918
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [53]:
# checking the distribution of target variables
data_frame["label"].value_counts()

1    357
0    212
Name: label, dtype: int64

1 ---> Benign
0 ---> Malignant

In [54]:
data_frame.groupby("label").mean()

Unnamed: 0_level_0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,


Separating the features and Target

In [55]:
X = data_frame.drop(columns="label",axis=1)
Y = data_frame["label"]

In [56]:
print(X)

    mean radius mean texture mean perimeter mean area mean smoothness  \
0           NaN          NaN            NaN       NaN             NaN   
1           NaN          NaN            NaN       NaN             NaN   
2           NaN          NaN            NaN       NaN             NaN   
3           NaN          NaN            NaN       NaN             NaN   
4           NaN          NaN            NaN       NaN             NaN   
..          ...          ...            ...       ...             ...   
564         NaN          NaN            NaN       NaN             NaN   
565         NaN          NaN            NaN       NaN             NaN   
566         NaN          NaN            NaN       NaN             NaN   
567         NaN          NaN            NaN       NaN             NaN   
568         NaN          NaN            NaN       NaN             NaN   

    mean compactness mean concavity mean concave points mean symmetry  \
0                NaN            NaN               

In [57]:
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: label, Length: 569, dtype: int64


Splitting the data into training data and testing data

In [58]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=2)

In [59]:
print(X.shape,X_train.shape,X_test.shape)

(569, 30) (455, 30) (114, 30)


Model Training

Logistic Regression

In [60]:
model = LogisticRegression()

In [61]:
# training the logistic regression model using training data

model.fit(X_train, Y_train)

ValueError: ignored

Model Evaluation

Accuracy Score

In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train,X_train_prediction)

In [None]:
print("Accuracy on training data: ", training_data_accuracy)

In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test,X_test_prediction)

In [None]:
print("Accuracy on test data: ",test_data_accuracy)

Building a Predictive System

In [None]:
input_data = (17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189)
# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predicting for one datapoint
input_data_reshaped = input_data_as_numpy_array.reshaped(1,-1)

prediction = model.predict(input_data_reshaped)

print(prediction)

if prediction[0]==0:
  print("The Breast cancer is Malignant")
else:
  print("the Breast cancer is Benign")

  