In [1]:
# Breast Cancer Prediction using Logistic Regression
# --------------------------------------------------
# This project predicts whether a tumor is malignant (M) or benign (B)
# based on the Breast Cancer dataset.
# Each step includes explanations for beginners to understand easily.

# Import required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [19]:
# Step 1: Load the dataset
data = pd.read_csv("Breast_Cancer_Data.csv")

# Step 2: Display first few rows of the dataset
data


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


In [3]:
# Step 3: Check how many samples belong to each diagnosis type (B or M)
print(data.diagnosis.value_counts())  # or data["diagnosis"].value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64


In [4]:
# Step 4: Check the shape of the dataset (rows, columns)
print(data.shape)

(569, 33)


In [5]:
# Step 5: Check if there are any missing values in the dataset
print(data.isnull().sum())

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

In [6]:
# Step 6: Check for duplicate rows
print(data.duplicated().sum())

0


In [7]:
# Step 7: Get general information about the dataset
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [8]:
# Step 8: Remove unnecessary column 'Unnamed: 32' (if present)
data.drop("Unnamed: 32", axis=1, inplace=True)

# Step 9: Describe the dataset (mean, std, min, max etc.)
print(data.describe())

                 id  radius_mean  texture_mean  perimeter_mean    area_mean  \
count  5.690000e+02   569.000000    569.000000      569.000000   569.000000   
mean   3.037183e+07    14.127292     19.289649       91.969033   654.889104   
std    1.250206e+08     3.524049      4.301036       24.298981   351.914129   
min    8.670000e+03     6.981000      9.710000       43.790000   143.500000   
25%    8.692180e+05    11.700000     16.170000       75.170000   420.300000   
50%    9.060240e+05    13.370000     18.840000       86.240000   551.100000   
75%    8.813129e+06    15.780000     21.800000      104.100000   782.700000   
max    9.113205e+08    28.110000     39.280000      188.500000  2501.000000   

       smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813     

In [9]:
# Step 10: Check again how many malignant (M) and benign (B) cases exist
print(data.diagnosis.value_counts())

diagnosis
B    357
M    212
Name: count, dtype: int64


In [10]:
# Step 11: Encode the 'diagnosis' column (Convert text labels into numbers)
# M = 1 (Malignant / Cancerous), B = 0 (Benign / Not Cancerous)
le = LabelEncoder()
label_data = data.copy()
label_data['diagnosis'] = le.fit_transform(label_data['diagnosis'])

In [11]:
# Step 12: Verify encoding result (1 = M, 0 = B)
print(label_data.diagnosis.value_counts())

diagnosis
0    357
1    212
Name: count, dtype: int64


In [12]:
# Step 13: Split the dataset into features (X) and target (y)
X = label_data.drop('diagnosis', axis=1)
y = label_data['diagnosis']

# Step 14: Split the dataset into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

(455, 31)
(114, 31)


In [13]:
# Step 15: Standardize the data (scaling all features to same range)
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Step 16: Create a Logistic Regression model
lg = LogisticRegression()

# Step 17: Train (fit) the model using the training data
lg.fit(X_train, y_train)

In [14]:
# Step 18: Predict outcomes using the test data
y_predict = lg.predict(X_test)

# Step 19: Check prediction results
print(y_predict)

[0 1 1 0 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 1 1 0 1
 0 0 0 0 0 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1 0 0 1
 0 1 1]


In [16]:
# Step 20: Evaluate model performance using accuracy score
print("Accuracy Score:", accuracy_score(y_test, y_predict))


Accuracy Score: 0.9736842105263158


In [17]:
# Step 21: Test the model on a single example input (scaled values)
input_text = (-0.23702031,  1.97409619,  1.73302577,  2.09167167,  1.85197292,
        1.319843  ,  3.42627493,  2.01311199,  2.66503199,  2.1270036 ,
        1.55839569,  0.80531919, -0.81268678,  0.75195659,  0.87716951,
       -0.89605315,  1.18122247,  0.18362761,  0.60059598, -0.31771686,
        0.52963649,  2.17331385,  1.3112795 ,  2.08161691,  2.1374055 ,
        0.76192793,  3.26560084,  1.92862053,  2.6989469 ,  1.89116053,
        2.49783848)

# Convert the input into numpy array format
np_df = np.asarray(input_text)

# Reshape the array because we are predicting only one sample
prediction = lg.predict(np_df.reshape(1, -1))

# Step 22: Display prediction result
if prediction[0] == 1:
    print("Cancerous")
else:
    print("Not Cancerous")

Cancerous
