In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [45]:
# Load the breast tumor dataset into a Pandas DataFrame
data = pd.read_csv(r'E:\MLBS\Project Breast Cancer\breast cancer.csv')

In [46]:
# explore the dataset using functions like 'info()', 'describe()', and 'head()'
print(data.info())  # Display basic information about the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [47]:
print(data.describe())  # Display statistics of the dataset

                 id  radius_mean  texture_mean  perimeter_mean    area_mean  \
count  5.690000e+02   569.000000    569.000000      569.000000   569.000000   
mean   3.037183e+07    14.127292     19.289649       91.969033   654.889104   
std    1.250206e+08     3.524049      4.301036       24.298981   351.914129   
min    8.670000e+03     6.981000      9.710000       43.790000   143.500000   
25%    8.692180e+05    11.700000     16.170000       75.170000   420.300000   
50%    9.060240e+05    13.370000     18.840000       86.240000   551.100000   
75%    8.813129e+06    15.780000     21.800000      104.100000   782.700000   
max    9.113205e+08    28.110000     39.280000      188.500000  2501.000000   

       smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813     

In [48]:
print(data.head())  # Display the first few rows of the dataset

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [49]:
dataset = data.drop(columns=['id', 'Unnamed: 32'])

In [50]:
# Check for missing values
missing_values = dataset.isnull().sum()
print(missing_values)

diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [51]:
dataset['diagnosis'] = dataset['diagnosis'].map({'M': 1, 'B': 0})

In [52]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

columns_to_scale = dataset.columns[:-1]  # Exclude the last column (diagnosis)

# Apply the StandardScaler to the selected columns
dataset[columns_to_scale] = scaler.fit_transform(dataset[columns_to_scale])


In [53]:
# Split the dataset into features (X) and labels (y)
X = dataset.drop(columns=['diagnosis'])
y = dataset['diagnosis']

In [54]:
# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        for _ in range(self.num_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            # Update parameters using gradient descent
            dw = (1 / num_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / num_samples) * np.sum(y_predicted - y)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        return y_predicted

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def evaluate(self, X_test, y_test):
        y_predicted = self.predict(X_test)
        accuracy = np.mean(y_predicted == y_test)
        return accuracy

In [56]:
def cost_function(self, y, y_predicted):
    num_samples = len(y)
    cost = - (1 / num_samples) * (np.dot(y, np.log(y_predicted)) + np.dot((1 - y), np.log(1 - y_predicted)))
    return cost


In [57]:
# Initialize and train the logistic regression model
model = LogisticRegression(learning_rate=0.01, num_iterations=1000)
model.fit(X_train, y_train)


In [58]:
# Make predictions on the test data
y_pred = np.array(model.predict(X_test))
y_pred_binary = (y_pred > 0.5).astype(int)


In [59]:
from sklearn.metrics import accuracy_score

# Convert y_pred_binary to integers if needed
y_pred_binary = y_pred_binary.astype(int)

# Convert y_test to integers
y_test = y_test.astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 97.37%


In [60]:
print(f"y_test shape: {y_test.shape}")
print(f"y_pred_binary shape: {y_pred_binary.shape}")
print(f"y_test data type: {y_test.dtype}")
print(f"y_pred_binary data type: {y_pred_binary.dtype}")


y_test shape: (114,)
y_pred_binary shape: (114,)
y_test data type: int32
y_pred_binary data type: int32


In [61]:
from sklearn.impute import SimpleImputer

# Initialize an imputer
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on training data
imputer.fit(X_train)

# Transform both the training and test data with the imputer
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)


In [62]:
from sklearn.svm import SVC

# Initialize the SVM classifier with a linear kernel
svm_classifier = SVC(kernel='linear')

y_train = y_train.astype(int)

# Train the SVM model on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred_svm = svm_classifier.predict(X_test)

from sklearn.metrics import accuracy_score

# Calculate the accuracy of the SVM model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm * 100:.2f}%")


SVM Accuracy: 96.49%


In [74]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

y_pred_nn = model.predict(X_test)

accuracy_nn = accuracy_score(y_test, (y_pred_nn > 0.5).astype(int))
print(f"Neural Network Accuracy: {accuracy_nn * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 98.25%


In [75]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate accuracy
accuracy_logistic = accuracy_score(y_test, y_pred_binary)

# Calculate precision
precision_logistic = precision_score(y_test, y_pred_binary,zero_division=1.0)

# Calculate recall
recall_logistic = recall_score(y_test, y_pred_binary)

# Calculate F1-score
f1_logistic = f1_score(y_test, y_pred_binary)

# Print the evaluation metrics
print(f"Logistic Regression Accuracy: {accuracy_logistic * 100:.2f}%")
print(f"Logistic Regression Precision: {precision_logistic * 100:.2f}%")
print(f"Logistic Regression Recall: {recall_logistic * 100:.2f}%")
print(f"Logistic Regression F1-score: {f1_logistic * 100:.2f}%")


Logistic Regression Accuracy: 97.37%
Logistic Regression Precision: 97.62%
Logistic Regression Recall: 95.35%
Logistic Regression F1-score: 96.47%


In [76]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Calculate precision
precision_svm = precision_score(y_test, y_pred_svm)

# Calculate recall
recall_svm = recall_score(y_test, y_pred_svm)

# Calculate F1-score
f1_svm = f1_score(y_test, y_pred_svm)

# Print the evaluation metrics
print(f"SVM Accuracy: {accuracy_svm * 100:.2f}%")
print(f"SVM Precision: {precision_svm * 100:.2f}%")
print(f"SVM Recall: {recall_svm * 100:.2f}%")
print(f"SVM F1-score: {f1_svm * 100:.2f}%")


SVM Accuracy: 96.49%
SVM Precision: 93.33%
SVM Recall: 97.67%
SVM F1-score: 95.45%


In [77]:
# Assuming y_pred_nn is your continuous predictions
threshold = 0.5  # Set a threshold for binary classification

# Binarize the predictions
y_pred_nn_binary = (y_pred_nn > threshold).astype(int)

# Now, you can calculate classification metrics with the binarized predictions
accuracy_neural = accuracy_score(y_test, y_pred_nn_binary)
precision_neural = precision_score(y_test, y_pred_nn_binary)
recall_neural = recall_score(y_test, y_pred_nn_binary)
f1_neural = f1_score(y_test, y_pred_nn_binary)

# Print the evaluation metrics
print(f"Neural Network Accuracy: {accuracy_neural * 100:.2f}%")
print(f"Neural Network Precision: {precision_neural * 100:.2f}%")
print(f"Neural Network Recall: {recall_neural * 100:.2f}%")
print(f"Neural Network F1-score: {f1_neural * 100:.2f}%")


Neural Network Accuracy: 98.25%
Neural Network Precision: 97.67%
Neural Network Recall: 97.67%
Neural Network F1-score: 97.67%


In [78]:
# Results of Logistic Regression
print("Logistic Regression:")
print(f"Accuracy: {accuracy_logistic * 100:.2f}%")
print(f"Precision: {precision_logistic * 100:.2f}%")
print(f"Recall: {recall_logistic * 100:.2f}%")
print(f"F1-score: {f1_logistic * 100:.2f}%")
print()

# Results of SVM
print("SVM:")
print(f"Accuracy: {accuracy_svm * 100:.2f}%")
print(f"Precision: {precision_svm * 100:.2f}%")
print(f"Recall: {recall_svm * 100:.2f}%")
print(f"F1-score: {f1_svm * 100:.2f}%")
print()

# Results of Neural Network
print("Neural Network:")
print(f"Accuracy: {accuracy_neural * 100:.2f}%")
print(f"Precision: {precision_neural * 100:.2f}%")
print(f"Recall: {recall_neural * 100:.2f}%")
print(f"F1-score: {f1_neural * 100:.2f}%")
print()

# Conclusion
if accuracy_logistic > accuracy_svm and accuracy_logistic > accuracy_neural:
    print("Logistic Regression performed the best.")
elif accuracy_svm > accuracy_logistic and accuracy_svm > accuracy_neural:
    print("SVM performed the best.")
else:
    print("Neural Network performed the best.")


Logistic Regression:
Accuracy: 97.37%
Precision: 97.62%
Recall: 95.35%
F1-score: 96.47%

SVM:
Accuracy: 96.49%
Precision: 93.33%
Recall: 97.67%
F1-score: 95.45%

Neural Network:
Accuracy: 98.25%
Precision: 97.67%
Recall: 97.67%
F1-score: 97.67%

Neural Network performed the best.
