## Load the Libraries

In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
# Compatibility shim: bind the name `np.bool` to NumPy's boolean scalar type `np.bool_`.
# NOTE(review): presumably required because some dependency still references `np.bool`
# on a NumPy release that does not provide that alias — confirm it is still needed.
np.bool = np.bool_

---

## Split the Data into Training and Testing Sets

### Read the `breast_cancer_dataset.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Location of the dataset, relative to the notebook's working directory
file_path = "Resources/breast_cancer_dataset.csv"

# Load the CSV contents into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Transform "diagnosis" column with encoding function

In [3]:
# Encoding the diagnosis column using a custom function
def encode_diagnosis(diagnosis):
    """Encode a single diagnosis label as a binary integer.

    Parameters
    ----------
    diagnosis : str or int
        Either a raw label ("M" is encoded as 1; every other value, such as
        "B", is encoded as 0) or a value this function has already produced
        (1 or 0).

    Returns
    -------
    int
        1 when ``diagnosis`` is "M" or an already-encoded 1, otherwise 0.
    """
    # Accepting an already-encoded 1 keeps the function idempotent: re-running
    # the encoding cell on a column that was encoded earlier no longer turns
    # every label into 0.
    if diagnosis == "M" or diagnosis == 1:
        return 1
    return 0

# Replace each diagnosis label with its encoded value, one element at a time
df["diagnosis"] = df["diagnosis"].map(encode_diagnosis)

# Preview the first five rows to confirm the encoded column
df.head(n=5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Step 2: Separate the Features (X) from the Target (y)

In [4]:
# Target (y): the encoded diagnosis column
y = df["diagnosis"]

# Features (X): every remaining column once the target and the `id` column are removed
X = df.drop(["diagnosis", "id"], axis="columns")

In [5]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Split the data into training and testing datasets by using `train_test_split`.

In [6]:
# Partition the data into training and testing subsets.
# `stratify=y` asks for the split to follow the class labels in y, and the
# fixed `random_state` makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Dimensions of the training features as (rows, columns)
X_train.shape

(426, 30)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [7]:
# Configure the logistic regression classifier (it is not fitted yet).
# `max_iter` caps the number of solver iterations; 50, 100, 200 and 500 are
# the candidate values to experiment with.
# NOTE(review): with max_iter=200 the fit in the next cell reports that the
# iteration limit was reached; the message suggests raising max_iter or
# scaling the data.
logistic_regression_model = LogisticRegression(
    random_state=42,
    max_iter=200,
    solver='lbfgs',
)
logistic_regression_model

In [8]:
# Fit the model using training data
# The value returned by fit() is kept as `lr_model` and is what later cells use for prediction.
# NOTE(review): scikit-learn documents fit() as returning the estimator itself, so `lr_model`
# should be the same object as `logistic_regression_model` — confirm against the installed version.
lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [9]:
# Make a prediction using the testing data
# `predictions` holds the fitted model's predicted label for each row of X_test;
# later cells compare it against y_test.
predictions = lr_model.predict(X_test)

### Compare the predictions with the actual labels

In [10]:
# Put each prediction next to the true label for the same test row
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})

# Discard the original row labels carried over from y_test in favour of 0..n-1
results = results.reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,0,0
4,0,0
...,...,...
138,1,1
139,1,1
140,0,0
141,0,0


### Calculate the Accuracy Score

In [11]:
# Accuracy score of the predictions on the held-out test labels
accuracy_score(y_true=y_test, y_pred=predictions)

0.951048951048951

### Evaluate the model’s performance by generating a confusion matrix and printing the classification report.

In [12]:
# Confusion matrix of the test-set predictions.
# Despite its name, `training_matrix` is computed from the test split
# (y_test versus predictions), not from the training data.
training_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
print(training_matrix)

[[89  1]
 [ 6 47]]


In [13]:
# Build the per-class precision / recall / F1 summary for the test-set predictions, then show it
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.99      0.96        90
           1       0.98      0.89      0.93        53

    accuracy                           0.95       143
   macro avg       0.96      0.94      0.95       143
weighted avg       0.95      0.95      0.95       143



### How well does the logistic regression model predict both the `B` (benign) and `M` (malignant) labels?

**Overall, the model seems to perform very well, with high precision, recall, and F1-scores for both classes, and a solid accuracy:**

**Precision:** This measures the accuracy of the positive predictions. For class B, it's 0.94, meaning that when the model predicts class B, it is correct 94% of the time. For class M, it's 0.98.

**Recall:** This measures the model's ability to identify all relevant instances. For class B, it's 0.99, meaning that the model correctly identifies 99% of all actual instances of class B. For class M, it's 0.89: the confusion matrix shows that 6 of the 53 malignant cases in the test set were predicted as benign.

**F1-score:** This is the harmonic mean of precision and recall, providing a single metric that balances both concerns. For classes B and M, the F1-scores are 0.96 and 0.93, respectively.

**Support:** This indicates the number of actual occurrences of each class in the test set. Class B has 90 instances, and class M has 53 instances.

**Accuracy:** This is the overall accuracy of the model across all classes. Here, it's 0.95, meaning that the model is correct 95% of the time overall.

**Macro average:** This calculates the average performance across classes, treating all classes equally. Here, precision is 0.96, recall is 0.94, and the F1-score is 0.95.

**Weighted average:** This takes into account the support for each class when calculating the averages. The weighted averages of precision, recall, and F1-score are all 0.95, reflecting a balanced performance relative to the class distributions.