# **Logistic Regression Model**

## Load the Libraries

In [2]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
np.bool = np.bool_

In [3]:
import os
# Spark release to install. Find the latest version of Spark 3.x at
# https://downloads.apache.org/spark/ and enter it here.
# NOTE(review): presumably the download below fails once this release is no longer hosted there — confirm.
spark_version = 'spark-3.5.2'
# Exported so the shell commands below can expand $SPARK_VERSION
os.environ['SPARK_VERSION'] = spark_version
# Install Java 11, then download and unpack Spark, and install findspark
# (assumes a Debian/Ubuntu host with apt-get, e.g. Colab — TODO confirm)
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark
# Set the environment variables that locate the Java and Spark installs.
# NOTE(review): SPARK_HOME assumes the tarball was unpacked under /content — confirm the working directory.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"
# Initialise findspark using the variables above; the SparkSession itself is created in a later cell
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Ign:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [998 kB]
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,293 kB]
Get:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Hit:12 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:13 http://archive.ubuntu.com/ubuntu jammy-backports InRelease


In [4]:
# Create (or reuse) the Spark session used to load the dataset
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("breastCancerAnalysis")
    .getOrCreate()
)

In [5]:
# Load the dataset through Spark (the source may be an S3 URL or a local file)
from pyspark import SparkFiles

url = "Resources/breast_cancer_dataset.csv"

# Register the file with the Spark context, then look up where Spark placed it
spark.sparkContext.addFile(url)
local_csv_path = SparkFiles.get("breast_cancer_dataset.csv")

# First row is the header; column types are inferred from the data
csv_reader = spark.read.option('header', 'true')
df = csv_reader.csv(local_csv_path, inferSchema=True, sep=',')

# Print the first rows of the Spark DataFrame
df.show()

+--------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+
|      id|diagnosis|radius_mean|texture_mean|perimeter_mean|area_mean|smoothness_mean|compactness_mean|concavity_mean|concave points_mean|symmetry_mean|fractal_dimension_mean|radius_se|texture_se|perimeter_se|area_se|smoothness_se|compactness_se|concavity_se|concave points_se|symmetry_se|fractal_dimension_se|radius_worst|texture_worst|perimeter_worst|area_worst|smoothness_worst|compactness_worst|concavity_worst|concave points_worst|symmetry_worst|fractal_dimension_worst|
+--------+---------+-----------+------------+---

In [6]:
# Convert Spark DataFrame to Pandas DataFrame for compatibility with scikit-learn.
# NOTE(review): `df` is rebound from a Spark DataFrame to a pandas one here, so this
# cell cannot be re-run on its own (a pandas DataFrame has no `toPandas`) — confirm that is acceptable.
df = df.toPandas()

---

## Split the Data into Training and Testing Sets

In [7]:
# Read the CSV file from the Resources folder into a Pandas DataFrame.
# NOTE(review): this rebinds `df`, replacing the frame obtained from Spark earlier
# with a fresh pandas read of the same file — confirm which load path is intended.
file_path = 'Resources/breast_cancer_dataset.csv'
df = pd.read_csv(file_path)

# Review the first rows of the DataFrame
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Transform "diagnosis" column with encoding function

In [8]:
# Encoding the diagnosis column using a custom function
def encode_diagnosis(diagnosis):
    """Encode a diagnosis label as a binary class.

    Parameters
    ----------
    diagnosis : str or int
        The raw label "M" or "B", or a value that has already been
        encoded as 0 or 1.

    Returns
    -------
    int
        1 for "M", 0 for "B". An existing 0/1 is returned unchanged so that
        applying the encoder twice gives the same result as applying it once.

    Raises
    ------
    ValueError
        If the value is none of "M", "B", 0 or 1 (for example a missing or
        misspelled label), rather than silently encoding it as 0.
    """
    if diagnosis == "M":
        return 1
    if diagnosis == "B":
        return 0
    # Already encoded (plain or numpy integer): pass through so that
    # re-running the encoding cell does not turn every label into 0.
    if diagnosis in (0, 1):
        return int(diagnosis)
    raise ValueError(f"Unexpected diagnosis label: {diagnosis!r}")

# Replace each diagnosis label with its numeric code, element by element
df["diagnosis"] = df["diagnosis"].map(encode_diagnosis)

# Preview the encoded DataFrame
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Separate the Features (X) from the Target (y)

In [9]:
# Features (X): every column except the target and the `id` column
X = df.drop(columns=['diagnosis', 'id'])

# Labels (y): the encoded diagnosis column
y = df['diagnosis']

In [10]:
# Preview the first rows of the feature DataFrame X (the `diagnosis` and `id` columns have been dropped)
X.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Split the data into training and testing datasets by using `train_test_split`.

In [11]:
# Hold out a test set, stratified on y so both parts keep the same class balance;
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)
# Show the dimensions of the training feature matrix
X_train.shape

(426, 30)

## Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [12]:
# Instantiate the Logistic Regression model

# max_iter caps the solver's iterations; values to experiment with: 50, 100, 200, 500.
# NOTE(review): the recorded output of the fit cell shows the solver stopping at this
# limit without converging; scikit-learn's message suggests increasing max_iter or
# scaling the data.
logistic_regression_model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=42)
# Display the configured estimator
logistic_regression_model

In [13]:
# Fit the model using training data; the object returned by `fit` is kept as `lr_model`
# and used for the predictions in the following cells
lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [14]:
# Predict a class label for every row of the held-out test features
predictions = lr_model.predict(X_test)

## Make predictions

In [15]:
# Side-by-side table of predicted and actual labels for the test set
results = pd.DataFrame(
    {
        "Prediction": predictions,
        "Actual": y_test,
    }
)
# Discard the original row labels carried over from y_test
results = results.reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,0,0
4,0,0


## Calculate the Accuracy Score

In [16]:
# Display the accuracy score for the test dataset.
# The result is bound to its own name: assigning it to `accuracy_score` would replace
# the imported scikit-learn function with a float, so re-running this cell (or calling
# the function later) would raise "TypeError: ... object is not callable".
test_accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of Logistic Regression Model is: {test_accuracy:.4f}")

Accuracy of Logistic Regression Model is: 0.9510


## Evaluate the model’s performance by generating a confusion matrix and printing the classification report.

In [17]:
# Generate a confusion matrix for the model's predictions on the test split.
# NOTE(review): despite its name, `training_matrix` is computed from y_test and the
# test-set predictions, not from the training data.
training_matrix = confusion_matrix(y_test, predictions)
print(training_matrix)

[[89  1]
 [ 6 47]]


In [18]:
# Build the text classification report (per-class precision, recall, F1-score and support)
# for the test-set predictions, then print it
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.99      0.96        90
           1       0.98      0.89      0.93        53

    accuracy                           0.95       143
   macro avg       0.96      0.94      0.95       143
weighted avg       0.95      0.95      0.95       143



# **Classification Report Analysis**


The classification report provides a detailed evaluation of the logistic regression model's performance on each class. Here’s a breakdown of the performance metrics based on the provided classification report:

### Class 0:
- **Precision**: 0.94
  - Indicates that 94% of the samples predicted as class 0 are actually class 0. Precision measures the proportion of true positives among all predicted positives for class 0.

- **Recall**: 0.99
  - Shows that the model correctly identifies 99% of the actual class 0 samples. Recall measures the proportion of true positives among all actual positives for class 0.

- **F1-Score**: 0.96
  - The harmonic mean of precision and recall for class 0. An F1-score of 0.96 indicates very good performance, balancing both precision and recall.

### Class 1:
- **Precision**: 0.98
  - Indicates that 98% of the samples predicted as class 1 are actually class 1. The model is highly accurate in identifying class 1 when it makes a prediction.

- **Recall**: 0.89
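  - Shows that the model correctly identifies 89% of the actual class 1 samples. While still strong, recall for class 1 is lower compared to class 0: according to the confusion matrix, 6 of the 53 actual class 1 samples in the test set were predicted as class 0. Because class 1 is the encoded "M" diagnosis, these missed cases (false negatives) are the errors that matter most for this problem.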

- **F1-Score**: 0.93
  - Reflects a balance between precision and recall for class 1. An F1-score of 0.93 is slightly lower than for class 0 but still indicates good overall performance.

### Overall Metrics:
- **Accuracy**: 0.95
  - The model correctly classifies 95% of all samples, indicating high overall performance.

- **Macro Average**:
  - **Precision**: 0.96
  - **Recall**: 0.94
  - **F1-Score**: 0.95
  - The macro average is the average of the precision, recall, and F1-scores for each class, treating all classes equally. It reflects balanced performance across both classes.

- **Weighted Average**:
  - **Precision**: 0.95
  - **Recall**: 0.95
  - **F1-Score**: 0.95
  - The weighted average accounts for the support (number of true instances) of each class, providing an overall performance metric weighted by class distribution.

### Summary:
- **Class 0**: The model has high recall (0.99), meaning it identifies most class 0 samples correctly. Precision is also high (0.94), indicating few false positives.
  
- **Class 1**: The model shows high precision (0.98) and good recall (0.89), correctly identifying most class 1 samples with a small number of false positives.

Overall, the logistic regression model demonstrates strong performance across both classes. It is particularly effective at predicting class 0 and performs very well with class 1, though recall for class 1 is slightly lower.


# **Improving Accuracy**

## Method 1: Using Top 5 Features:

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Five features selected as the most important
# NOTE(review): how importance was measured is not shown in this notebook — confirm the source
top_features = [
    'area_worst',
    'concave points_worst',
    'concave points_mean',
    'radius_worst',
    'concavity_mean',
]

# Keep only those columns as the feature matrix; the target is unchanged
X_top5 = df.loc[:, top_features]
y = df['diagnosis']

# Same stratified, seeded split as used for the full-feature model
X_train, X_test, y_train, y_test = train_test_split(
    X_top5,
    y,
    stratify=y,
    random_state=42,
)

# Fit a logistic regression with the same settings as before
logistic_regression_model = LogisticRegression(
    solver='lbfgs',
    max_iter=200,
    random_state=42,
)
lr_model = logistic_regression_model.fit(X_train, y_train)

# Score the reduced-feature model on the test split
predictions = lr_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Accuracy with top 5 important features: {accuracy:.4f}")

Accuracy with top 5 important features: 0.9231


## Method 2: Include Interaction Terms:

In [20]:
from sklearn.preprocessing import PolynomialFeatures

# Columns that the interaction terms are built from
features = [
    'area_worst',
    'concave points_worst',
    'concave points_mean',
    'radius_worst',
    'concavity_mean',
]

# Expand the selected columns with interaction terms (no bias column)
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
X_interactions = df[features]
X_interactions_poly = poly.fit_transform(X_interactions)

# Wrap the transformed array in a DataFrame labelled with the generated feature names
interaction_names = poly.get_feature_names_out(features)
X_interactions_df = pd.DataFrame(X_interactions_poly, columns=interaction_names)

# Stratified, seeded train/test split on the expanded features
X_train, X_test, y_train, y_test = train_test_split(
    X_interactions_df,
    y,
    stratify=y,
    random_state=42,
)

# Fit a logistic regression with the same settings as the earlier models
logistic_regression_model = LogisticRegression(
    solver='lbfgs',
    max_iter=200,
    random_state=42,
)
lr_model = logistic_regression_model.fit(X_train, y_train)

# Score the model on the test split
predictions = lr_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Accuracy with interaction terms: {accuracy:.4f}")

Accuracy with interaction terms: 0.9720


# **Interaction Terms Explained**

#### 1. Pick the original features:
- You start with a few columns from your data, like `area_worst`, `concave points_worst`, and others.
- These features represent important information for your model.

#### 2. Set up the `PolynomialFeatures` tool:
- The `PolynomialFeatures` tool helps you create new features from the existing ones.
- By using `interaction_only=True`, you're telling the tool, "I only want new features that represent interactions between pairs of features, like multiplying one by another." It doesn't create features like squares or cubes of a single feature, just interactions between two features.
- `include_bias=False` means you're not adding a constant "1" as a feature, which logistic regression will do on its own later.

#### 3. Create the interaction terms:
- The `poly.fit_transform(X_interactions)` part takes your original features and creates new ones by multiplying pairs of them together.
    - For example, it might multiply `area_worst` by `concave points_worst` to create a new feature.
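- It does this for all possible pairs of features from your list, so you'll get several new features that represent these interactions. With the five features used here, that is 10 pairwise products, and the 5 original columns are kept alongside them, giving 15 columns in total.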

Think of it as creating extra "teamwork" features. If two original features work together to influence the outcome, interaction terms will capture that combined effect, which helps the model understand complex relationships better.
