### Import Libraries

In [2]:
#standard library
import os
import statistics

#data analysis tools
import pandas as pd
import numpy as np

#visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

#sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    auc,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.model_selection import cross_val_score


### Data Cleaning

In [4]:
# Location of the dataset CSV. The original absolute path only exists on one
# machine, so it can be overridden by setting the BREAST_CANCER_CSV environment
# variable; when the variable is unset the original location is used unchanged.
DATA_PATH = os.environ.get("BREAST_CANCER_CSV", "E:\\breast_cancer_wisconsin_dataset.csv")

df = pd.read_csv(DATA_PATH)
# (rows, columns) of the raw frame
df.shape

PermissionError: [Errno 13] Permission denied

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# List the raw column names before they are renamed.
df.columns

**Renaming columns to improve readability and ensure consistency.**  

In [None]:
# Mapping from raw CSV column names to readable, title-cased labels.
# The 30 measurement columns all follow the pattern "<feature>_<statistic>",
# so the mapping is generated from the feature names and the three statistic
# suffixes rather than being written out entry by entry.
_statistic_labels = {
    'mean': 'Mean',
    'se': 'Standard Error',
    'worst': 'Worst',
}
_raw_features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]

rename_dictionary = {'id': 'ID', 'diagnosis': 'Diagnosis'}
for _suffix, _label in _statistic_labels.items():
    for _feature in _raw_features:
        # e.g. 'fractal_dimension' -> 'Fractal Dimension'
        _pretty = _feature.replace('_', ' ').title()
        rename_dictionary[f'{_feature}_{_suffix}'] = f'{_label} {_pretty}'

In [None]:
# Relabel the columns using the mapping defined above.
df = df.rename(rename_dictionary, axis = 'columns')

In [None]:
# Confirm the new column labels.
print(df.columns)

In [None]:
# Remove the identifier column; it is not used anywhere in the analysis below.
df = df.drop(columns = 'ID')

In [None]:
# Remove the 'Unnamed: 32' column.
# NOTE(review): presumably an empty artifact column produced by a trailing
# delimiter in this particular CSV export -- confirm against the file.
# errors='ignore' keeps the cell from raising KeyError when the column is
# absent (a different copy of the CSV, or the cell being run a second time).
df = df.drop(columns = 'Unnamed: 32', errors = 'ignore')

In [None]:
# Confirm that the dropped columns are gone.
print(df.columns)

**Replacing the M/B values with 1/0. (1 = Malignant (M), 0 = Benign (B))**

In [None]:
# Encode the label as integers: 1 = Malignant (M), 0 = Benign (B).
# The explicit cast guarantees a numeric column regardless of how the installed
# pandas version infers the dtype after replace(); df.corr() and the model
# fitting further down require a numeric target. Any value other than M/B makes
# the cast fail loudly instead of slipping through as a string.
df['Diagnosis'] = df['Diagnosis'].replace({'M':1, 'B':0}).astype(int)

In [None]:
df.head()
# Confirmation that M/B (Malignant/Benign) has been replaced with 1/0, respectively.

**Checking the DataFrame for Null-Values**

In [None]:
# Per-column count of missing values.
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# Number of rows that are exact duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print('Number of Duplicated Rows: ')
print(duplicate_count)

In [None]:
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called on its own; wrapping it in print() only appended a stray "None".
df.info()

In [None]:
# Summary statistics of every numeric column on the original scale.
df.describe()

### Purpose of Standardization

I am going to standardize the numerical features of the dataset. Standardization transforms each feature to have a mean of 0 and a standard deviation of 1, allowing all features to be on the same scale. This is important because this dataset has 30 numerical features with very different ranges (e.g., Mean Area ranges from 143.5 to 2501, while Mean Smoothness ranges from 0.05263 to 0.1634). Without standardization, the features with larger values could skew the results, which could lead to inaccurate predictions.
This process may permit better modeling and visualizations.


In [None]:
# Every column except the target label is a numerical feature.
numerical_cols = df.drop(columns = 'Diagnosis').columns

In [None]:
# Work on an independent copy so the original-scale frame stays available.
df_standardized = df.copy(deep = True)

In [None]:
# Fit the scaler on the feature columns and store the standardized values
# in the copy; the Diagnosis column is left untouched.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df_standardized[numerical_cols] = scaled_values

In [None]:
# Preview the standardized feature values.
df_standardized.head()

In [None]:
# Summary statistics after scaling, for comparison with df.describe() above.
df_standardized.describe()

**Visualizing the Effect of Standardization** -- *Mean Radius*

In [None]:
# Box plot of Mean Radius on its original scale.
ax = sns.boxplot(y = df['Mean Radius'])
ax.set_title('Mean Radius (Before Standardization)')

In [None]:
# Same box plot, drawn from the standardized copy.
ax = sns.boxplot(y = df_standardized['Mean Radius'])
ax.set_title('Mean Radius (After Standardization)')

### Visualizations

**Description**
-- The goal of this bar plot is to show the number of benign (0) and malignant (1) tumor samples. 

**Importance**
-- There is a greater number of benign samples than malignant samples.

In [None]:
# Bar plot of the number of benign (0) and malignant (1) samples.
plt.figure(figsize=(10,8))
sns.countplot(x = 'Diagnosis', data = df, color = 'silver')
plt.title('Count of Benign (0) vs. Malignant (1) Diagnosis')
# Axis label previously lacked its closing parenthesis.
plt.xlabel('Diagnosis (0 = Benign, 1 = Malignant)')
plt.ylabel('Count')
plt.savefig('diagnosis_count.png', dpi=300)

**Description**
-- The goal of this histogram is to show the distribution of Mean Radius Values across all samples.

**Importance**
-- Mean Radius will likely be a key feature in measuring tumor size. There is a large range, which suggests that there may be variation between benign and malignant tumors. 

In [None]:
# Histogram of Mean Radius across all samples (20 bins), saved at 300 dpi.
fig, ax = plt.subplots()
ax.hist(df['Mean Radius'], bins = 20, edgecolor = 'black')
ax.grid(linestyle = '--', alpha = 0.7)
ax.set_yticks(range(0, 101, 10))
ax.set_title('Distribution of Mean Radius')
ax.set_xlabel('Mean Radius')
ax.set_ylabel('Frequency')
fig.savefig('mean_radius_hist.png', dpi = 300)

**Description**
-- The goal of this _interactive_ histogram is to show the distribution of Mean Area values across all samples using _Plotly_.

**Importance** 
-- Mean Area will likely be a key feature in measuring tumor size. There is a large range, which suggests that there may be variation between benign and malignant tumors. This Plotly interactive highlights outliers that may impact diagnosis predictions, which may require the use of standardization. 

In [None]:
pip install kaleido

In [None]:
# Interactive Plotly histogram of Mean Area, also exported as a static PNG.
fig1 = px.histogram(
    df,
    x = 'Mean Area',
    nbins = 20,
    title = 'Distribution of Mean Area',
)
fig1.update_layout(
    xaxis_title = 'Mean Area',
    yaxis_title = 'Frequency',
)

export_options = {'width': 800, 'height': 600, 'scale': 2}
fig1.write_image('mean_area_hist.png', **export_options)
fig1.show()

**Description**
-- The goal of this boxplot is to show the range, median, and outliers of the Mean Texture values across all samples using _Seaborn_.

**Importance** 
-- Mean Texture measures cell surface variability. The box plot reveals its spread and outliers, which provides insight into a feature that may contribute to diagnosis, setting up comparisons by class.

In [None]:
# Box plot of Mean Texture across all samples.
fig, ax = plt.subplots(figsize = (6,4))
sns.boxplot(y = 'Mean Texture', data = df, ax = ax)
ax.set_title('Box Plot of Mean Texture')
fig.savefig('mean_texture_box.png')
plt.show()

**Description**
-- The goal of this bar plot is to show the average values of Mean Radius, Mean Area, and Mean Concavity.

**Importance** 
-- Creating a comparison visual of key features allows for quick comparison of scale and possible importance. Here, there is a high average for Mean Area, further suggesting the significance of this feature.

In [None]:
# Bar chart comparing the average value of three features on their original scale.
key_features = ['Mean Radius', 'Mean Area', 'Mean Concavity']
means = df[key_features].mean()

fig, ax = plt.subplots(figsize = (8,4))
ax.bar(key_features, means, color = 'green')
ax.set_title('Mean Values of Key Features')
ax.set_ylabel('Mean Value')
ax.set_yticks(range(0, 701, 50))
ax.grid(linestyle = '--', alpha = 0.7)

fig.savefig('key_features_means.png')
plt.show()

**Description**
-- The goal of this boxplot is to compare the Mean Radius values for benign (0) and malignant (1) tumors. 

**Importance** 
-- Greater mean radius values for malignant tumors indicate that tumor size is a key differentiator, which supports the hypothesis that features can be used to predict a diagnosis.

In [None]:
# Box plot of Mean Radius split by diagnosis class.
plt.figure(figsize = (6,4))
sns.boxplot(x = 'Diagnosis', y = 'Mean Radius', data = df)
plt.title('Mean Radius by Diagnosis')
plt.xlabel('Diagnosis (0 = Benign, 1 = Malignant)')
# Explicit extension, consistent with every other saved figure, so the output
# format does not depend on matplotlib's default savefig format.
plt.savefig('mean_radius_by_diagnosis.png')
plt.show()

**Description**
-- The goal of this Violin Plot is to compare the Worst Concave Point values for benign (0) and malignant (1) tumors. 

**Importance** 
-- Researching the key features within the dataset, Worst Concave Points measure tumor shape irregularity. Here the wider shape of malignant tumors may indicate a greater predictor for malignancy. 

In [None]:
# Violin plot of Worst Concave Points split by diagnosis class.
fig, ax = plt.subplots(figsize = (6,4))
sns.violinplot(x = 'Diagnosis', y = 'Worst Concave Points', data = df, ax = ax)
ax.set_title('Worst Concave Points by Diagnosis')
ax.set_xlabel('Diagnosis (0 = Benign, 1 = Malignant)')
fig.savefig('worst_concave_points_violinplot.png')
plt.show()

**Description**
-- The goal of this Scatter Plot is to compare Mean Radius vs. Mean Area values for benign (0) and malignant (1) tumors, colored by Diagnosis. 

**Importance** 
-- There is a clear separation of benign and malignant points which demonstrates that size-related features can be used to facilitate diagnosis. 

In [None]:
# Interactive scatter of Mean Radius against Mean Area, colored by diagnosis.
diagnosis_label = {'Diagnosis':'Diagnosis (0 = Benign, 1 = Malignant)'}
fig2 = px.scatter(
    df,
    x = 'Mean Radius',
    y = 'Mean Area',
    color = 'Diagnosis',
    title = 'Mean Radius vs. Mean Area by Diagnosis',
    labels = diagnosis_label,
)
fig2.write_image('mean_radius_vs_area.png')
fig2.show()

**Description**
-- This pair plot shows pairwise relationships between Mean Radius, Mean Texture, Mean Concavity, and Worst Concave Points, colored by Diagnosis. 

**Importance** 
-- The goal here was to see feature interactions and class separation, which highlights which combinations are most predictive. 

In [None]:
# Pairwise scatter/density grid of four features, colored by diagnosis.
top_features = ['Mean Radius', 'Mean Texture', 'Mean Concavity', 'Worst Concave Points', 'Diagnosis']
sns.pairplot(df[top_features], hue = 'Diagnosis')
# y > 1 lifts the figure title above the top row of panels, which it
# otherwise overlaps because the grid fills the whole figure.
plt.suptitle('Pair Plot of Key Features by Diagnosis', y = 1.02)
# bbox_inches='tight' keeps the raised title inside the saved image.
plt.savefig('pair_plot.png', bbox_inches = 'tight')
plt.show()

In [None]:
# Heatmap of the pairwise correlations between all columns, including Diagnosis.
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize = (10,8))
sns.heatmap(corr_matrix, cmap = 'coolwarm', vmin = -1, vmax = 1, ax = ax)
ax.set_title('Correlation Heatmap of All Features')
fig.savefig('correlation_heatmap.png')
plt.show()

In [None]:
# Correlation of every feature with the diagnosis label, sorted ascending.
# The label's correlation with itself is removed.
diagnosis_corr = df.corr()['Diagnosis']
correlations = diagnosis_corr.drop('Diagnosis').sort_values()
print(correlations)

In [None]:
# Bar chart of each feature's correlation with Diagnosis.
fig, ax = plt.subplots(figsize = (10, 6))
correlations.plot(kind = 'bar', ax = ax)
ax.set_title('Correlation of Features with Diagnosis')
ax.set_ylabel('Correlation Coefficient')
fig.savefig('diagnosis_correlations.png')
plt.show()

## Advanced Analysis

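In this section, the goal is to reduce the standardized features to a few principal components with PCA and to fit a logistic regression model, in order to see how well the features separate benign from malignant tumors.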

In [None]:
# Feature matrix (all columns except the label) and its standardized version.
# Note: this refits the existing `scaler` object on X.
X = df.drop(columns = 'Diagnosis')
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Project the standardized features onto the first three principal components
# and attach the diagnosis label for plotting.
pca = PCA(n_components = 3)
X_pca = pca.fit_transform(X_scaled)

component_names = [f'PC{k}' for k in range(1, 4)]
pca_df = pd.DataFrame(X_pca, columns = component_names).assign(Diagnosis = df['Diagnosis'])

#### 2D PCA Scatter Plot

**Description**
-- This scatter plot shows the first two principal components (PC1, PC2) colored by diagnosis.

**Importance** 
-- PCA reduces 30 features to 2D, clearly demonstrating class separation. The clustering of benign vs. malignant points supports the hypothesis that features can be used to distinguish diagnoses. 

In [None]:
# Scatter of the first two principal components, colored by diagnosis.
ax = sns.scatterplot(data = pca_df, x = 'PC1', y = 'PC2', hue = 'Diagnosis')
ax.set_title('PCA: PC1 vs. PC2 by Diagnosis')
ax.figure.savefig('pca_2d.png')
plt.show()

#### 3D PCA Scatter Plot

**Description**
-- This is an *interactive* 3D scatter plot which shows PC1, PC2, and PC3, colored by diagnosis.

**Importance** 
-- The 3D view enhances class separation visualization. The interactivity is an added touch, and permits exploration of the clusters, reinforcing that features can be used to predict diagnosis. 

In [None]:
# Interactive 3D scatter of the three principal components, colored by diagnosis.
# The legend label previously lacked its closing parenthesis.
fig3 = px.scatter_3d(pca_df, x = 'PC1', y = 'PC2', z = 'PC3', color = 'Diagnosis', 
                     title = '3-Dimensional Scatter Plot by Diagnosis',
                     labels = {'Diagnosis': 'Diagnosis (0 = Benign, 1 = Malignant)'})
fig3.write_image('pca_3d.png')
fig3.show()

#### PCA Explained Variance Ratio

**Description**
-- This bar chart shows the variance explained by the first three principal components.

**Importance** 
-- A high explained variance ratio is a good indicator that a few components capture most of the information, further validating the use of PCA in visualizing predictive patterns.

In [None]:
# Share of total variance captured by each of the three fitted components.
fig, ax = plt.subplots(figsize = (6,4))
ax.bar(range(1,4), pca.explained_variance_ratio_, color = 'green')
ax.set_title('PCA Explained Variance Ratio')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variance Ratio')

fig.savefig('pca_variance.png')
plt.show()

In [None]:
# Fit a logistic regression with default settings on all standardized samples;
# its coefficients are inspected in the next section.
model = LogisticRegression().fit(X_scaled, df['Diagnosis'])
model

#### Feature Importance From Logistic Regression

**Description**
-- This bar plot shows the coefficients of each feature in a logistic regression model. 

**Importance** 
-- The largest coefficients are greater indicators/predictors of which features drive diagnosis.

In [None]:
# Logistic-regression coefficient for each feature, sorted ascending.
# coef_[0] is the single coefficient row of the binary model.
importance = pd.Series(model.coef_[0], index = X.columns).sort_values()

plt.figure(figsize=(10,6))
# Plot the coefficient series computed above (the previous name `tango` was
# never defined and raised NameError).
importance.plot(kind = 'bar')
plt.title('Feature Importance from Logistic Regression')
plt.ylabel('Coefficient')

plt.savefig('feature_importance.png')
plt.show()

#### Distribution of PC1, PC2, and PC3 by Diagnosis

**Description**
-- These histograms show the distribution of PC1, PC2, and PC3 Values for benign and malignant tumors.

**Importance** 
-- PC1 captures the most variance. Distinct distributions for each class suggest that the PCA-derived features are predictive. 

In [None]:
# Histogram of PC1 values, one color per diagnosis class.
fig, ax = plt.subplots(figsize = (10, 6))
sns.histplot(data = pca_df, x = 'PC1', hue = 'Diagnosis', bins = 20, ax = ax)

ax.set_title('Distribution of PC1 by Diagnosis')
ax.set_xlabel('PC1')
fig.savefig('pc1_distribution.png')

plt.show()

In [None]:
# Histogram of PC2 values, one color per diagnosis class.
fig, ax = plt.subplots(figsize = (10, 6))
sns.histplot(data = pca_df, x = 'PC2', hue = 'Diagnosis', bins = 20, ax = ax)

ax.set_title('Distribution of PC2 by Diagnosis')
ax.set_xlabel('PC2')
fig.savefig('pc2_distribution.png')

plt.show()

In [None]:
# Histogram of PC3 values, one color per diagnosis class.
fig, ax = plt.subplots(figsize = (10, 6))
sns.histplot(data = pca_df, x = 'PC3', hue = 'Diagnosis', bins = 20, ax = ax)

ax.set_title('Distribution of PC3 by Diagnosis')
ax.set_xlabel('PC3')
fig.savefig('pc3_distribution.png')

plt.show()

In [None]:
# Box plot of PC2 split by diagnosis class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Diagnosis', y='PC2', data=pca_df, ax=ax)

ax.set_title('PC2 by Diagnosis')
ax.set_xlabel('Diagnosis (0 = Benign, 1 = Malignant)')
ax.set_ylabel('PC2')

fig.savefig('pc2_box.png')
plt.show()

In [None]:
# Number of samples in each diagnosis class (0 = benign, 1 = malignant).
df['Diagnosis'].value_counts()

## Model Performance

In [None]:
# Hold out 20% of the samples for evaluation. The split is made on the raw
# features and a scaler is fitted on the training portion only, so statistics
# of the held-out samples never influence the preprocessing. Splitting the
# already-scaled X_scaled (fitted on every row) leaked test-set information.
# With the same random_state and row count, the same rows end up in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, df['Diagnosis'], test_size = 0.2, random_state = 42
)

# A separate scaler is used so the `scaler` fitted on the full data is not overwritten.
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Refit the logistic regression on the training split only.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)               # predicted class labels (0/1)
y_prob = model.predict_proba(X_test)[:, 1]   # predicted probability of class 1 (malignant)

#### Confusion Matrix

**Description**
-- This heatmap shows the confusion matrix for the Logistic Regression Model's predictions.

**Importance** 
-- High true positives and negatives (the model correctly classifying benign/malignant tumors) demonstrate the model's accuracy, further demonstrating that feature analysis can be used in diagnosis. 

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize = (6,4))
sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'Blues', ax = ax)
ax.set_title('Confusion Matrix for Logistic Regression')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

fig.savefig('confusion_matrix.png')
plt.show()

#### ROC Curve

**Description**
-- This plot shows the Receiver Operating Characteristic (ROC) curve **and** Area Under the Curve (AUC) for the model. 

**Importance** 
--  An ROC curve is a graphical representation of the performance of a classification model. It shows the trade-off between sensitivity (the true positive rate, TPR) and the false positive rate (FPR, which equals 1 − specificity) at different thresholds. ROC curves are useful because they evaluate a classifier across all thresholds rather than at a single one. Here, the ROC curve is being used to help determine the optimal threshold for distinguishing between benign and malignant tumors. 

In [None]:
# ROC curve of the held-out predictions; the thresholds returned by
# roc_curve are not needed and are discarded.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize = (6,4))

ax.plot(fpr, tpr, label = f'ROC Curve (AUC = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
ax.plot([0, 1], [0, 1], 'k--')

ax.set_title('ROC Curve for Logistic Regression')
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.legend()

fig.savefig('roc_curve.png')
plt.show()

#### Prediction Probability Distribution

**Description**
-- This is an interactive histogram which shows the model's predicted probabilities for malignant tumors, colored by actual Diagnosis.

**Importance** 
--  Well-separated probability distributions for benign and malignant cases confirm the model’s ability to distinguish classes, which supports the hypothesis.

In [None]:
# Pair each held-out sample's predicted probability with its actual label.
pred_columns = {'Probability': y_prob, 'Diagnosis': y_test}
pred_df = pd.DataFrame(pred_columns)

# Interactive histogram of the predicted probabilities, colored by actual diagnosis.
fig4 = px.histogram(
    pred_df,
    x='Probability',
    color='Diagnosis',
    nbins=20,
    title='Prediction Probability Distribution',
)
fig4.update_layout(
    xaxis_title='Predicted Probability of Malignant',
    yaxis_title='Count',
)

fig4.write_image('prediction_distribution.png')
fig4.show()

In [None]:
# The four features with the largest absolute logistic-regression coefficient.
top_features = importance.abs().sort_values(ascending = False).head(4).index

# One box plot per feature, split by diagnosis class; each is saved as
# '<feature>_box.png'. The loop counter that used to come from
# enumerate(..., 21) was never used and has been removed.
for feature in top_features:
    plt.figure(figsize = (10,6))
    sns.boxplot(x = 'Diagnosis', y = feature, data = df)
    
    plt.title(f'{feature} by Diagnosis')
    plt.xlabel('Diagnosis (0 = Benign, 1 = Malignant)')
    plt.ylabel(feature)
    plt.savefig(f'{feature}_box.png')
    plt.show()