In [163]:
import os
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Local download location and remote source of the breast-cancer-wisconsin files.
folder = 'data'
file_data = folder+'/breast-cancer-wisconsin.data'
file_names =  folder+'/breast-cancer-wisconsin.names'
base_url = 'https://raw.githubusercontent.com/01-edu/public/master/subjects/ai/classification/data'

# Create the data folder if needed; exist_ok=True makes a separate existence check unnecessary.
os.makedirs(folder, exist_ok=True)

# Download each file only when it is not already on disk.
for local_path in (file_data, file_names):
    if os.path.exists(local_path):
        print(f"{local_path} already exists.")
    else:
        # Pure-Python download (no dependency on an external `wget` binary);
        # the file is saved inside `folder` under its original name.
        urllib.request.urlretrieve(f"{base_url}/{os.path.basename(local_path)}", local_path)

data/breast-cancer-wisconsin.data already exists.
data/breast-cancer-wisconsin.names already exists.


In [164]:
# Load the full text of the .names file, which describes the dataset
with open(file_names, "r") as names_file:
    names_content = names_file.read()

# Show only the first 500 characters as a quick preview
preview = names_content[:500]
print(preview)

Citation Request:
   This breast cancer databases was obtained from the University of Wisconsin
   Hospitals, Madison from Dr. William H. Wolberg.  If you publish results
   when using this database, then please include this information in your
   acknowledgements.  Also, please cite one or more of:

   1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear 
      programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.

   2. William H. Wolberg and O.L. Mangasarian


In [165]:
# Column names, in file order, taken from the dataset description
column_names = [
    "Sample code number", "Clump Thickness", "Uniformity of Cell Size",
    "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size",
    "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"
]

# Read the data into a DataFrame. The file has no header row, so the names are
# supplied explicitly. `file_data` is reused so the path read here is always the
# same one the setup cell checked/downloaded.
df = pd.read_csv(file_data, header=None, names=column_names)

# Show the first few rows of the DataFrame to understand its structure
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [166]:
# Summary statistics per column. 'Bare Nuclei' does not appear in the output
# because it is still stored as object dtype at this point.
df.describe()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [167]:
# Inspect the dtype of every column to spot any that were not parsed as numbers
df.dtypes

Sample code number              int64
Clump Thickness                 int64
Uniformity of Cell Size         int64
Uniformity of Cell Shape        int64
Marginal Adhesion               int64
Single Epithelial Cell Size     int64
Bare Nuclei                    object
Bland Chromatin                 int64
Normal Nucleoli                 int64
Mitoses                         int64
Class                           int64
dtype: object

In [168]:
# Total number of NaN cells across the whole frame.
# Note: placeholder strings such as '?' are not NaN, so isna() does not count them.
total_missing = df.isna().sum().sum()
print("Missing values: ", total_missing)


Missing values:  0


In [169]:
# List the distinct raw values of "Bare Nuclei" to reveal any non-numeric entries
bare_nuclei_values = df['Bare Nuclei'].unique()
print(bare_nuclei_values)

['1' '10' '2' '4' '3' '9' '7' '?' '5' '8' '6']


In [170]:
# Convert "Bare Nuclei" to numeric; errors='coerce' turns non-convertible
# entries (such as '?') into NaN
bare_nuclei = pd.to_numeric(df['Bare Nuclei'], errors='coerce')

# Median of the column, computed with NaN entries ignored
median_value = bare_nuclei.median()

# Replace NaN values with the median and assign the result back to the frame.
# Assigning (rather than calling fillna(..., inplace=True) on df['Bare Nuclei'])
# avoids chained in-place modification, which newer pandas versions warn about
# and which does not update `df` under Copy-on-Write.
df['Bare Nuclei'] = bare_nuclei.fillna(median_value)

# Check that all values are now numeric and NaNs are replaced
df['Bare Nuclei'].unique()

array([ 1., 10.,  2.,  4.,  3.,  9.,  7.,  5.,  8.,  6.])

In [171]:
# Use 'Sample code number' as the index of the DataFrame.
# The guard makes the cell safe to re-run: once the column has been moved into
# the index it is no longer in df.columns, and calling set_index again would
# raise a KeyError.
if 'Sample code number' in df.columns:
    df = df.set_index('Sample code number')

# Show the first few rows of the updated DataFrame
df.head()

Unnamed: 0_level_0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
Sample code number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1.0,3,1,1,2
1002945,5,4,4,5,7,10.0,3,2,1,2
1015425,3,1,1,1,2,2.0,3,1,1,2
1016277,6,8,8,1,3,4.0,3,7,1,2
1017023,4,1,1,3,2,1.0,3,1,1,2


In [172]:
# Number of rows whose Class is 2 (the 'Benign' label), as a plain int
benign_count = int((df['Class'] == 2).sum())

# Share of 'Benign' samples in the whole dataset
benign_proportion = benign_count / len(df)

# A model that always predicts 'Benign' would reach exactly this accuracy
baseline_accuracy = benign_proportion

print("Question 1")
benign_proportion

Question 1


0.6552217453505007

In [173]:
# Target (y) is the 'Class' column; features (X) are every other column
y = df['Class']
X = df.drop(columns='Class')

# 80/20 split with a fixed seed, stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
    stratify=y,
)

# Shapes of the four resulting pieces, in the same order as above
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((559, 9), (140, 9), (559,), (140,))

In [174]:
# Build the Logistic Regression model and fit it on the training set
# (fit returns the fitted estimator itself)
lr = LogisticRegression(random_state=43).fit(X_train, y_train)

# Class predictions for both splits
y_train_pred, y_test_pred = lr.predict(X_train), lr.predict(X_test)

# Share of 'Benign' (Class == 2) samples in each split
benign_proportion_train = int((y_train == 2).sum()) / len(y_train)
benign_proportion_test = int((y_test == 2).sum()) / len(y_test)

print("Question 2")
print("test: ", benign_proportion_test, "\ntrain: ", benign_proportion_train)

Question 2
test:  0.6571428571428571 
train:  0.6547406082289803


In [175]:
# Accuracy on each split. These were previously referenced without being
# defined anywhere in the notebook, so the cell failed on a fresh kernel.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Report the first 10 class predictions, the first 10 values of column 1 of
# predict_proba (the second class in lr.classes_), and the accuracy per split.
print("Question 3\n\n")
print("# Train")
print("Class prediction on train set:\n", y_train_pred[:10],"\n")
print("Probability prediction on train set:\n", lr.predict_proba(X_train)[:10, 1],"\n")
print("Score on train set:\n",train_accuracy,"\n")
print(" #Test\n")
print("Class prediction on test set:\n", y_test_pred[:10],"\n")
print("Probability prediction on test set:\n", lr.predict_proba(X_test)[:10, 1],"\n")
print("Score on test set:\n", test_accuracy)

Question 3


# Train
Class prediction on train set:
 [4 2 4 2 2 2 2 4 2 2] 

Probability prediction on train set:
 [0.99600415 0.00908666 0.99992744 0.00528803 0.02097154 0.00582772
 0.03565076 0.99515326 0.00788281 0.01065484] 

Score on train set:
 0.9695885509838998 

 #Test

Class prediction on test set:
 [2 2 2 4 2 4 2 2 2 4] 

Probability prediction on test set:
 [0.01747203 0.22495309 0.00698756 0.54020801 0.0015289  0.99862249
 0.33607994 0.01227679 0.00438157 0.99972344] 

Score on test set:
 0.9642857142857143


In [176]:
# Confusion matrix of true labels vs. predictions, for the training split first
# and the test split second
cm_train, cm_test = (
    confusion_matrix(y_true, y_pred)
    for y_true, y_pred in ((y_train, y_train_pred), (y_test, y_test_pred))
)

In [177]:

print("Question 4\n")

# Print each heading followed by its matrix: training split first, then test
for heading, matrix in (
    ("Confusion Matrix training Set:", cm_train),
    ("\nConfusion Matrix test Set:", cm_test),
):
    print(heading)
    print(matrix)

Question 4

Confusion Matrix training Set:
[[357   9]
 [  8 185]]

Confusion Matrix test Set:
[[90  2]
 [ 3 45]]
