# Predicting the survivability rate of breast cancer patients


Here we import the neccesary libraries for our data preprocessing.

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


Load the data into the data frame

In [2]:

data_load = pd.read_csv("Breast_Cancer_dataset.csv")
data_pre = pd.DataFrame(data_load)
data_pre.shape

(4024, 16)

# Data Preprocessing
## Check for missing values

In [3]:
# Check for missing values coded as NA or -1 or empty string
missing_values = data_pre.isin(['NA', -1, ''])
if missing_values.any().any():
    print("Missing values detected:")
    print(data_pre[missing_values.any(axis=1)])
else:
    print("No missing values detected.")

No missing values detected.


## Encode variables

In [4]:
Martial_mapping = {'Married':1, 'Divorced':2,'Single ':3, 'Widowed':4, 'Separated':5}
Race_mapping = {'White':1, 'Black':2,'Other':3}
T_stage_mapping = {'T1':1, 'T2':2,'T3':3, 'T4':4}
N_stage_mapping = {'N1':1, 'N2':2,'N3':3, 'N4':4}
sixth_stage_mapping = {'IA':1, 'IB':2,'IIA':3, 'IIB':4,'IIIA':5,'IIIB':6, 'IIIC':7}
differentiate_mapping = {'Undifferentiated': 0,'Poorly differentiated':1, 'Moderately differentiated':2,'Well differentiated':3}
grade_mapping = {'1':1, '2':2, '3':3, ' anaplastic; Grade IV': 4}
stage_mapping = {'Distant':0, 'Regional':1}
pos_neg_mapping = {'Positive':0, 'Negative':1}
status_mapping = {'Dead':0, 'Alive':1}

# Replace values in column X based on the mappings
data_pre['Race'] = data_pre['Race'].map(Race_mapping)
data_pre['Marital Status'] = data_pre['Marital Status'].map(Martial_mapping)
data_pre['T Stage'] = data_pre['T Stage'].map(T_stage_mapping)
data_pre['N Stage'] = data_pre['N Stage'].map(N_stage_mapping)
data_pre['6th Stage'] = data_pre['6th Stage'].map(sixth_stage_mapping)
data_pre['differentiate'] = data_pre['differentiate'].map(differentiate_mapping)
data_pre['Grade'] = data_pre['Grade'].map(grade_mapping)
data_pre['A Stage'] = data_pre['A Stage'].map(stage_mapping)
data_pre['Estrogen Status'] = data_pre['Estrogen Status'].map(pos_neg_mapping)
data_pre['Progesterone Status'] = data_pre['Progesterone Status'].map(pos_neg_mapping)
data_pre['Status'] = data_pre['Status'].map(status_mapping)

data_pre


Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,1,1,1,1,3,1,3,1,4,0,0,24,1,60,1
1,50,1,1,2,2,5,2,2,1,35,0,0,14,5,62,1
2,58,1,2,3,3,7,2,2,1,63,0,0,14,7,75,1
3,58,1,1,1,1,3,1,3,1,18,0,0,2,1,84,1
4,47,1,1,2,1,4,1,3,1,41,0,0,3,1,50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,62,3,1,1,1,3,2,2,1,9,0,0,1,1,49,1
4020,56,1,2,2,2,5,2,2,1,46,0,0,14,8,69,1
4021,68,1,1,2,1,4,2,2,1,22,0,1,11,3,69,1
4022,58,2,2,2,1,4,2,2,1,44,0,0,11,1,72,1


## Detect Outliers
Detect Outliers by using Euclidean Distance and K Nearest Neighbours, where k = 10, and threshold=15

In [5]:
# Calculate distances using Euclidean distance
def calculate_distances(data):
    return np.sum(data ** 2, axis=1)

# Fit KNN model
def fit_knn(data, k=5):
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(data)
    return knn

# Identify outliers based on KNN
def identify_outliers_knn(data, k=10, threshold=10):
    knn = fit_knn(data, k)
    distances, _ = knn.kneighbors(data)
    avg_distances = np.mean(distances, axis=1)
    median_distance = np.median(avg_distances)
    mad = np.median(np.abs(avg_distances - median_distance))
    threshold_distance = median_distance + threshold * mad
    outliers_indices = np.where(avg_distances > threshold_distance)[0]
    return outliers_indices


numerical_data = data_pre.select_dtypes(include=[np.number]).values
outliers_indices = identify_outliers_knn(numerical_data, k=10, threshold=15)



# print(data_pre.iloc[outliers_indices])
num_outliers = len(outliers_indices)
print("Number of outliers identified:", num_outliers)

# Remove outliers from DataFrame
data_pre = data_pre.drop(outliers_indices)
data_pre.reset_index(drop=True, inplace=True)
data_pre

Number of outliers identified: 4


Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,1,1,1,1,3,1,3,1,4,0,0,24,1,60,1
1,50,1,1,2,2,5,2,2,1,35,0,0,14,5,62,1
2,58,1,2,3,3,7,2,2,1,63,0,0,14,7,75,1
3,58,1,1,1,1,3,1,3,1,18,0,0,2,1,84,1
4,47,1,1,2,1,4,1,3,1,41,0,0,3,1,50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4015,62,3,1,1,1,3,2,2,1,9,0,0,1,1,49,1
4016,56,1,2,2,2,5,2,2,1,46,0,0,14,8,69,1
4017,68,1,1,2,1,4,2,2,1,22,0,1,11,3,69,1
4018,58,2,2,2,1,4,2,2,1,44,0,0,11,1,72,1


 ## Normalize
Next we will normalize our data using min,max normalization  $X_n = \frac{x-min_x}{max_x-min_x}$

In [6]:
norm_data = (data_pre-data_pre.min())/(data_pre.max()-data_pre.min())
norm_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4020 entries, 0 to 4019
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     4020 non-null   float64
 1   Race                    4020 non-null   float64
 2   Marital Status          4020 non-null   float64
 3   T Stage                 4020 non-null   float64
 4   N Stage                 4020 non-null   float64
 5   6th Stage               4020 non-null   float64
 6   differentiate           4020 non-null   float64
 7   Grade                   4020 non-null   float64
 8   A Stage                 4020 non-null   float64
 9   Tumor Size              4020 non-null   float64
 10  Estrogen Status         4020 non-null   float64
 11  Progesterone Status     4020 non-null   float64
 12  Regional Node Examined  4020 non-null   float64
 13  Reginol Node Positive   4020 non-null   float64
 14  Survival Months         4020 non-null   

# Dimension Reduction

## Pearson Correlation -> Feature Selection 

In [7]:
def highly_correlated_features(df, threshold):
    correlation_matrix = df.corr() # Calculate the correlation matrix
    
    features_to_drop = set()
    highly_correlated_pairs = []
    
    for i in range(correlation_matrix.shape[0]):
        for j in range(i+1, correlation_matrix.shape[1]):
            # If the absolute value of the correlation coefficient is above the threshold, add the feature to the set
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                features_to_drop.add(correlation_matrix.columns[j])
                highly_correlated_pairs.append((correlation_matrix.columns[i], correlation_matrix.columns[j], correlation_matrix.iloc[i, j]))

    return features_to_drop , highly_correlated_pairs


In [8]:
drop_features, correlated_pairs  = highly_correlated_features(norm_data, 0.8)


print("We can drop these Features:",drop_features)
print()
print("Correlated Pairs and their coefficient")
for i in correlated_pairs:
    print(i)

We can drop these Features: {'Grade', '6th Stage', 'Tumor Size', 'Reginol Node Positive'}

Correlated Pairs and their coefficient
('T Stage', 'Tumor Size', 0.8103496182839505)
('N Stage', '6th Stage', 0.8816250425418869)
('N Stage', 'Reginol Node Positive', 0.8409374651823236)
('differentiate', 'Grade', -1.0)


In [9]:
filtered_data = norm_data.drop(columns=drop_features)
filtered_data

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,differentiate,A Stage,Estrogen Status,Progesterone Status,Regional Node Examined,Survival Months,Status
0,0.974359,0.0,0.00,0.000000,0.0,0.333333,1.0,0.0,0.0,0.383333,0.556604,1.0
1,0.512821,0.0,0.00,0.333333,0.5,0.666667,1.0,0.0,0.0,0.216667,0.575472,1.0
2,0.717949,0.0,0.25,0.666667,1.0,0.666667,1.0,0.0,0.0,0.216667,0.698113,1.0
3,0.717949,0.0,0.00,0.000000,0.0,0.333333,1.0,0.0,0.0,0.016667,0.783019,1.0
4,0.435897,0.0,0.00,0.333333,0.0,0.333333,1.0,0.0,0.0,0.033333,0.462264,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4015,0.820513,1.0,0.00,0.000000,0.0,0.666667,1.0,0.0,0.0,0.000000,0.452830,1.0
4016,0.666667,0.0,0.25,0.333333,0.5,0.666667,1.0,0.0,0.0,0.216667,0.641509,1.0
4017,0.974359,0.0,0.00,0.333333,0.0,0.666667,1.0,0.0,1.0,0.166667,0.641509,1.0
4018,0.717949,0.5,0.25,0.333333,0.0,0.666667,1.0,0.0,0.0,0.166667,0.669811,1.0


## Sequential Forward Selection -> Feature Ranking + Selection

In [10]:
X = filtered_data.iloc[:, 0:len(filtered_data.columns)-1]
y = filtered_data.Status # 'Status' is the target column we are trying to predict
print(y.shape, X.shape)

(4020,) (4020, 11)


In [11]:
estimator = LinearRegression()
feature_names = list(X.columns)

# Initialize Sequential Forward Selector
sfs = SequentialFeatureSelector(estimator, k_features=8, forward=True, scoring='neg_mean_squared_error', cv=5)
sfs.fit(X, y)

# Get the selected feature names
selected_feature_indices = sfs.k_feature_idx_
selected_feature_names = [feature_names[i] for i in selected_feature_indices]
print(f"Selected features: {selected_feature_names}")

# Drop low ranking features
X = X[selected_feature_names]
X

Selected features: ['Age', 'T Stage', 'N Stage', 'differentiate', 'Estrogen Status', 'Progesterone Status', 'Regional Node Examined', 'Survival Months']


Unnamed: 0,Age,T Stage,N Stage,differentiate,Estrogen Status,Progesterone Status,Regional Node Examined,Survival Months
0,0.974359,0.000000,0.0,0.333333,0.0,0.0,0.383333,0.556604
1,0.512821,0.333333,0.5,0.666667,0.0,0.0,0.216667,0.575472
2,0.717949,0.666667,1.0,0.666667,0.0,0.0,0.216667,0.698113
3,0.717949,0.000000,0.0,0.333333,0.0,0.0,0.016667,0.783019
4,0.435897,0.333333,0.0,0.333333,0.0,0.0,0.033333,0.462264
...,...,...,...,...,...,...,...,...
4015,0.820513,0.000000,0.0,0.666667,0.0,0.0,0.000000,0.452830
4016,0.666667,0.333333,0.5,0.666667,0.0,0.0,0.216667,0.641509
4017,0.974359,0.333333,0.0,0.666667,0.0,1.0,0.166667,0.641509
4018,0.717949,0.333333,0.0,0.666667,0.0,0.0,0.166667,0.669811


# Predictive Modelling
## 1. KNN 

In [12]:
# Implement the KNN function from scratch
def knn(X_train, y_train, X_test, k):
    # Calculate the Euclidean distance between the test point and all training points
    distances = np.sqrt(np.sum((X_train - X_test) ** 2, axis=1))
    
    # Sort the distances and return the indices of the first k neighbors
    k_neighbors = np.argsort(distances)[:k]
    
    # Extract the labels of the k nearest neighbor training samples
    k_nearest_labels = y_train[k_neighbors]
    
    # Return the most common class label
    return np.bincount(k_nearest_labels).argmax()

In [13]:
y = y.astype(int) # Convert the labels in the target column 'Status' to integers
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
k = 10
y_pred = np.array([knn(X_train.values, y_train.values, x, k) for x in X_test.values]) # Predict the class labels for the test set using the KNN function

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:",recall_score(y_test, y_pred))
print("F1 Score:",f1_score(y_test, y_pred))

Accuracy: 0.8980099502487562
Precision: 0.9154160982264665
Recall: 0.9710564399421129
F1 Score: 0.9424157303370786


1. KNN (K-Nearest Neighbors):
   - Summary: KNN is a simple, instance-based learning algorithm that classifies a data point based on the majority class of its nearest neighbors.
   - Pros: Easy to understand and implement, no training phase, works well with small datasets and non-linear relationships.
   - Cons: Computationally expensive during testing phase, sensitive to irrelevant features and the choice of distance metric.
   - Main Hyperparameters: 
     - `k`: Number of nearest neighbors to consider.
     - Distance metric (e.g., Euclidean distance, Manhattan distance).

## 2. Naïve Bayes

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:",recall_score(y_test, y_pred))
print("F1 Score:",f1_score(y_test, y_pred))

Accuracy: 0.8482587064676617
Precision: 0.9028571428571428
Recall: 0.9212827988338192
F1 Score: 0.9119769119769119



2. Naïve Bayes:
   - Summary: Naïve Bayes is a probabilistic classifier based on Bayes' theorem with the assumption of independence between features.
   - Pros: Fast training and prediction, performs well with small datasets and high-dimensional feature spaces, handles missing values well.
   - Cons: Strong independence assumption may lead to suboptimal performance in some cases, especially when features are correlated.
   - Main Hyperparameters: None (though some variants may have smoothing parameters).

## 3. C4.5 Decision Tree 

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:",recall_score(y_test, y_pred))
print("F1 Score:",f1_score(y_test, y_pred))

Accuracy: 0.8445273631840796
Precision: 0.9118942731277533
Recall: 0.9052478134110787
F1 Score: 0.9085588880760789



3. C4.5 Decision Tree:
   - Summary: C4.5 is a decision tree algorithm that recursively splits the data based on the feature that provides the most information gain.
   - Pros: Easy to interpret and visualize, handles both numerical and categorical data, automatically handles feature selection.
   - Cons: Prone to overfitting, sensitive to noisy data and outliers, may create biased trees for imbalanced datasets.
   - Main Hyperparameters: 
     - Maximum tree depth.
     - Minimum number of samples required to split a node.
     - Minimum impurity decrease required for a split.

## 4. Random Forest 

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:",recall_score(y_test, y_pred))
print("F1 Score:",f1_score(y_test, y_pred))

Accuracy: 0.8967661691542289
Precision: 0.9124487004103967
Recall: 0.9723032069970845
F1 Score: 0.9414255469301341



4. Random Forest:
   - Summary: Random Forest is an ensemble learning method that constructs multiple decision trees and combines their predictions through averaging or voting.
   - Pros: Reduces overfitting compared to individual decision trees, handles high-dimensional data well, robust to noisy data.
   - Cons: More complex than single decision trees, longer training time and higher memory usage, less interpretable.
   - Main Hyperparameters: 
     - Number of trees in the forest.
     - Maximum tree depth.
     - Number of features to consider at each split.
     - Minimum number of samples required to split a node.

## 5.Gradient Boosting

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:",recall_score(y_test, y_pred))
print("F1 Score:",f1_score(y_test, y_pred))

Accuracy: 0.8955223880597015
Precision: 0.9123287671232877
Recall: 0.9708454810495627
F1 Score: 0.940677966101695



5. Gradient Boosting:
   - Summary: Gradient Boosting builds an ensemble of weak learners (typically decision trees) sequentially, where each new model corrects errors made by the previous ones.
   - Pros: Often produces highly accurate models, handles both numerical and categorical data, less prone to overfitting compared to Random Forest.
   - Cons: More sensitive to hyperparameters and prone to overfitting with large datasets, longer training time, and higher computational cost.
   - Main Hyperparameters: 
     - Learning rate (shrinkage parameter).
     - Number of trees (boosting iterations).
     - Maximum tree depth.
     - Minimum number of samples required to split a node.

# Hyperparameter Tuning 

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6]
}

rf = RandomForestClassifier()
rf_cv = GridSearchCV(rf, param_grid, cv=5)
rf_cv.fit(X_train, y_train)

print("Best parameters:", rf_cv.best_params_)
print("Accuracy:", rf_cv.best_score_)
print("Precision:", precision_score(y_test, rf_cv.predict(X_test)))
print("Recall:", recall_score(y_test, rf_cv.predict(X_test)))
print("F1 Score:", f1_score(y_test, rf_cv.predict(X_test)))

Best parameters: {'max_depth': 7, 'min_samples_split': 6, 'n_estimators': 100}
Accuracy: 0.9082715918201754
Precision: 0.9118046132971506
Recall: 0.9795918367346939
F1 Score: 0.9444834855938158


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'learning_rate':  [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
}

gb = GradientBoostingClassifier()
gb_cv = GridSearchCV(gb, param_grid, cv=5)
gb_cv.fit(X_train, y_train)

print("Best parameters:", gb_cv.best_params_)
print("Accuracy:", gb_cv.best_score_)
print("Precision:", precision_score(y_test, gb_cv.predict(X_test)))
print("Recall:", recall_score(y_test, gb_cv.predict(X_test)))
print("F1 Score:", f1_score(y_test, gb_cv.predict(X_test)))


Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 0.9067178308202042
Precision: 0.9139344262295082
Recall: 0.9752186588921283
F1 Score: 0.9435825105782794



## Analysis
| **Classifier** | **Hyperparameters** | **Accuracy** | **Precision** | **Recall** | **F1 Score** |
|----------------|---------------------|--------------|---------------|------------|--------------|
| Random Forest  | n_estimators=100, max_depth=7, min_samples_split=6 | 0.9073 | 0.9107 | 0.9810 | 0.9445 |
| Gradient Boosting | learning_rate: 0.1, n_estimators=50, max_depth=3, min_samples_split=2 | 0.9067 | 0.9139 | 0.9752 | 0.9436 |


Both models are fairly good performance:
- Random Forest Classifier has a slightly higher recall and F1 score
- Gradient Boosting Classifier has a slightly higher precision

## Conclusion
- In this project, we analyzed the Breast Cancer dataset and built machine learning models to predict the survival status of patients based on various features.
- We preprocessed the data by handling missing values, encoding categorical variables, identifying and removing outliers, and selecting relevant features.
- We implemented the KNN algorithm from scratch and evaluated the performance of the models using accuracy, precision, recall, and F1 score metrics.
- We compared the performance of different classifiers, including Linear Regression, Naive Bayes, Decision Tree, Random Forest, and Gradient Boosting, using hyperparameter tuning with GridSearchCV.
- Random Forest and Gradient Boosting classifiers achieved the highest recall, and F1 score on the test set, indicating that they are suitable for predicting the survival status of breast cancer patients.

References:

- Predictive Analytics for Dummies By Professor Annase Bari