# Step 1: Data Cleaning and Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the hepatitis data
df = pd.read_csv('Breast_Cancer_dataset.csv')

df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24.0,1,60,Alive
1,50.0,White,,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14.0,5,62,Alive
2,58.0,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14.0,7,75,Alive
3,58.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,,Positive,Positive,2.0,1,84,Alive
4,47.0,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,,Positive,3.0,1,50,Alive


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     3823 non-null   float64
 1   Race                    3622 non-null   object 
 2   Marital Status          3703 non-null   object 
 3   T Stage                 4024 non-null   object 
 4   N Stage                 4024 non-null   object 
 5   6th Stage               4024 non-null   object 
 6   differentiate           4024 non-null   object 
 7   Grade                   4024 non-null   object 
 8   A Stage                 4024 non-null   object 
 9   Tumor Size              3622 non-null   float64
 10  Estrogen Status         3823 non-null   object 
 11  Progesterone Status     4024 non-null   object 
 12  Regional Node Examined  3421 non-null   float64
 13  Reginol Node Positive   4024 non-null   int64  
 14  Survival Months         4024 non-null   

In [3]:
print(df.isnull().sum())

Age                       201
Race                      402
Marital Status            321
T Stage                     0
N Stage                     0
6th Stage                   0
differentiate               0
Grade                       0
A Stage                     0
Tumor Size                402
Estrogen Status           201
Progesterone Status         0
Regional Node Examined    603
Reginol Node Positive       0
Survival Months             0
Status                      0
dtype: int64


In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor


# Separate numeric and categorical columns with missing values
numeric_columns = ['Age', 'Tumor Size', 'Regional Node Examined']
categorical_columns = ['Race', 'Marital Status', 'Estrogen Status']

# MICE for numeric variables
mice_imputer = IterativeImputer(estimator=RandomForestRegressor(), n_nearest_features=None, imputation_order='ascending')
df[numeric_columns] = mice_imputer.fit_transform(df[numeric_columns])

# KNN Imputation for categorical variables
knn_imputer = KNNImputer(n_neighbors=5)
ordinal_encoder = OrdinalEncoder()

# Encode categorical variables
df_encoded = df.copy()
df_encoded[categorical_columns] = ordinal_encoder.fit_transform(df[categorical_columns])

# Perform KNN imputation on encoded categorical variables
df_encoded[categorical_columns] = knn_imputer.fit_transform(df_encoded[categorical_columns])

# Decode back to original categories
df[categorical_columns] = ordinal_encoder.inverse_transform(df_encoded[categorical_columns])

# Verify imputation
print(df.isnull().sum())



Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64


In [5]:
import pandas as pd
import numpy as np
from scipy import stats


# Select numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Function to detect outliers using Z-score method
def detect_outliers_zscore(data, threshold=3):
    z_scores = np.abs(stats.zscore(data))
    return z_scores > threshold

# Detect outliers for each numeric column
outliers_zscore = {}

for column in numeric_columns:
    outliers_zscore[column] = detect_outliers_zscore(df[column])

# Print summary of outliers
print("Outlier Detection Summary (Z-score method):")
for column in numeric_columns:
    num_outliers = outliers_zscore[column].sum()
    total_values = len(df[column])
    outlier_percentage = (num_outliers / total_values) * 100
    print(f"{column}: {num_outliers} outliers detected ({outlier_percentage:.2f}% of values)")


Outlier Detection Summary (Z-score method):
Age: 0 outliers detected (0.00% of values)
Tumor Size: 76 outliers detected (1.89% of values)
Regional Node Examined: 36 outliers detected (0.89% of values)
Reginol Node Positive: 101 outliers detected (2.51% of values)
Survival Months: 4 outliers detected (0.10% of values)


In [6]:
upper_limit = df['Tumor Size'].quantile(0.99)
df['Tumor Size'] = df['Tumor Size'].clip(upper=upper_limit)

In [7]:
upper_limit = df['Reginol Node Positive'].quantile(0.99)
df['Reginol Node Positive'] = df['Reginol Node Positive'].clip(upper=upper_limit)

In [8]:
# Assuming 'Status' is your target variable
target = df['Status']
features = df.drop('Status', axis=1)

# Step 2: Feature Engineering

In [9]:
#Standardization & One Hot Encoding
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Define numeric and categorical columns
numeric_columns = ['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months']
categorical_columns = ['Race', 'Marital Status', 'T Stage', 'N Stage', '6th Stage', 'differentiate', 'Grade', 'A Stage', 'Estrogen Status', 'Progesterone Status']

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns)
    ])

# Fit and transform the data
X = df.drop('Status', axis=1)  # Assuming 'Status' is your target variable
transformed_data = preprocessor.fit_transform(X)

# Get feature names after transformation
numeric_features = numeric_columns
categorical_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns)
feature_names = list(numeric_features) + list(categorical_features)

# Create a new dataframe with transformed data
transformed_df = pd.DataFrame(transformed_data, columns=feature_names)

# Add the target variable back to the dataframe
transformed_df['Status'] = df['Status']

# Display the first few rows of the transformed dataframe
print(transformed_df.head())


        Age  Tumor Size  Regional Node Examined  Reginol Node Positive  \
0  1.596543   -1.329501                1.232485              -0.636232   
1 -0.447627    0.230926               -0.056311               0.179832   
2  0.460893    1.640344               -0.056311               0.587864   
3  0.460893   -0.332763               -1.602868              -0.636232   
4 -0.788323    0.532944               -1.473988              -0.636232   

   Survival Months  Race_Black  Race_Other  Race_White  \
0        -0.492961         0.0         0.0         1.0   
1        -0.405695         0.0         0.0         1.0   
2         0.161530         0.0         0.0         1.0   
3         0.554224         0.0         0.0         1.0   
4        -0.929288         0.0         0.0         1.0   

   Marital Status_Divorced  Marital Status_Married  ...  Grade_1  Grade_2  \
0                      0.0                     1.0  ...      0.0      0.0   
1                      1.0                     0.0  

In [10]:
transformed_df.head()

Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months,Race_Black,Race_Other,Race_White,Marital Status_Divorced,Marital Status_Married,...,Grade_1,Grade_2,Grade_3,A Stage_Distant,A Stage_Regional,Estrogen Status_Negative,Estrogen Status_Positive,Progesterone Status_Negative,Progesterone Status_Positive,Status
0,1.596543,-1.329501,1.232485,-0.636232,-0.492961,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive
1,-0.447627,0.230926,-0.056311,0.179832,-0.405695,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive
2,0.460893,1.640344,-0.056311,0.587864,0.16153,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive
3,0.460893,-0.332763,-1.602868,-0.636232,0.554224,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive
4,-0.788323,0.532944,-1.473988,-0.636232,-0.929288,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive


# Step 3: Dimensionality Reduction

In [11]:
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd


# Separate features and target
X = transformed_df.drop('Status', axis=1)
y = transformed_df['Status']

# Apply PCA
pca = PCA(n_components=0.95)  # Retain 95% of the variance
X_pca = pca.fit_transform(X)

# Create a new dataframe with PCA results
pca_df = pd.DataFrame(data=X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])

# Add the target variable back
pca_df['Status'] = y

# Print the number of components and explained variance ratio
print(f"Number of components: {pca.n_components_}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")



Number of components: 14
Explained variance ratio: [0.20747771 0.12004701 0.10881061 0.102901   0.08304193 0.07394465
 0.06067855 0.04407228 0.03855349 0.0349833  0.02948414 0.02252211
 0.01543837 0.00911165]


# Step 4: Model Implementation and Evaluation

# K-Nearest Neighbors (KNN)
**Summary:** KNN classifies a data point based on the majority class of its k nearest neighbors in the feature space.

**Pros:**
Simple and intuitive
No assumptions about data distribution
Works well for multi-class problems

**Cons:**
Computationally expensive for large datasets
Sensitive to irrelevant features and the scale of the data
Requires feature scaling

**Main hyperparameter:**
k: Number of nearest neighbors to consider

In [12]:
#kNN
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Separate features and target
X = pca_df.drop('Status', axis=1).values
y = pca_df['Status'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2)**2))

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):
        distances = [self.euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = max(set(k_nearest_labels), key=k_nearest_labels.count)
        return most_common

# Function to calculate accuracy
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

# Initialize and train the KNN classifier
knn = KNN(k=5)  # You can adjust k
knn.fit(X_train, y_train)

# Make predictions
knn_y_pred = knn.predict(X_test)

# Calculate accuracy
acc = accuracy(y_test, knn_y_pred)
print(f"Accuracy: {acc:.4f}")

# Optional: Implement cross-validation
def cross_validate(X, y, k_fold=5):
    fold_size = len(X) // k_fold
    accuracies = []

    for i in range(k_fold):
        start = i * fold_size
        end = start + fold_size
        X_test = X[start:end]
        y_test = y[start:end]
        X_train = np.concatenate([X[:start], X[end:]])
        y_train = np.concatenate([y[:start], y[end:]])

        knn = KNN(k=5)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        acc = accuracy(y_test, y_pred)
        accuracies.append(acc)

    return np.mean(accuracies)

# Perform cross-validation
cv_accuracy = cross_validate(X, y)
print(f"Cross-validation accuracy: {cv_accuracy:.4f}")

Accuracy: 0.8969
Cross-validation accuracy: 0.8808


# Naive Bayes
Summary: Naive Bayes is a probabilistic classifier based on Bayes' theorem with an assumption of independence between features.

Pros:
Fast training and prediction,
Works well with high-dimensional data,
Performs well with small datasets

Cons:
Assumes feature independence (often not true in real-world scenarios),
May underperform when features are highly correlated

Main hyperparameters:
Type of Naive Bayes: Gaussian, Multinomial, or Bernoulli,
Smoothing parameter (alpha) for Laplace smoothing

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = pca_df.drop('Status', axis=1)
y = pca_df['Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_y_pred = nb.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, nb_y_pred):.4f}")

Accuracy: 0.8882


# C4.5 Decision Tree
Summary: C4.5 is a decision tree algorithm that uses information gain as the splitting criterion.

Pros:
Easy to interpret and visualize,
Handles both numerical and categorical data,
No need for feature scaling

Cons:
Prone to overfitting, especially with deep trees,
Can be unstable (small changes in data can lead to very different trees)

Main hyperparameters:
Maximum depth of the tree,
Minimum number of samples required to split an internal node,
Minimum number of samples required to be at a leaf node

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt.fit(X_train, y_train)
dt_y_pred = dt.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, dt_y_pred):.4f}")

Accuracy: 0.8509


# Random Forest
Summary: Random Forest is an ensemble of decision trees, where each tree is trained on a random subset of features and data.

Pros:
Generally high accuracy,
Handles high-dimensional data well,
Provides feature importance

Cons:
Less interpretable than a single decision tree,
Can be computationally intensive for large datasets

Main hyperparameters:
Number of trees in the forest,
Maximum depth of the trees,
Minimum number of samples required to split an internal node, Number of features to consider for the best split

In [15]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, rf_y_pred):.4f}")

Accuracy: 0.8957


# Gradient Boosting
Summary: Gradient Boosting builds an ensemble of weak learners (typically decision trees) in a sequential manner, each trying to correct the errors of the previous ones.

Pros:
Often achieves high accuracy,
Handles mixed data types well,
Provides feature importance

Cons:
Can overfit if not properly tuned,
Computationally intensive to train,
Less interpretable than single decision trees

Main hyperparameters:
Number of boosting stages (trees),
Learning rate,
Maximum depth of individual trees,
Minimum number of samples required to split an internal node

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)
gb_y_pred = gb.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, gb_y_pred):.4f}")

Accuracy: 0.8981


# Neural Network
Summary: Neural Networks are a series of interconnected layers of neurons that can learn complex patterns in data.

Pros:
Can learn highly complex patterns,
Works well with large amounts of data,
Can handle high-dimensional data

Cons:
Requires careful tuning of hyperparameters,
Can be computationally intensive to train,
Less interpretable than simpler models

Main hyperparameters:
Number and size of hidden layers,
Activation functions,
Learning rate,
Number of epochs,
Batch size,
Regularization parameters (e.g., dropout rate, L1/L2 regularization)

In [17]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
nn.fit(X_train, y_train)
nn_y_pred = nn.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, nn_y_pred):.4f}")

Accuracy: 0.8820


In [18]:
# Display results for all models
results = pd.DataFrame({
    "Model": ["KNN", "Naïve Bayes", "Decision Tree", "Random Forest", "Gradient Boosting", "Neural Network"],
    "Accuracy": [
        accuracy_score(y_test, knn_y_pred),
        accuracy_score(y_test, nb_y_pred),
        accuracy_score(y_test, dt_y_pred),
        accuracy_score(y_test, rf_y_pred),
        accuracy_score(y_test, gb_y_pred),
        accuracy_score(y_test, nn_y_pred)
    ]
})
print("\nModel Performance Summary:\n", results)



Model Performance Summary:
                Model  Accuracy
0                KNN  0.896894
1        Naïve Bayes  0.888199
2      Decision Tree  0.850932
3      Random Forest  0.895652
4  Gradient Boosting  0.898137
5     Neural Network  0.881988


# Step 3: Hyperparameter Tuning [15]

**Random Forest Hyperparameter Tuning:**

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define the parameter grid
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf,
                              cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search_rf.fit(X_train, y_train)

# Get the best parameters
print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# Use the best model to make predictions
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test)

# Print the performance metrics
print("\nRandom Forest Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for Random Forest: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}

Random Forest Performance:
Accuracy: 0.9031055900621118

Classification Report:
              precision    recall  f1-score   support

       Alive       0.91      0.98      0.95       685
        Dead       0.83      0.44      0.58       120

    accuracy                           0.90       805
   macro avg       0.87      0.71      0.76       805
weighted avg       0.90      0.90      0.89       805



**Gradient Boosting Hyperparameter Tuning:**

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

# Define the parameter grid
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a base model
gb = GradientBoostingClassifier(random_state=42)

# Instantiate the grid search model
grid_search_gb = GridSearchCV(estimator=gb, param_grid=param_grid_gb,
                              cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search_gb.fit(X_train, y_train)

# Get the best parameters
print("Best parameters for Gradient Boosting:", grid_search_gb.best_params_)

# Use the best model to make predictions
y_pred_gb = grid_search_gb.best_estimator_.predict(X_test)

# Print the performance metrics
print("\nGradient Boosting Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}

Gradient Boosting Performance:
Accuracy: 0.9006211180124224

Classification Report:
              precision    recall  f1-score   support

       Alive       0.91      0.98      0.94       685
        Dead       0.77      0.47      0.59       120

    accuracy                           0.90       805
   macro avg       0.84      0.73      0.77       805
weighted avg       0.89      0.90      0.89       805

