This notebook compares the accuracy of a Decision Tree classifier on the breast cancer dataset without any NaN values and with NaN values injected into some of the features.

Author : Sangeetha Vijayam
Date : 14-Feb-2025

In [4]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Load the breast cancer dataset as one DataFrame (features plus 'target')
brst_cancer = load_breast_cancer(as_frame=True)
df = brst_cancer.frame

# Separate the target from the independent variables
y = df['target']  # 0 is Malignant, 1 is Benign
X = df.drop(columns=['target'])  # independent variables

# Split the data into train and test sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Create and fit the Decision Tree Classifier in one step (fit returns the estimator)
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Model evaluation: overall accuracy, confusion matrix and per-class report
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", cr)

Accuracy: 0.9473684210526315

Confusion Matrix:
 [[40  3]
 [ 3 68]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [5]:
# DataFrame.sample() without random_state draws from NumPy's global RNG,
# so seeding it here makes the choice of rows reproducible.
np.random.seed(42)

# Select 100 rows at random and set every 'mean *' feature to NaN in those rows.
# NOTE: X is modified in place, so the NaN-free feature matrix is no longer
# available after this cell has run.
random_indices = X.sample(n=100).index

mean_columns = [
    'mean radius',
    'mean texture',
    'mean perimeter',
    'mean area',
    'mean smoothness',
    'mean compactness',
    'mean concavity',
    'mean concave points',
    'mean symmetry',
    'mean fractal dimension',
]
X.loc[random_indices, mean_columns] = np.nan

# Check the number of NaN's that are actually present in the column.
# (Taking len() of the selected rows would report 100 even if no value
# had been replaced, so count the missing entries instead.)
print(X['mean radius'].isna().sum())

100


In [6]:
# Repeat the same split / fit / predict / evaluate steps, this time on the
# independent variables (X) that now contain NaN values; the target y is unchanged.
# NOTE(review): fitting a DecisionTreeClassifier on NaN inputs presumably needs a
# recent scikit-learn release (native missing-value support) - confirm the version.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Same three metrics as the baseline run, stored under *_upd names
accuracy_upd = accuracy_score(y_test, y_pred)
cm_upd = confusion_matrix(y_test, y_pred)
cr_upd = classification_report(y_test, y_pred)

print("Updated Accuracy:", accuracy_upd)
print("\nUpdated Confusion Matrix:\n", cm_upd)
print("\nUpdated Classification Report:\n", cr_upd)


Updated Accuracy: 0.6929824561403509

Updated Confusion Matrix:
 [[11 32]
 [ 3 68]]

Updated Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.26      0.39        43
           1       0.68      0.96      0.80        71

    accuracy                           0.69       114
   macro avg       0.73      0.61      0.59       114
weighted avg       0.72      0.69      0.64       114



Conclusion:

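Accuracy drops by around 25 percentage points (from about 0.95 to about 0.69) after setting the ten 'mean' feature columns to NaN in 100 randomly selected rows. Most of the loss comes from class 0 (malignant), whose recall falls from 0.93 to 0.26.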