In [3]:
import pandas as pd
import numpy as np
import sklearn.model_selection as sms
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [5]:
# Read the data from the CSV file
file_path = r'data.csv'  
raw_dataset = pd.read_csv(file_path)
# Print the data types of each column in the dataset; all should be of float type
print (raw_dataset.dtypes)
# Check for missing values in each column
print(raw_dataset.isnull().sum())

id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst     

In [6]:
# drop noninformative column (null column and id column)
raw_dataset = raw_dataset.drop(['Unnamed: 32','id'], axis=1) #this column is NaN 

# Print the entire dataset, including all rows and columns
raw_dataset

# Converting qualitative labels to numerical ones 
def label_to_numeric(column):
    if column.dtype == 'object':
        unique_labels, _ = pd.factorize(column)
        return pd.Series(unique_labels, index=column.index)
    
    return column

dataset = raw_dataset.apply(label_to_numeric)
# Understanding which number is attributed to which label
label_map = dict(zip(raw_dataset['diagnosis'], dataset['diagnosis']))
label_map


{'M': 0, 'B': 1}

In [12]:
features = list(set(dataset.columns) - {'diagnosis'}) # Feature label extraction
label = dataset['diagnosis']
data = dataset[features]

In [13]:
data_trainset, data_testset, label_trainset, label_testset = sms.train_test_split(data, label, test_size=0.2)

#Training Guassian naive baise
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(data_trainset, label_trainset)
prediction = naive_bayes_model.predict(data_testset)


In [43]:
#parameters
accuracy = accuracy_score(label_testset, prediction)
print("Accuracy:" ,accuracy, '\n\n')

Comparison = pd.concat([label_testset.reset_index(drop=True) ,pd.Series(prediction)], axis=1)
Comparison.rename(columns={0: 'classifier'}, inplace=True)
Comparison

# postitive = 'M' , negative = 'B'
M_TP = ((Comparison['diagnosis'] == 0) & (Comparison['classifier'] == 0)).sum()
M_FN = ((Comparison['diagnosis'] == 0) & (Comparison['classifier'] == 1)).sum()
M_TN = ((Comparison['diagnosis'] == 1) & (Comparison['classifier'] == 1)).sum()
M_FP = ((Comparison['diagnosis'] == 1) & (Comparison['classifier'] == 0)).sum()

M_confusion_matarix = confusion_matrix(label_testset, prediction)

M_confusion_matarix = pd.DataFrame(M_confusion_matarix, columns=['Actual Positive', 'Actual Negative'], index=['Predicted Positive', 'Predicted Negative'])

# postitive = 'B' , negative = 'M'
B_TP = ((Comparison['diagnosis'] == 1) & (Comparison['classifier'] == 1)).sum()
B_FN = ((Comparison['diagnosis'] == 1) & (Comparison['classifier'] == 0)).sum()
B_TN = ((Comparison['diagnosis'] == 0) & (Comparison['classifier'] == 0)).sum()
B_FP = ((Comparison['diagnosis'] == 0) & (Comparison['classifier'] == 1)).sum()

B_confusion_matarix = {
    'Actual Positive': [B_TP, B_FN],
    'Actual Negative': [B_FP, B_TN],
}


B_confusion_matarix = pd.DataFrame(B_confusion_matarix, columns=['Actual Positive', 'Actual Negative'], index=['Predicted Positive', 'Predicted Negative'])

print('Category M\n' ,M_confusion_matarix,'\n\n','Category B\n', B_confusion_matarix, '\n\n')

M_Precision = M_TP/(M_TP+M_FP)
M_Recall = M_TP/(M_TP+M_FN)

print('Category M' ,'Precision =', M_Precision, 'Recall =', M_Recall)

B_Precision = B_TP/(B_TP+B_FP)
B_Recall = B_TP/(B_TP+B_FN)

print('Category B' ,'Precision =', B_Precision, 'Recall =', B_Recall)


Accuracy: 0.9298245614035088 


Category M
                     Actual Positive  Actual Negative
Predicted Positive               36                5
Predicted Negative                3               70 

 Category B
                     Actual Positive  Actual Negative
Predicted Positive               70                5
Predicted Negative                3               36 


Category M Precision = 0.9230769230769231 Recall = 0.8780487804878049
Category B Precision = 0.9333333333333333 Recall = 0.958904109589041
