## **Brain Cancer Dataset**

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the brain-cancer CSV into a DataFrame and show what was loaded
data = pd.read_csv("DT-BrainCancer.csv")
print(data)

# Report how many missing entries each column has
print(
    "\nChecking NULL values:\n",
    data.isna().sum(),
)

    Unnamed: 0     sex   diagnosis             loc  ki    gtv  status
0            1  Female  Meningioma  Infratentorial  90   6.11       0
1            2    Male   HG glioma  Supratentorial  90  19.35       1
2            3  Female  Meningioma  Infratentorial  70   7.95       0
3            4  Female   LG glioma  Supratentorial  80   7.61       1
4            5    Male   HG glioma  Supratentorial  90   5.06       1
..         ...     ...         ...             ...  ..    ...     ...
83          84    Male   HG glioma  Supratentorial  80   0.16       1
84          85    Male   HG glioma  Supratentorial  80  19.81       1
85          86    Male  Meningioma  Supratentorial  90   2.50       0
86          87    Male  Meningioma  Supratentorial  90   2.02       0
87          88    Male       Other  Infratentorial  80   0.11       0

[88 rows x 7 columns]

Checking NULL values:
 Unnamed: 0    0
sex           0
diagnosis     1
loc           0
ki            0
gtv           0
status        0
d

In [2]:
# One-hot encode every object-typed (string) column.
# drop_first=True drops the first level of each column so it acts as the baseline.
# NOTE(review): the null check at load time reports one missing `diagnosis`;
# with the default dummy_na=False that row gets all-False diagnosis dummies,
# which looks identical to the dropped baseline level - confirm this is intended.
categorical_cols = list(data.select_dtypes(include=['object']).columns)
print("Categorical Columns:", categorical_cols)
print("\n")

data = pd.get_dummies(
    data,
    columns=categorical_cols,
    drop_first=True,
)

print(data)

Categorical Columns: ['sex', 'diagnosis', 'loc']


    Unnamed: 0  ki    gtv  status  sex_Male  diagnosis_LG glioma  \
0            1  90   6.11       0     False                False   
1            2  90  19.35       1      True                False   
2            3  70   7.95       0     False                False   
3            4  80   7.61       1     False                 True   
4            5  90   5.06       1      True                False   
..         ...  ..    ...     ...       ...                  ...   
83          84  80   0.16       1      True                False   
84          85  80  19.81       1      True                False   
85          86  90   2.50       0      True                False   
86          87  90   2.02       0      True                False   
87          88  80   0.11       0      True                False   

    diagnosis_Meningioma  diagnosis_Other  loc_Supratentorial  
0                   True            False               False  
1   

In [3]:
# Target is the `status` column; every remaining column is used as a feature.
# NOTE(review): X therefore still contains the 'Unnamed: 0' row-number column -
# confirm it is meant to be a model input.
X = data.drop(columns='status')
y = data['status']

# Positional (unshuffled) split: rows [0, 61) train, [61, 74) validation, [74, end) test.
# NOTE(review): only X is split in this cell; y is left whole here.
X_train, X_validation, X_test = X.iloc[:61], X.iloc[61:74], X.iloc[74:]

### **Evaluation Metrics**

In [None]:
#Accuracy
def calc_accuracy(y_true, y_pred):
  """Return the fraction of predictions that exactly match their true label.

  Args:
    y_true: Sized iterable of ground-truth labels.
    y_pred: Iterable of predicted labels, paired positionally with y_true.

  Returns:
    Number of matching pairs divided by len(y_true), as a float.
    Returns 0.0 when y_true is empty instead of raising ZeroDivisionError.
  """
  total = len(y_true)
  if total == 0:
    # Nothing to score; an empty evaluation set has no correct predictions.
    return 0.0

  correct = sum(
    1
    for true_label, predicted_label in zip(y_true, y_pred)
    if true_label == predicted_label
  )
  return correct / total

#Confusion Matrix
def confusion_matrix(y_true, y_pred):
  """Count the four binary confusion-matrix cells.

  Labels are paired positionally. A pair is counted only when both the true
  and the predicted label equal 0 or 1; any other pair is skipped.

  Returns:
    Tuple in the order (TN, TP, FN, FP).
  """
  tn = tp = fn = fp = 0

  for actual, guess in zip(y_true, y_pred):
    if actual == 0:
      # Actual negative: a 0 prediction is correct, a 1 prediction is a false alarm.
      if guess == 0:
        tn += 1
      elif guess == 1:
        fp += 1
    elif actual == 1:
      # Actual positive: a 0 prediction is a miss, a 1 prediction is a hit.
      if guess == 0:
        fn += 1
      elif guess == 1:
        tp += 1

  return (tn, tp, fn, fp)

#Average Precision
def avg_precision(y_true, y_pred):
  """Average precision of a score ranking against binary labels.

  Samples are ranked by descending score. At every rank whose true label is 1,
  precision (positives seen so far / rank) is recorded; the result is the sum
  of those precisions divided by the number of positives.

  Args:
    y_true: Iterable of binary labels (1 = positive).
    y_pred: Iterable of scores, paired positionally with y_true.

  Returns:
    The average precision as a float, or 0.0 when sum(y_true) is 0.
  """
  num_positives = sum(y_true)
  if num_positives == 0:
    # No positives to retrieve: defined as 0, and avoids dividing by zero.
    return 0.0

  # sorted() is stable, so samples with equal scores keep their input order.
  ranked = sorted(zip(y_true, y_pred), key=lambda pair: pair[1], reverse=True)

  true_positives = 0
  precision_sum = 0
  for rank, (label, _score) in enumerate(ranked, start=1):
    if label == 1:
      true_positives += 1
      precision_sum += true_positives / rank

  return precision_sum / num_positives


#Average Recall
def avg_recall(y_true, y_pred):
  """Mean recall over every distinct prediction value used as a threshold.

  For each unique value in y_pred (highest first), samples whose prediction is
  >= that value are treated as predicted positive, and recall is computed
  against y_true. The recalls are then averaged.

  Args:
    y_true: Iterable of binary labels (1 = positive).
    y_pred: Iterable of scores or labels, paired positionally with y_true.

  Returns:
    The mean recall as a float. Returns 0.0 when there are no positive labels
    or no predictions, instead of raising ZeroDivisionError.
  """
  total_positives = sum(y_true)  # Total number of positive samples
  thresholds = sorted(set(y_pred), reverse=True)  # Unique sorted thresholds

  if total_positives == 0 or not thresholds:
    # Recall is undefined without positives, and there is nothing to average
    # without thresholds; report 0.0 in both cases.
    return 0.0

  recall_values = []
  for threshold in thresholds:
    true_positives = sum(
      1
      for true, pred in zip(y_true, y_pred)
      if true == 1 and pred >= threshold
    )
    recall_values.append(true_positives / total_positives)

  return sum(recall_values) / len(recall_values)

#Average F1-Score
def avg_f1(y_true, y_pred):
  """Mean F1-score over every distinct prediction value used as a threshold.

  For each unique value in y_pred (highest first), samples whose prediction is
  >= that value are treated as predicted positive; precision, recall and F1
  are computed for that threshold, and the F1 values are averaged.

  Args:
    y_true: Iterable of binary labels (1 = positive).
    y_pred: Iterable of scores or labels, paired positionally with y_true.

  Returns:
    The mean F1-score as a float. Returns 0.0 when there are no positive
    labels or no predictions.
  """
  total_positives = sum(y_true)  # Total number of positive samples
  thresholds = sorted(set(y_pred), reverse=True)  # Unique sorted thresholds

  if total_positives == 0 or not thresholds:
    # With no positives every F1 is 0 (recall undefined, no true positives);
    # with no thresholds there is nothing to average.
    return 0.0

  f1_scores = []
  for threshold in thresholds:
    # Everything below must run once per threshold so each one contributes
    # its own F1 to the average.
    y_pred_thresholded = [1 if pred >= threshold else 0 for pred in y_pred]

    # Calculate precision and recall
    true_positives = sum(
      1
      for true, pred in zip(y_true, y_pred_thresholded)
      if true == 1 and pred == 1
    )
    predicted_positives = sum(y_pred_thresholded)

    if predicted_positives == 0:
      precision = 0.0
    else:
      precision = true_positives / predicted_positives

    recall = true_positives / total_positives

    # Calculate F1-score
    if precision + recall == 0:
      f1 = 0.0
    else:
      f1 = 2 * (precision * recall) / (precision + recall)

    f1_scores.append(f1)

  return sum(f1_scores) / len(f1_scores)