### Required Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc,confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

### To define the Headings of the Columns

In [2]:

Cols = ['ID', 'Target', 'Time to recurrence'] + \
        [f'x{i}' for i in range(30)] + ['dia', 'lym']
Cols_ = ', '.join(Cols)
Cols_, len(Cols)

('ID, Target, Time to recurrence, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, dia, lym',
 35)

### Converting the data file into csv for processing

In [3]:
with open('wpbc.data') as myfile:
    with open('xyz.csv', 'w') as f:
        f.write(Cols_)
        f.write('\n')
        for line in myfile.readlines():
            f.write(line)

FileNotFoundError: [Errno 2] No such file or directory: 'wpbc.data'

### reading the Data using the Pandas library

In [None]:
Data = pd.read_csv('xyz.csv')
print(Data.shape)
Data

## (a)(b)
Seperating first 130 Non-recurrent and first 37 Recurrent from the data to form Train Dataset

The Entry No #197 is also added to the Train set as required.

In [None]:
D1 = Data[Data[' Target'] == 'N']
D2 = Data[Data[' Target'] == 'R']
Train = pd.concat([D1[:130], D2[:37]]).append(Data.iloc[196])
Test = pd.concat([D1[130:], D2[37:]]).drop(196)
print(D1.shape, D2.shape, Train.shape, Test.shape)
Train.head(2)

## (c):

Imputing the datapoints where the data is not available or indicated as '?'

In [None]:
print('Before Imputation', (Train[' lym'] == '?').sum())
for col in Data.columns:
    impute = np.median(Train[col] != '?')
    Train[col] = Train[col].replace(['?'], impute)
    Test[col] = Test[col].replace(['?'], impute)
print('After Imputation', (Train[' lym'] == '?').sum())

Train.loc[Train[' lym']==impute]

## Seperating  X (Datapoints) and Y (Targets)

In [None]:
Y_train = Train[' Target']
Y_test = Test[' Target']
X_Train = Train.drop(['ID', ' Target', ' Time to recurrence'], axis=1)
X_Test = Test.drop(['ID', ' Target', ' Time to recurrence'], axis=1)
X_Train.shape, X_Test.shape

### Using Lable encoding to convert the probelm into binary {'0','1'} Classification. 

In [None]:
le = LabelEncoder()
le.fit(Y_train)
Y_Train = le.transform(Y_train)
Y_Test = le.transform(Y_test)

## (d)(i):
Gaussian Naive bayes with weighted probabilities as required in the Question.

In [None]:
p = Y_Train.sum()/len(Y_Train)
print(f'The value of p is {p} as the dataset is unbalanced')
clf = GaussianNB(priors=[1-p, p])
clf.fit(X_Train, Y_Train)

Y_train_pred = clf.predict_proba(X_Train)[:,1]
Y_test_pred = clf.predict_proba(X_Test)[:,1]

tr_fpr, tr_tpr, tr_thr = roc_curve(Y_Train, Y_train_pred)
tst_fpr, tst_tpr, te_thr = roc_curve(Y_Test, Y_test_pred)

#### The 'roc_curve' of sklearn gives the fpr, tpr and thresold values for the given Ground truth and predicted values of Target

and 'auc' function gives area under curve


In [None]:
plt.plot(tr_fpr, tr_tpr, label="Train AUC ="+str(auc(tr_fpr, tr_tpr)))
plt.plot(tst_fpr, tst_tpr, label="Test AUC ="+str(auc(tst_fpr, tst_tpr)))
plt.grid() ; plt.legend()
plt.xlabel("fpr") ;plt.ylabel("tpr") ;plt.title("ROC Curve")
plt.show()

### The 'confusion_matrix' of sklearn gives the confusion_matrix for the given ground truth and predictions

In [None]:
def predict_(proba, threshould):
    predictions = []
    for i in proba:
        if i>=threshould: predictions.append(1)
        else:             predictions.append(0)
    return predictions

print("Train confusion matrix")
print(confusion_matrix(Y_Train, predict_(Y_train_pred, 0.5)))
print("-"*100)
print("Test confusion matrix")
print(confusion_matrix(Y_Test, predict_(Y_test_pred, 0.5)))

### The 'precision_recall_fscore_support' of Sklearn gives the values as the name indicates. 

In [None]:
Tr = precision_recall_fscore_support(Y_Train, predict_(Y_train_pred, 0.5))
Te = precision_recall_fscore_support(Y_Test, predict_(Y_test_pred, 0.5))

print("for Train dataset")
print("-"*60)
print('Precisions for CLasses "N" and "R" are : ', Tr[0])
print('Recalls for CLasses "N" and "R" are : ', Tr[1])
print('Fscores for CLasses "N" and "R" are : ', Tr[2])

print("="*75)
print("for Test dataset")
print("-"*60)
print('Precisions for CLasses "N" and "R" are : ', Te[0])
print('Recalls for CLasses "N" and "R" are : ', Te[1])
print('Fscores for CLasses "N" and "R" are : ', Te[2])

# (d)(ii) Balancing the Dataset

In [None]:
#!pip install -U imbalanced-learn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
oversample = SMOTE(sampling_strategy={'R': 90}) #the default k value for KNN is 5
X_Train, Y_train = oversample.fit_resample(X_Train, Y_train)

undersample = RandomUnderSampler(sampling_strategy={'N': 90})
X_Train, Y_train = undersample.fit_resample(X_Train, Y_train)

X_Train.shape, Y_train.shape

In [None]:
le = LabelEncoder()
le.fit(Y_train)
Y_Train = le.transform(Y_train)
Y_Test = le.transform(Y_test)

In [None]:
p = Y_Train.sum()/len(Y_Train)
print(f'The value of p is {p} as the dataset is balanced')
clf = GaussianNB(priors=[1-p, p])
clf.fit(X_Train, Y_Train)

Y_train_pred = clf.predict_proba(X_Train)[:,1]
Y_test_pred = clf.predict_proba(X_Test)[:,1]

tr_fpr, tr_tpr, tr_thr = roc_curve(Y_Train, Y_train_pred)
tst_fpr, tst_tpr, te_thr = roc_curve(Y_Test, Y_test_pred)

In [None]:
plt.plot(tr_fpr, tr_tpr, label="Train AUC ="+str(auc(tr_fpr, tr_tpr)))
plt.plot(tst_fpr, tst_tpr, label="Test AUC ="+str(auc(tst_fpr, tst_tpr)))
plt.grid() ; plt.legend()
plt.xlabel("fpr") ;plt.ylabel("tpr") ;plt.title("ROC Curve")
plt.show()

In [None]:
def predict_(proba, threshould):
    predictions = []
    for i in proba:
        if i>=threshould: predictions.append(1)
        else:             predictions.append(0)
    return predictions

print("Train confusion matrix")
print(confusion_matrix(Y_Train, predict_(Y_train_pred, 0.5)))
print("-"*100)
print("Test confusion matrix")
print(confusion_matrix(Y_Test, predict_(Y_test_pred, 0.5)))

In [None]:
Tr = precision_recall_fscore_support(Y_Train, predict_(Y_train_pred, 0.5))
Te = precision_recall_fscore_support(Y_Test, predict_(Y_test_pred, 0.5))

print("for Train dataset")
print("-"*60)
print('Precisions for CLasses "N" and "R" are : ', Tr[0])
print('Recalls for CLasses "N" and "R" are : ', Tr[1])
print('Fscores for CLasses "N" and "R" are : ', Tr[2])

print("="*75)
print("for Test dataset")
print("-"*60)
print('Precisions for CLasses "N" and "R" are : ', Te[0])
print('Recalls for CLasses "N" and "R" are : ', Te[1])
print('Fscores for CLasses "N" and "R" are : ', Te[2])

# Observation



*   The Test-AUC score improved from 0.555 to 0.603
*   The Test- Precision  improved from [0.7 0.3] to [0.72222222 0.33333333]
* The Test- Recall  changed from [0.66666667 0.33333333] to [0.61904762 0.44444444]
* The Test- FScores  chenged from [0.68292683 0.31578947] to [0.66666667 0.38095238]


as observed, there were some minor improvements after using SMOTE