In [None]:
import math
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Location of the wdbc.csv data file. Set the WDBC_CSV environment variable to
# point at your own copy; the original hard-coded path is kept as the fallback
# so existing setups keep working unchanged.
file_raw_data = os.environ.get(
    "WDBC_CSV",
    "/home/ralampay/workspace/pattern-recognition-course/data/wdbc.csv",
)

# Data Frame: header=None means the first line is read as data and the columns
# are labelled with integers starting at 0.
raw_data = pd.read_csv(file_raw_data, header=None)

raw_data

In [None]:
# Feature matrix: keep the columns at positions 2 through 31 of the raw frame.
feature_slice = slice(2, 32)
x = raw_data.iloc[:, feature_slice]

x

In [None]:
# Number of feature columns selected into `x`.
num_features = x.shape[1]

print("Number of Features: {}".format(num_features))

In [None]:
# Rename the feature columns to x0, x1, ..., one name per feature.
columns = ["x{}".format(position) for position in range(num_features)]

x.columns = columns

x

In [None]:
# Z-score: centre every column on its mean, then divide by its standard
# deviation (both computed per column over all rows).
x_mean = x.mean()
x_std = x.std()
x_standardized = x.sub(x_mean).div(x_std)

x_standardized

In [None]:
# Min-max scaling: map every column onto the range 0 - 1 using that column's
# own minimum and maximum.
x_min = x.min()
x_range = x.max() - x_min
x_normalized = (x - x_min) / x_range

x_normalized

In [None]:
# Encode the label in column 1 as an integer class: 'B' -> 0, 'M' -> 1.
# An explicit mapping replaces the chained .replace() calls, whose implicit
# object -> int downcast is deprecated in recent pandas releases; without the
# downcast `y` would keep object dtype, which scikit-learn classifiers reject
# as a label type.
label_codes = raw_data[1].map({'B': 0, 'M': 1})

# Any value other than 'B' or 'M' maps to NaN; fail loudly rather than train on it.
assert label_codes.notna().all(), "unexpected label in column 1 (expected 'B' or 'M')"

y = label_codes.astype('int64').values

y

In [None]:
# Class balance: count the rows carrying each label in column 1.
label_column = raw_data.iloc[:, 1]
num_benign = int((label_column == 'B').sum())
num_malignant = int((label_column == 'M').sum())

print("num_benign: {}".format(num_benign))
print("num_malignant: {}".format(num_malignant))

In [None]:
# Working frame: the min-max scaled features with the class label appended as
# the last column 'y'. assign() returns a new frame, so x_normalized itself is
# left unchanged.
df = x_normalized.assign(y=y)

df

In [None]:
def partition_dataset(df, num_a=20, num_b=20, val_a=1, val_b=0, random_state=None):
    """Split ``df`` into a training frame and a class-balanced validation frame.

    The class label is read from the last column of ``df``. ``num_a`` rows whose
    label equals ``val_a`` and ``num_b`` rows whose label equals ``val_b`` are
    sampled at random to form the validation frame; every remaining row goes to
    the training frame.

    The input frame is never modified, so the call can be repeated on the same
    frame (e.g. when a notebook cell is re-run) and always splits the full data.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; the last column holds the class label.
    num_a, num_b : int
        Number of validation rows to draw from class ``val_a`` / ``val_b``.
    val_a, val_b : scalar
        Label values identifying the two classes.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for both draws. Leave as ``None`` for
        a different split on every call, or pass a seed for a repeatable one.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(training, validation)``.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when more rows are requested than
        the class contains.
    """
    labels = df.iloc[:, -1]
    df_a = df[labels == val_a].sample(num_a, random_state=random_state)
    df_b = df[labels == val_b].sample(num_b, random_state=random_state)

    df_validation = pd.concat([df_a, df_b])

    # drop() without inplace returns a new frame, leaving the caller's data intact.
    df_training = df.drop(df_validation.index)

    return df_training, df_validation

# Hold out 20 rows of each class for validation; the rest is used for training.
# A copy is passed so that `df` keeps the full dataset: the cell can then be
# re-run without the hold-out being carved from an already reduced frame.
training, validation = partition_dataset(df.copy(), num_a=20, num_b=20)

In [None]:
# Display the training partition returned by partition_dataset.
training

In [None]:
# Display the validation partition (the sampled hold-out rows).
validation

In [None]:
# Split each partition into a feature matrix (every column except the last)
# and a label vector (the 'y' column), both as NumPy arrays.
x_training, y_training = training.iloc[:, :-1].values, training['y'].values
x_validation, y_validation = validation.iloc[:, :-1].values, validation['y'].values

y_validation

In [None]:
# Random forest with 15 trees. random_state is pinned, matching the
# LogisticRegression model in this notebook, so the forest itself does not add
# run-to-run variation for a given training set.
model = RandomForestClassifier(n_estimators=15, random_state=0)

model.fit(x_training, y_training)

predictions = model.predict(x_validation)

predictions

In [None]:
# scikit-learn lays the confusion matrix out with rows = true class and
# columns = predicted class, ordered by label. With benign = 0 and
# malignant = 1 (the positive class), ravel() yields tn, fp, fn, tp.
# labels=[0, 1] keeps the matrix 2x2 even if one class is absent.
cm = confusion_matrix(y_validation, predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Matthews correlation coefficient; reported as 0 when a marginal total is
# zero (e.g. the model predicted a single class) instead of dividing by zero.
mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = ((tn * tp) - (fn * fp)) / mcc_denominator if mcc_denominator else 0.0

# F1 score of the positive (malignant) class.
f1_denominator = tp + (0.5 * (fp + fn))
f1 = tp / f1_denominator if f1_denominator else 0.0

print(classification_report(y_validation, predictions))
print("MCC: {}".format(mcc))
print("F1: {}".format(f1))

In [None]:
# k-nearest-neighbours classifier using 10 neighbours.
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
model = KNeighborsClassifier(n_neighbors=10).fit(x_training, y_training)
predictions = model.predict(x_validation)

predictions

In [None]:
# scikit-learn lays the confusion matrix out with rows = true class and
# columns = predicted class, ordered by label. With benign = 0 and
# malignant = 1 (the positive class), ravel() yields tn, fp, fn, tp.
# labels=[0, 1] keeps the matrix 2x2 even if one class is absent.
cm = confusion_matrix(y_validation, predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Matthews correlation coefficient; reported as 0 when a marginal total is
# zero (e.g. the model predicted a single class) instead of dividing by zero.
mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = ((tn * tp) - (fn * fp)) / mcc_denominator if mcc_denominator else 0.0

# F1 score of the positive (malignant) class.
f1_denominator = tp + (0.5 * (fp + fn))
f1 = tp / f1_denominator if f1_denominator else 0.0

print(classification_report(y_validation, predictions))
print("MCC: {}".format(mcc))
print("F1: {}".format(f1))

In [None]:
# Gaussian Naive Bayes classifier.
# https://scikit-learn.org/stable/modules/naive_bayes.html
model = GaussianNB().fit(x_training, y_training)
predictions = model.predict(x_validation)

In [None]:
# scikit-learn lays the confusion matrix out with rows = true class and
# columns = predicted class, ordered by label. With benign = 0 and
# malignant = 1 (the positive class), ravel() yields tn, fp, fn, tp.
# labels=[0, 1] keeps the matrix 2x2 even if one class is absent.
cm = confusion_matrix(y_validation, predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Matthews correlation coefficient; reported as 0 when a marginal total is
# zero (e.g. the model predicted a single class) instead of dividing by zero.
mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = ((tn * tp) - (fn * fp)) / mcc_denominator if mcc_denominator else 0.0

# F1 score of the positive (malignant) class.
f1_denominator = tp + (0.5 * (fp + fn))
f1 = tp / f1_denominator if f1_denominator else 0.0

print(classification_report(y_validation, predictions))
print("MCC: {}".format(mcc))
print("F1: {}".format(f1))

In [None]:
# Logistic regression classifier with a fixed random_state.
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
model = LogisticRegression(random_state=0).fit(x_training, y_training)
predictions = model.predict(x_validation)

predictions

In [None]:
# scikit-learn lays the confusion matrix out with rows = true class and
# columns = predicted class, ordered by label. With benign = 0 and
# malignant = 1 (the positive class), ravel() yields tn, fp, fn, tp.
# labels=[0, 1] keeps the matrix 2x2 even if one class is absent.
cm = confusion_matrix(y_validation, predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Matthews correlation coefficient; reported as 0 when a marginal total is
# zero (e.g. the model predicted a single class) instead of dividing by zero.
mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = ((tn * tp) - (fn * fp)) / mcc_denominator if mcc_denominator else 0.0

# F1 score of the positive (malignant) class.
f1_denominator = tp + (0.5 * (fp + fn))
f1 = tp / f1_denominator if f1_denominator else 0.0

print(classification_report(y_validation, predictions))
print("MCC: {}".format(mcc))
print("F1: {}".format(f1))

In [None]:
# Gaussian Naive Bayes classifier (same model type as the earlier Naive Bayes
# cell); this time the predictions are also displayed.
# https://scikit-learn.org/stable/modules/naive_bayes.html
model = GaussianNB().fit(x_training, y_training)
predictions = model.predict(x_validation)

predictions

In [None]:
# scikit-learn lays the confusion matrix out with rows = true class and
# columns = predicted class, ordered by label. With benign = 0 and
# malignant = 1 (the positive class), ravel() yields tn, fp, fn, tp.
# labels=[0, 1] keeps the matrix 2x2 even if one class is absent.
cm = confusion_matrix(y_validation, predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Matthews correlation coefficient; reported as 0 when a marginal total is
# zero (e.g. the model predicted a single class) instead of dividing by zero.
mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = ((tn * tp) - (fn * fp)) / mcc_denominator if mcc_denominator else 0.0

# F1 score of the positive (malignant) class.
f1_denominator = tp + (0.5 * (fp + fn))
f1 = tp / f1_denominator if f1_denominator else 0.0

print(classification_report(y_validation, predictions))
print("MCC: {}".format(mcc))
print("F1: {}".format(f1))