In [2]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist, squareform
import pandas as pd
import pyod

In [3]:
# Load the dataset from a CSV file at a path relative to the working directory.
df = pd.read_csv("data/bcw_data.csv")

In [4]:
# List the column names to see which fields the file provides.
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [5]:
# Number of missing values in each column.
df.isna().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

In [6]:
# Class balance of the raw (string) target column.
df["diagnosis"].value_counts()
# Integer encoding applied by encode_target further down:
# B -> 0
# M -> 1

B    357
M    212
Name: diagnosis, dtype: int64

In [7]:
# Column dtypes: shows which columns are numeric and which hold Python objects.
df.dtypes

id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst     

In [8]:
# Remove the "Unnamed: 32" column so it is not picked up as a feature by the
# positional column slice used later. Reassigning (instead of inplace=True)
# together with errors="ignore" keeps this cell safe to re-run: a second
# execution is a no-op rather than a KeyError.
df = df.drop(columns=["Unnamed: 32"], errors="ignore")

In [9]:
def encode_target(x):
    """Map a diagnosis label to an integer class.

    Returns 1 when ``x`` equals "M" and 0 for every other value.
    """
    return 1 if x == "M" else 0

In [10]:
# Encode the target column as integers using encode_target ("M" -> 1, else 0).
# Values that are already 0/1 are passed through unchanged so that re-running
# this cell does not collapse every label to 0 (encode_target(1) returns 0).
df["diagnosis"] = df["diagnosis"].map(
    lambda label: label if label in (0, 1) else encode_target(label)
)

In [11]:
# Partition the rows by encoded diagnosis: 0 goes to b_df, 1 goes to m_df.
b_df = df.loc[df["diagnosis"].eq(0)]
m_df = df.loc[df["diagnosis"].eq(1)]

In [12]:
# Build the evaluation set: every diagnosis == 0 row plus the first 6
# diagnosis == 1 rows (the planted outliers), shuffled with a fixed seed and
# re-indexed from 0.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
out_df = (
    pd.concat([b_df, m_df[:6]])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [13]:
# Peek at the first three rows of the shuffled evaluation frame.
out_df.head(3)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,89382601,0,14.61,15.69,92.68,664.9,0.07618,0.03515,0.01447,0.01877,...,16.46,21.75,103.7,840.8,0.1011,0.07087,0.04746,0.05813,0.253,0.05695
1,862261,0,9.787,19.94,62.11,294.5,0.1024,0.05301,0.006829,0.007937,...,10.92,26.29,68.81,366.1,0.1316,0.09473,0.02049,0.02381,0.1934,0.08988
2,859196,0,9.173,13.86,59.2,260.9,0.07721,0.08751,0.05988,0.0218,...,10.01,19.23,65.59,310.1,0.09836,0.1678,0.1397,0.05087,0.3282,0.0849


In [14]:
# Class counts after subsampling: shows how many diagnosis == 1 rows were kept.
out_df.diagnosis.value_counts()

0    357
1      6
Name: diagnosis, dtype: int64

In [15]:
'''
Total data = 363
Outlier placed = 6
1.65% of outliers are now placed
'''

'\nTotal data = 363\nOutlier placed = 6\n1.65% of outliers are now placed\n'

In [16]:
# Import the RBOD detector class from the rbda module.
from rbda import RBOD
# Feature matrix (not a similarity matrix): every column from position 2
# onward, i.e. skipping the leading id and diagnosis columns.
X_train = out_df.iloc[:,2:].to_numpy()

In [17]:
# Pairwise Euclidean distance matrix between all rows of X_train.
# pdist computes the condensed upper triangle in compiled code and squareform
# expands it to the full symmetric (n, n) matrix with zeros on the diagonal,
# replacing an O(n^2) Python-level double loop over np.linalg.norm.
C = squareform(pdist(X_train, metric="euclidean"))

# Wrap the matrix in a DataFrame and put the record ids in the first column.
# out_df has a fresh 0..n-1 index, so the ids align with the matrix rows.
C_df = pd.DataFrame(C)
C_df.insert(0, "id", out_df["id"])

In [18]:
# (rows, feature columns) of the feature matrix before the id column is prepended.
X_train.shape

(363, 30)

In [19]:
#outlier_output_file = open("rbod_eval_outliers.csv", "w+")
#outlier_output_file.write("k,precision_n,roc_auc,algorithm\n")
# Column vector of record ids, shape (n_rows, 1), so it can be stacked beside
# the feature columns.
ids = out_df["id"].to_numpy().reshape(-1, 1)
# Rebuild the features from out_df instead of from X_train itself, so that
# re-running this cell does not prepend a second id column.
X_train = np.hstack((ids, out_df.iloc[:, 2:].to_numpy()))
# Ground-truth labels (0 / 1) in the same row order as X_train.
y_train = out_df["diagnosis"].to_numpy()

In [20]:
# Imports hoisted out of the loop body so they execute once per cell run.
# evaluate_print is not used in this cell; it is kept in case other cells rely on it.
from pyod.utils import evaluate_print
from sklearn.metrics import roc_auc_score

# Sweep odd neighbourhood sizes k = 3, 5, ..., 59 and, for each k, print the
# thresholded outlier labels and the ROC AUC of the RBOD scores against y_train.
k_range = list(range(3, 60, 2))
z_val = 2.5  # cut-off handed to RBOD and reused below to binarise the scores
for k in k_range:
    print("Value of k ", k)
    # Build the detector once per k, with the same z_val that is used for
    # thresholding below.
    rbod = RBOD(C_df, kneighbors=k, z_val=z_val)
    combination_dict = {}
    combination_dict["outliers"] = rbod.detect(X_train)
    # Element [1] of every entry returned by detect() is taken as its score.
    # NOTE(review): presumably entries are (id, score) pairs returned in the
    # same row order as X_train / y_train -- confirm against rbda.
    rbod_decision_scores = np.asarray([val[1] for val in combination_dict["outliers"]])
    #threshold = np.percentile(rbod_decision_scores, 100 * (1 - contamination))
    threshold = z_val
    rbod_labels = (rbod_decision_scores > threshold).astype('int')
    print("Classifier RBDA Outlier labels are - {}".format(rbod_labels))
    # Reuse the score array instead of rebuilding the same list a second time.
    roc_rbod = np.round(roc_auc_score(y_train, rbod_decision_scores), decimals=4)
    print("AUC Score for k-{},{}".format(k, roc_rbod))
    #outlier_output_file.write("".join(str(k) + "," + str(prn_rbod) + "," + str(roc_rbod) + "," + "RBOD" + "\n"))

 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
AUC Score for k-11,0.9991
Value of k  13
Classifier RBDA Outlier labels are - [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [None]:
#Finally close the file
#outlier_output_file.close()