In [None]:
## Read the data
import numpy as np #linear algebra
import pandas as pd #data processing

import matplotlib.pyplot as plt #data visualization
import seaborn as sns #data visualization

import warnings
warnings.filterwarnings("ignore") #to ignore the warnings

#for model building
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
# Load the breast-cancer-wisconsin CSV from the Kaggle input directory.
dataset = pd.read_csv(
    filepath_or_buffer='/kaggle/input/breast-cancer-wisconsin-data/data.csv'
)
# Show the first two rows as a quick check that the load worked.
dataset.head(2)

In [None]:
# Drop the unwanted 'Unnamed: 32' column. errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError because the
# column is already gone.
dataset = dataset.drop(columns=['Unnamed: 32'], errors='ignore')

# Encode the target labels as integers ('M' -> 1, 'B' -> 0).
# replace() leaves already-encoded values untouched (so a re-run is safe),
# and astype(int) pins an integer dtype explicitly instead of relying on
# replace() to silently downcast the object column (deprecated in pandas 2.2+).
dataset['diagnosis'] = (
    dataset['diagnosis'].replace({'M': 1, 'B': 0}).astype(int)
)

# Find the correlation between the features. The matrix is computed once and
# reused both for the heatmap here and for the feature filter further down.
corr = dataset.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr, cmap='YlGnBu', annot=True)

In [None]:
# List the features whose absolute correlation with `diagnosis` is above 0.59
# (the `diagnosis` row itself is part of the result).
strong_mask = corr['diagnosis'].abs() > 0.59
corr.loc[strong_mask].index

# The Local Outlier Factor

The Local Outlier Factor (LOF) algorithm is an unsupervised anomaly detection method which computes the local density deviation of a given data point with respect to its neighbors. It considers as outliers the samples that have a substantially lower density than their neighbors.

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Split the data into target y and features X before running the
# Local Outlier Factor.
y = dataset["diagnosis"]
# NOTE(review): the Kaggle CSV is expected to carry an 'id' column that is a
# record identifier rather than a measurement -- confirm against the data.
# It is excluded here so it neither enters the neighbour-distance computation
# LOF is based on nor is fed to the classifiers trained later.
# errors="ignore" makes this a no-op if the column is absent.
X = dataset.drop(columns=["diagnosis", "id"], errors="ignore")
columns = dataset.columns.tolist()

lof = LocalOutlierFactor()
y_pred = lof.fit_predict(X)  # -1 marks outliers, 1 marks inliers

# negative_outlier_factor_ is the negated LOF score: values near -1 are
# inliers, more negative values are stronger outliers.
x_score = lof.negative_outlier_factor_
# Index the scores like X so the labels collected below always refer to the
# right rows, even if `dataset` does not have a default 0..n-1 index.
outlier_score = pd.DataFrame({"score": x_score}, index=X.index)

# Rows scoring below the threshold are treated as outliers and removed from
# both the features and the target.
lofthreshold = -2.5
loffilter = outlier_score["score"] < lofthreshold
outlier_index = outlier_score[loffilter].index.tolist()
X = X.drop(outlier_index)
y = y.drop(outlier_index).values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NeighborhoodComponentsAnalysis

# Hold out 30% of the samples for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training split only, then apply the same (already
# fitted) transformation to the test split -- never refit on X_test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Candidate classifiers, keyed by class name, in the order they are evaluated.
models = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=2, weights='uniform'),
    'SVC': SVC(kernel="rbf", random_state=15),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=10),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=60, random_state=0),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=20),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBClassifier': xgb.XGBClassifier(random_state=0, booster="dart"),
}
# Parallel name / estimator lists, kept available alongside the dict.
key = list(models)
value = list(models.values())
models

In [None]:
# Train every candidate on the training split and record its accuracy on the
# held-out test split, in the same order as `models`.
predicted = []
for name in models:
    model = models[name]
    predict = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, predict)
    predicted += [acc]
    print(name, acc)

As we can see, the AdaBoost classifier has the best accuracy, so we will go ahead with it.