In [None]:
import pandas as pd
from sklearn.cluster import DBSCAN
import numpy as np

In [None]:
# Load the diabetes dataset from the CSV file.
glucose = pd.read_csv('/content/diabetes.csv')

# A 0 in these measurement columns is treated as a missing reading, so it is
# replaced with NaN. `np.nan` is used because the `np.NaN` alias was removed
# in NumPy 2.0 and raises AttributeError there.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
glucose[zero_as_missing] = glucose[zero_as_missing].replace(0, np.nan)

# Drop every row that has a missing value in any column.
glucose.dropna(inplace=True)
glucose.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
13,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1


In [None]:
# Measurement columns that are examined for outliers.
columns_to_check = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

### **DBSCAN (DENSITY-BASED SPATIAL CLUSTERING OF APPLICATIONS WITH NOISE)**


DBSCAN is a density-based clustering algorithm that divides a dataset into subgroups of high-density regions and identifies points lying in low-density regions as outliers. Here the cluster label -1 marks the points DBSCAN treats as noise (the outliers); the remaining clusters contain no outliers. This approach is similar to K-means clustering. DBSCAN requires two parameters. DBSCAN gives its best results for multivariate outlier detection.

1. epsilon (`eps`): a distance parameter that defines the radius to search for nearby neighbors.
2. minPts (`min_samples`): the minimum number of points required to form a cluster.

Using epsilon and minPts, we can classify each data point as:
1.   Core point –> a point that has at least a minimum number of other points (minPts) within its radius.
2.   Border point –> a point that is within the radius of a core point but has fewer than the minimum number of other points (minPts) within its own radius.
3.   Noise point –> a point that is neither a core point nor a border point.

In [None]:
def DB_outliers(column, eps=10, min_samples=5):
    """Print the DBSCAN cluster sizes for one column of ``glucose``.

    The column is clustered on its own (a single feature) and the number of
    points assigned to each cluster label is printed. Label ``-1`` is the
    label DBSCAN gives to noise points, i.e. the candidate outliers.

    Args:
        column: Name of the column in the module-level ``glucose`` frame.
        eps: Neighbourhood radius passed to ``DBSCAN``. It is measured in
            the column's own units, so it usually needs tuning per column.
        min_samples: Minimum neighbourhood size passed to ``DBSCAN``.
    """
    # DBSCAN expects a 2-D array of shape (n_samples, n_features).
    features = glucose[column].values.reshape(-1, 1)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(features)
    # value_counts() already orders labels from most to least frequent,
    # so no additional sort is needed.
    cluster_counts = pd.Series(labels).value_counts()
    print(f"{column} Cluster Counts:")
    print(cluster_counts)
    print()

# Report the DBSCAN cluster sizes for each column in columns_to_check.
for column in columns_to_check:
    DB_outliers(column)


GlucoseCluster Counts:
 0    391
-1      1
dtype: int64

BloodPressureCluster Counts:
0    392
dtype: int64

SkinThicknessCluster Counts:
0    392
dtype: int64

InsulinCluster Counts:
 0    357
-1     17
 1     10
 2      8
dtype: int64

BMICluster Counts:
0    392
dtype: int64



### **IQR METHOD**

In this method we detect outliers by using the Inter Quartile Range (IQR). The IQR (Q3 − Q1) tells us the variation in the data set. Any value that lies outside the range Q1 − 1.5 × IQR to Q3 + 1.5 × IQR is treated as an outlier.

* Q1 represents the 1st quartile/25th percentile of the data.
* Q2 represents the 2nd quartile/median/50th percentile of the data.
* Q3 represents the 3rd quartile/75th percentile of the data.
* (Q1 − 1.5 × IQR) represents the smallest value in the data set that is not treated as an outlier (the lower bound) and (Q3 + 1.5 × IQR) represents the largest such value (the upper bound).

In [None]:
# Columns scanned by the IQR outlier check.
columns_to_check = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

def print_outlier_count(column):
    """Print how many values of ``glucose[column]`` lie outside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are
    the 25th and 75th percentiles of the column and IQR = Q3 - Q1.
    """
    values = glucose[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    # One boolean per row; summing it counts the rows outside the fences.
    is_outlier = (values < low_fence) | (values > high_fence)
    outliers_count = int(is_outlier.sum())
    print(f"Outliers count in {column}: {outliers_count}")

# Print the IQR-based outlier count for each column in columns_to_check.
for column in columns_to_check:
    print_outlier_count(column)

Outliers count in Glucose: 0
Outliers count in BloodPressure: 7
Outliers count in SkinThickness: 1
Outliers count in Insulin: 25
Outliers count in BMI: 6
