In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import sparse
!pip install kneed
from kneed import KneeLocator

## Reading Data

In [2]:
# Load only the income and cost columns, give them shorter names, replace
# missing values with 0, and store Incomes as integers. Costs is not cast and
# keeps whatever dtype it has after the fill.
all_years_df = (
    pd.read_excel(
        '/content/drive/MyDrive/Clean Data/KMeans Data/finall_v_income_cost.xlsx',
        usecols=['totall_income', 'totall_cost'],
    )
    .rename(columns={'totall_income': 'Incomes', 'totall_cost': 'Costs'})
    .fillna(0)
    .astype({'Incomes': 'int'})
)

## DBSCAN using all of the data

In [None]:
# Sweep DBSCAN's min_samples over 2..20 on the raw (unscaled) data. For each
# value, eps is taken from the knee of the sorted k-distance curve and the data
# is clustered with that (eps, min_samples) pair.
# WARNING: per the note below this cell, running it on the full dataset crashed
# the session.
all_clusters = []
for min_pts in range(2, 21):
    nn = NearestNeighbors(n_neighbors=min_pts).fit(all_years_df)
    # Called without arguments, kneighbors() queries the fitted points themselves
    # and does not count a point as its own neighbour.
    neigh_dist, _ = nn.kneighbors()
    # Distance to the farthest of the min_pts neighbours, sorted ascending.
    borders = np.sort(neigh_dist[:, -1])
    knee_loc = KneeLocator(np.arange(len(borders)), borders, curve='convex')
    # KneeLocator reports None when it finds no knee; borders[None] would then
    # yield a 2-D array instead of a scalar eps, so skip this setting.
    if knee_loc.elbow is None:
        print(min_pts, 'no knee found - skipped')
        continue
    eps = borders[knee_loc.elbow]
    # DBSCAN needs a strictly positive eps; a knee at distance 0 (possible when
    # many points are duplicates) cannot be used.
    if eps <= 0:
        print(min_pts, 'knee at non-positive eps - skipped', eps)
        continue
    dbscan = DBSCAN(eps=eps, min_samples=min_pts)
    clusters = dbscan.fit_predict(all_years_df)
    # clusters_num holds the distinct label values (-1 is DBSCAN's noise label),
    # not a count of clusters; counts holds the size of each label.
    clusters_num, counts = np.unique(clusters, return_counts=True)
    print(min_pts, (clusters_num, counts), eps)
    all_clusters.append((min_pts, eps, clusters_num))

Running the cell above causes the system to crash, because DBSCAN is too computationally expensive to run on the full dataset.

## Scaling Data

In [3]:
# Standardise the two features with StandardScaler so that neither dominates
# the Euclidean distances used by the neighbour search and DBSCAN.
feature_cols = ['Incomes', 'Costs']
scaler = StandardScaler()
# Label the output with the same list that selected the input. Taking the labels
# from all_years_df.columns instead would silently swap the names whenever the
# frame's column order differs from the selection order.
scaled_data = pd.DataFrame(
    scaler.fit_transform(all_years_df[feature_cols]),
    columns=feature_cols,
)

In [None]:
# Sweep DBSCAN's min_samples over 2..20 on the standardised data. For each
# value, eps is taken from the knee of the sorted k-distance curve and the data
# is clustered with that (eps, min_samples) pair.
# WARNING: per the note below this cell, this still crashed the session on the
# full dataset.
all_clusters = []
for min_pts in range(2, 21):
    nn = NearestNeighbors(n_neighbors=min_pts).fit(scaled_data)
    # Called without arguments, kneighbors() queries the fitted points themselves
    # and does not count a point as its own neighbour.
    neigh_dist, _ = nn.kneighbors()
    # Distance to the farthest of the min_pts neighbours, sorted ascending.
    borders = np.sort(neigh_dist[:, -1])
    knee_loc = KneeLocator(np.arange(len(borders)), borders, curve='convex')
    # KneeLocator reports None when it finds no knee; borders[None] would then
    # yield a 2-D array instead of a scalar eps, so skip this setting.
    if knee_loc.elbow is None:
        print(min_pts, 'no knee found - skipped')
        continue
    eps = borders[knee_loc.elbow]
    # DBSCAN needs a strictly positive eps; a knee at distance 0 (possible when
    # many points are duplicates) cannot be used.
    if eps <= 0:
        print(min_pts, 'knee at non-positive eps - skipped', eps)
        continue
    dbscan = DBSCAN(eps=eps, min_samples=min_pts)
    clusters = dbscan.fit_predict(scaled_data)
    # clusters_num holds the distinct label values (-1 is DBSCAN's noise label),
    # not a count of clusters; counts holds the size of each label.
    clusters_num, counts = np.unique(clusters, return_counts=True)
    print(min_pts, (clusters_num, counts))
    all_clusters.append((min_pts, eps, clusters_num))

Even after scaling the data, the session crashed.

## Sampling Data

In [5]:
# Draw a reproducible random subset of the standardised data so the DBSCAN
# sweep fits in memory. The size is capped at the number of available rows,
# because sample() raises ValueError when asked for more rows than exist
# (sampling is without replacement).
SAMPLE_SIZE = 25000
data_sampled = scaled_data.sample(n=min(SAMPLE_SIZE, len(scaled_data)), random_state=42)

In [6]:
# Sweep DBSCAN's min_samples over 2..20 on the sampled, standardised data. For
# each value, eps is taken from the knee of the sorted k-distance curve and the
# sample is clustered with that (eps, min_samples) pair.
all_clusters = []
for min_pts in range(2, 21):
    nn = NearestNeighbors(n_neighbors=min_pts).fit(data_sampled)
    # Called without arguments, kneighbors() queries the fitted points themselves
    # and does not count a point as its own neighbour.
    neigh_dist, _ = nn.kneighbors()
    # Distance to the farthest of the min_pts neighbours, sorted ascending.
    borders = np.sort(neigh_dist[:, -1])
    knee_loc = KneeLocator(np.arange(len(borders)), borders, curve='convex')
    # KneeLocator reports None when it finds no knee; borders[None] would then
    # yield a 2-D array instead of a scalar eps, so skip this setting.
    if knee_loc.elbow is None:
        print(min_pts, 'no knee found - skipped')
        continue
    eps = borders[knee_loc.elbow]
    # DBSCAN needs a strictly positive eps; a knee at distance 0 (possible when
    # many points are duplicates) cannot be used.
    if eps <= 0:
        print(min_pts, 'knee at non-positive eps - skipped', eps)
        continue
    dbscan = DBSCAN(eps=eps, min_samples=min_pts)
    clusters = dbscan.fit_predict(data_sampled)
    # clusters_num holds the distinct label values (-1 is DBSCAN's noise label),
    # not a count of clusters; counts holds the size of each label.
    clusters_num, counts = np.unique(clusters, return_counts=True)
    print(min_pts, (clusters_num, counts))
    all_clusters.append((min_pts, eps, clusters_num))

2 (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]), array([   32, 24936,     2,     2,     2,     4,     3,     2,     2,
           6,     2,     5,     2]))
3 (array([-1,  0,  1,  2]), array([   23, 24968,     6,     3]))
4 (array([-1,  0,  1]), array([   46, 24949,     5]))
5 (array([-1,  0,  1]), array([   46, 24949,     5]))
6 (array([-1,  0,  1]), array([   75, 24920,     5]))
7 (array([-1,  0]), array([   39, 24961]))
8 (array([-1,  0]), array([   67, 24933]))
9 (array([-1,  0]), array([   67, 24933]))
10 (array([-1,  0]), array([   64, 24936]))
11 (array([-1,  0]), array([   69, 24931]))
12 (array([-1,  0]), array([   68, 24932]))
13 (array([-1,  0]), array([   55, 24945]))
14 (array([-1,  0]), array([   61, 24939]))
15 (array([-1,  0]), array([   52, 24948]))
16 (array([-1,  0]), array([  114, 24886]))
17 (array([-1,  0]), array([  111, 24889]))
18 (array([-1,  0]), array([   53, 24947]))
19 (array([-1,  0]), array([   52, 24948]))
20 (array([-1,  0]), array([  125

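Finally we are getting some results, but they are both meaningless and worthless: for every setting, almost all of the 25,000 sampled points fall into a single cluster (label 0), with only a handful of points labelled as noise or placed in tiny clusters.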

## Sparsifying Data

In [11]:
# NOTE(review): despite its name, sparse_matrix is a dense DataFrame. This cell
# used to build sparse.csr_matrix(data_sampled) first, but that result was
# overwritten by the assignment below before anything read it, so the dead
# conversion has been dropped. What the next cell actually clusters is the sample
# standardised a second time.
# Side effect: fit_transform re-fits the shared `scaler` on the sample, replacing
# the parameters it learned from the full dataset.
sparse_feature_cols = ['Incomes', 'Costs']
# Label the output with the same list that selected the input, so the names
# cannot be swapped if all_years_df's column order differs from the selection.
sparse_matrix = pd.DataFrame(
    scaler.fit_transform(data_sampled[sparse_feature_cols]),
    columns=sparse_feature_cols,
)

In [12]:
# Sweep DBSCAN's min_samples over 2..20 on sparse_matrix. For each value, eps
# is taken from the knee of the sorted k-distance curve and the data is
# clustered with that (eps, min_samples) pair.
all_clusters = []
for min_pts in range(2, 21):
    nn = NearestNeighbors(n_neighbors=min_pts).fit(sparse_matrix)
    # Called without arguments, kneighbors() queries the fitted points themselves
    # and does not count a point as its own neighbour.
    neigh_dist, _ = nn.kneighbors()
    # Distance to the farthest of the min_pts neighbours, sorted ascending.
    borders = np.sort(neigh_dist[:, -1])
    knee_loc = KneeLocator(np.arange(len(borders)), borders, curve='convex')
    # KneeLocator reports None when it finds no knee; borders[None] would then
    # yield a 2-D array instead of a scalar eps, so skip this setting.
    if knee_loc.elbow is None:
        print(min_pts, 'no knee found - skipped')
        continue
    eps = borders[knee_loc.elbow]
    # DBSCAN needs a strictly positive eps; a knee at distance 0 (possible when
    # many points are duplicates) cannot be used.
    if eps <= 0:
        print(min_pts, 'knee at non-positive eps - skipped', eps)
        continue
    dbscan = DBSCAN(eps=eps, min_samples=min_pts)
    clusters = dbscan.fit_predict(sparse_matrix)
    # clusters_num holds the distinct label values (-1 is DBSCAN's noise label),
    # not a count of clusters; counts holds the size of each label.
    clusters_num, counts = np.unique(clusters, return_counts=True)
    print(min_pts, (clusters_num, counts))
    all_clusters.append((min_pts, eps, clusters_num))

2 (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20]), array([   63, 24884,     2,     3,     2,     3,     2,     2,     3,
           2,     6,     2,     3,     4,     2,     3,     4,     2,
           2,     2,     2,     2]))
3 (array([-1,  0,  1,  2,  3,  4,  5,  6,  7]), array([   98, 24872,     3,     5,     5,     3,     6,     4,     4]))
4 (array([-1,  0]), array([   45, 24955]))
5 (array([-1,  0,  1]), array([   30, 24965,     5]))
6 (array([-1,  0]), array([   45, 24955]))
7 (array([-1,  0]), array([   42, 24958]))
8 (array([-1,  0]), array([   56, 24944]))
9 (array([-1,  0]), array([   66, 24934]))
10 (array([-1,  0]), array([   58, 24942]))
11 (array([-1,  0]), array([   39, 24961]))
12 (array([-1,  0]), array([  112, 24888]))
13 (array([-1,  0]), array([   61, 24939]))
14 (array([-1,  0]), array([   64, 24936]))
15 (array([-1,  0]), array([   41, 24959]))
16 (array([-1,  0]), array([   53, 24947]))
17 (array([-1,  0])

Again the results are meaningless and worthless.

## In conclusion, DBSCAN is not a suitable algorithm for our dataset, and we should stick to the KMeans algorithm.