In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np

In [2]:
# Load the training table from the CSV export into a DataFrame.
data = pd.read_csv(filepath_or_buffer='train.xlsx - train.csv')

In [3]:
# Preview the first five rows (five is the default row count of head()).
data.head(n=5)

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18,target
0,-70,-61,-66,-53,-51,-63,-82,-57,-76,-78,-66,-66,-61,-59,-73,-75,-63,-77,B37
1,-77,-74,-71,-76,-65,-63,-66,-52,-55,-75,-72,-75,-74,-61,-64,-63,-53,-63,B61
2,-53,-38,-55,-66,-62,-62,-65,-70,-62,-52,-56,-53,-66,-68,-72,-60,-68,-77,A19
3,-72,-62,-59,-65,-65,-65,-78,-82,-83,-59,-84,-60,-64,-83,-69,-72,-95,-73,A22
4,-67,-69,-65,-63,-59,-53,-70,-72,-71,-60,-61,-57,-54,-76,-61,-66,-71,-80,A33


In [4]:
# Build the feature matrix: everything except the 'target' label column.
# A later cell writes a 'Cluster' column into `data`; dropping it here as well
# (only when present) keeps this cell idempotent, so re-running it after
# clustering never feeds the cluster labels back in as a feature.
feature_frame = data.drop(columns='target').drop(columns='Cluster', errors='ignore')

# Standardize every feature to zero mean / unit variance. The fitted scaler is
# kept so new observations can be transformed with the same statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_frame)

In [7]:
def find_optimal_dbscan_params(X, eps_values=None, min_samples_values=None):
    """Grid-search DBSCAN's ``eps`` and ``min_samples`` by silhouette score.

    Every (eps, min_samples) combination is fitted on ``X`` and scored with
    ``silhouette_score``; the combination with the highest score wins.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix to cluster (already scaled by the caller).
    eps_values : iterable of float, optional
        Candidate neighbourhood radii. Defaults to 0.6, 0.7, ..., 1.4.
    min_samples_values : iterable of int, optional
        Candidate ``min_samples`` values. Defaults to 2, 3, ..., 6.

    Returns
    -------
    best_params : dict
        ``{'eps': float | None, 'min_samples': int | None}``. Both entries are
        ``None`` when no combination produced a scorable clustering.
    best_score : float
        Silhouette score of ``best_params`` (``-1`` when nothing was scorable).

    Notes
    -----
    DBSCAN labels noise points ``-1`` and the labels are passed to
    ``silhouette_score`` unchanged, so noise is scored as if it were one more
    cluster. Keep that in mind when interpreting the returned score.
    """
    if eps_values is None:
        # Build the grid from exact endpoints and round it. np.arange with a
        # 0.1 step accumulates floating-point error and yields values such as
        # 1.1999999999999997 instead of 1.2.
        eps_values = np.round(np.linspace(0.6, 1.4, 9), 1)
    if min_samples_values is None:
        min_samples_values = range(2, 7)
    # Materialize so a one-shot iterator can be reused for every eps value.
    min_samples_values = list(min_samples_values)

    n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)

    best_score = -1
    best_params = {'eps': None, 'min_samples': None}

    for eps in eps_values:
        for min_samples in min_samples_values:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)

            # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
            # and raises ValueError otherwise; skip such degenerate clusterings.
            n_labels = len(set(labels))
            if not 1 < n_labels < n_samples:
                continue

            score = silhouette_score(X, labels)
            if score > best_score:
                best_score = score
                # Store plain Python numbers so the result prints cleanly.
                best_params = {'eps': float(eps), 'min_samples': int(min_samples)}

    return best_params, best_score

In [8]:
# Run the grid search on the standardized features, then unpack the winning
# hyper-parameters into individual variables for the cells below.
optimal_params, best_silhouette_score = find_optimal_dbscan_params(X_scaled)
eps_optimal, min_samples_optimal = (
    optimal_params[key] for key in ('eps', 'min_samples')
)

In [9]:
# Fit the final model with the selected hyper-parameters (fit() returns the
# estimator itself) and record each row's cluster label; -1 marks noise.
dbscan = DBSCAN(eps=eps_optimal, min_samples=min_samples_optimal).fit(X_scaled)
data['Cluster'] = dbscan.labels_

In [10]:
# Summarize the search result and the number of rows per cluster label,
# one item per output line.
print(
    f"Optimal DBSCAN Parameters: eps = {eps_optimal}, min_samples = {min_samples_optimal}",
    f"Best Silhouette Score: {best_silhouette_score}",
    data['Cluster'].value_counts(),
    sep="\n",
)

Optimal DBSCAN Parameters: eps = 1.1999999999999997, min_samples = 5
Best Silhouette Score: 0.6562839325746399
Cluster
-1      1353
 50      417
 109     331
 54      324
 51      308
        ... 
 211       5
 141       5
 227       5
 260       5
 259       4
Name: count, Length: 263, dtype: int64


In [11]:
new_data_point = [-76, -83, -70, -66, -64, -72, -64, -69, -60, -76, -83, -78, -81, -81, -81, -70, -60, -60]
# Scale the new observation with the statistics learned from the training data.
new_data_scaled = scaler.transform([new_data_point])

# DBSCAN has no predict(). Calling fit_predict() here would re-fit the model on
# this single point, which can only ever be labelled noise when
# min_samples >= 2, and would also discard the clustering fitted on the
# training data. Instead, reuse the fitted model: the point joins the cluster
# of its nearest core sample if that core sample lies within eps; otherwise it
# is noise (-1). Euclidean distance matches DBSCAN's default metric, which is
# what `dbscan` was built with.
new_label = -1
if len(dbscan.components_) > 0:
    core_distances = np.linalg.norm(dbscan.components_ - new_data_scaled, axis=1)
    nearest_core = int(np.argmin(core_distances))
    if core_distances[nearest_core] <= dbscan.eps:
        new_label = dbscan.labels_[dbscan.core_sample_indices_[nearest_core]]

# Keep a one-element array so downstream code can still read new_cluster[0].
new_cluster = np.array([new_label])



In [12]:
# Report the cluster label obtained for the new observation
# (DBSCAN uses -1 to mark noise / no cluster).
print(f"New data point belongs to cluster {new_cluster[0]}")

New data point belongs to cluster -1


In [13]:
# Persist the table, including the 'Cluster' column, without writing the
# DataFrame index as an extra column.
data.to_csv(path_or_buf='clustering_results.csv', index=False)