In this notebook, I focus on different clustering techniques to cluster our data.

In [1]:
#Download libraries
from sklearn.cluster import DBSCAN
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import time


# import other functions
from imputer import *
from feature_eng import *
from drop import *

## Data Preprocessing

In [2]:
#Download the data
df = pd.read_csv('../data/train_data.zip')
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [3]:
# drop rows missing target variable
df = drop_missing_unacast(df)
# create X and y
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']
# split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.2,
                                                      random_state=2020)
# impute NaN values
result = impute_data(X_train, X_valid)
X_train = result[0]
X_valid = result[1] 
# perform feature eng
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)
# perform dropping
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

X_train = X_train.drop(columns=['monthly_rain', 'monthly_avg_rain_length'])
X_valid = X_valid.drop(columns=['monthly_rain', 'monthly_avg_rain_length'])

#perform scaling of the numerical variables
categorical_features = X_train.loc[:, X_train.dtypes == "object"]
categorical_features.columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns = list(categorical_features.columns)))
X_valid_scaled = scaler.transform(X_valid.drop(columns = list(categorical_features.columns)))

X_train = pd.concat([pd.DataFrame(X_train_scaled, 
                                  index=X_train.index, 
                                  columns=X_train.drop(columns = list(categorical_features.columns)).columns), 
                     X_train[list(categorical_features.columns)]], axis=1)
X_valid = pd.concat([pd.DataFrame(X_valid_scaled, 
                                  index=X_valid.index, 
                                  columns=X_train.drop(columns = list(categorical_features.columns)).columns), 
                     X_valid[list(categorical_features.columns)]], axis=1)

# perform OHE (climate, density_class, income_class)
X_train_valid = clean_categorical(X_train, X_valid, ['income_class', 'density_class', 'climate'])
X_train = X_train_valid[0]
X_valid = X_train_valid[1]

print(X_train.shape)
print(X_valid.shape)


(39592, 630)
(9898, 630)


# DBSCAN

In [11]:
t0 = time.time()
clustering = DBSCAN(eps=100, min_samples=50, metric='manhattan').fit(X_train)
t1 = time.time()

In [12]:
run_time = t1 - t0
run_time

2638.3172006607056

In [13]:
lab = clustering.labels_
lab

array([-1, -1, -1, ..., -1, -1, -1])

In [14]:
unique_elements, counts_elements = np.unique(lab, return_counts=True)

In [15]:
unique_elements

array([-1,  0,  1,  2,  3,  4,  5,  6])

In [16]:
counts_elements

array([39107,    65,    77,    85,    66,    69,    60,    63])

**Conclusion** : So fare, I haven't be able to find the right hyperparameters for DBSCAN to cluster the data the right way.

In [4]:
t2 = time.time()
clustering = DBSCAN(eps=300, min_samples=50, metric='manhattan').fit(X_train)
t3 = time.time()

In [6]:
run_time_2 = t3 - t2
run_time_2

2328.2767939567566

In [7]:
lab = clustering.labels_
lab

array([ 0,  0,  0, ..., -1,  0,  0])

In [8]:
unique_elements, counts_elements = np.unique(lab, return_counts=True)
print(unique_elements)
print(counts_elements)

[-1  0  1  2  3]
[ 5836 33553    71    66    66]


In [10]:
t4 = time.time()
clustering_2 = DBSCAN(eps=200, min_samples=50, metric='manhattan').fit(X_train)
t5 = time.time()

In [11]:
run_time_3 = t5 - t4
print(run_time_3)
lab_2 = clustering_2.labels_
print(lab_2)

2311.055247783661
[-1 -1 -1 ... -1 -1 -1]


In [12]:
unique_elements_2, counts_elements_2 = np.unique(lab_2, return_counts=True)
print(unique_elements_2)
print(counts_elements_2)

[-1  0  1  2  3  4  5  6  7]
[26754 12360    79    78    71    66    69    66    49]


In [None]:
t6 = time.time()
clustering_3 = DBSCAN(eps=400, min_samples=48, metric='manhattan').fit(X_train)
t7 = time.time()

In [None]:
run_time_4 = t7 - t6
print(run_time_4)
lab_3 = clustering_3.labels_
print(lab_3)
unique_elements_3, counts_elements_3 = np.unique(lab_3, return_counts=True)
print(unique_elements_3)
print(counts_elements_3)