###import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, DBSCAN
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

###loading data

In [2]:
divar_df = pd.read_csv('/content/drive/MyDrive/divar_posts_dataset.csv', index_col=[0])
dk_df = pd.read_csv('/content/drive/MyDrive/orders.csv')

In [3]:
print(divar_df.shape)
print(dk_df.shape)

(947635, 16)
(200000, 7)


#frequently used functions

In [4]:
def find_cluster_labels(kmeans, actual_labels):
    # Associates most probable label with each cluster in KMeans model

    inferred_labels = {}
    for i in range(kmeans.n_clusters):
        labels = []
        index = np.where(kmeans.labels_ == i)

        labels.append(actual_labels[index])
        if len(labels[0]) == 1:
            counts = np.bincount(labels[0])
        else:
            counts = np.bincount(np.squeeze(labels))

        # assign the cluster to a value in the inferred_labels dictionary
        if np.argmax(counts) in inferred_labels:
            inferred_labels[np.argmax(counts)].append(i)
        else:
            inferred_labels[np.argmax(counts)] = [i]

        print('Cluster: {}, label: {}'.format(i, np.argmax(counts)))
        
    return inferred_labels  

def find_data_labels(X_labels, cluster_labels):
    # Determines label for each array, depending on the cluster it has been assigned to.
    
    predicted_labels = np.zeros(len(X_labels)).astype(np.uint8)
    for i, cluster in enumerate(X_labels):
        for key, value in cluster_labels.items():
            if cluster in value:
                predicted_labels[i] = key
                
    return predicted_labels

In [5]:
def evaluate_kmeans(estimator, labels):
    contingency_matrix = metrics.cluster.contingency_matrix(labels, estimator.labels_)
    
    print('Number of Clusters: ' + str(estimator.n_clusters))
    print("purity:" + str(np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)))

#part 1

prepare data for clustering

In [6]:
divar_df = divar_df.loc[~(divar_df['cat2'].isnull())]

cat1_le = preprocessing.LabelEncoder()
cat2_le = preprocessing.LabelEncoder()
city_le = preprocessing.LabelEncoder()

divar_df['cat1'] = cat1_le.fit_transform(divar_df['cat1'])
divar_df['cat2'] = cat2_le.fit_transform(divar_df['cat2'])
divar_df['city'] = city_le.fit_transform(divar_df['city'])

X = divar_df[["cat1", "cat2"]].values
Y = divar_df[["city"]].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.2)

define clustering function

In [7]:
clusters_number = divar_df['city'].nunique()
print("number of cities exists in dataset: " + str(clusters_number))

kmeans = MiniBatchKMeans(n_clusters = clusters_number)
kmeans.fit(X_train)

number of cities exists in dataset: 9


MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=9, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

clustering and find each clusters label

In [8]:
cluster_labels = find_cluster_labels(kmeans, Y_train)
X_clusters = kmeans.predict(X_train)
predicted_labels = find_data_labels(X_clusters, cluster_labels)

Cluster: 0, label: 8
Cluster: 1, label: 8
Cluster: 2, label: 8
Cluster: 3, label: 8
Cluster: 4, label: 8
Cluster: 5, label: 8
Cluster: 6, label: 8
Cluster: 7, label: 8
Cluster: 8, label: 8


evaluate clustering

In [9]:
evaluate_kmeans(kmeans, Y_train)

Number of Clusters: 9
purity:0.4666083433218669


#part2

prepare data for clustering

In [10]:
dk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID_Order               200000 non-null  int64  
 1   ID_Customer            200000 non-null  int64  
 2   ID_Item                200000 non-null  int64  
 3   DateTime_CartFinalize  200000 non-null  object 
 4   Amount_Gross_Order     200000 non-null  float64
 5   city_name_fa           200000 non-null  object 
 6   Quantity_item          200000 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 10.7+ MB


In [11]:
city_le = preprocessing.LabelEncoder()

dk_df['city_name_fa'] = city_le.fit_transform(dk_df['city_name_fa'])

X = dk_df[['ID_Item']].values
Y = dk_df[['city_name_fa']].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.2)

define clustering function

In [12]:
cities_number = dk_df['city_name_fa'].nunique()
print("number of cities exists in dataset: " + str(cities_number))

kmeans = MiniBatchKMeans(n_clusters = 20)
kmeans.fit(X_train)

number of cities exists in dataset: 906


MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=20, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

clustering and find each cluster's label

In [13]:
cluster_labels = find_cluster_labels(kmeans, Y_train)
X_clusters = kmeans.predict(X_train)
predicted_labels = find_data_labels(X_clusters, cluster_labels)

Cluster: 0, label: 215
Cluster: 1, label: 215
Cluster: 2, label: 215
Cluster: 3, label: 215
Cluster: 4, label: 215
Cluster: 5, label: 215
Cluster: 6, label: 215
Cluster: 7, label: 215
Cluster: 8, label: 215
Cluster: 9, label: 215
Cluster: 10, label: 215
Cluster: 11, label: 215
Cluster: 12, label: 215
Cluster: 13, label: 215
Cluster: 14, label: 215
Cluster: 15, label: 215
Cluster: 16, label: 215
Cluster: 17, label: 215
Cluster: 18, label: 215
Cluster: 19, label: 215


evaluate clustering

In [14]:
evaluate_kmeans(kmeans, Y_train)

Number of Clusters: 20
purity:0.54169375


#part 4

prepare data for clustering

In [15]:
divar_df = divar_df.loc[~(divar_df['cat2'].isnull())]
divar_df = divar_df.loc[~(divar_df['price'] == -1)]


# cat1 & cat2 have already encoded in part 1

price_le = preprocessing.StandardScaler()

# divar_df['price'] = city_le.fit_transform(divar_df['price'])

X = divar_df[["cat1", "cat2"]].values
Y = divar_df[["price"]].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.2)

define clustering function

In [16]:
kmeans = MiniBatchKMeans(n_clusters=10)
kmeans.fit(X_train)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=10, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

clustering and find each clusters label

In [17]:
cluster_labels = find_cluster_labels(kmeans, Y_train)
X_clusters = kmeans.predict(X_train)
predicted_labels = find_data_labels(X_clusters, cluster_labels)

Cluster: 0, label: 150000
Cluster: 1, label: 50000
Cluster: 2, label: 12000000
Cluster: 3, label: 100000
Cluster: 4, label: 300000
Cluster: 5, label: 100000
Cluster: 6, label: 100000
Cluster: 7, label: 50000
Cluster: 8, label: 100000
Cluster: 9, label: 200000


evaluate clustering

In [18]:
Y_train = Y_train.ravel()
correct =0
for i in range(len(Y_train)):
  alpha = 0.9
  max_price = Y_train[i] * (1 + alpha)
  min_price = Y_train[i] * (1 - alpha)
  if min_price < predicted_labels[i] < max_price:
    correct += 1

print('Number of Clusters: ' + str(kmeans.n_clusters))
print("accuracy: " + str(correct/len(Y_train)))

Number of Clusters: 10
accuracy: 0.003503497127132356
