In [1]:
from pyspark import SparkContext
import csv
from pyspark.sql import SparkSession
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import col
from datetime import datetime
from pyspark.mllib.clustering import BisectingKMeans
from sklearn.cluster import KMeans
import pandas as pd
from math import sqrt
import numpy as np
import re
import ast
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as spclust
from scipy.spatial import distance
import matplotlib.pyplot as plt

In [2]:
#local spark
sc = SparkContext(appName="Clusters")
#sqlc = SQLContext(sc)
#vocareum
#sc = spark.sparkContext

In [3]:
def read_tracks():
        tracks = pd.read_csv("tracks.csv", index_col=0, header=[0, 1])

        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        SUBSETS = ('small', 'medium', 'large')
        try:
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                    'category', categories=SUBSETS, ordered=True)
        except (ValueError, TypeError):
            # the categories and ordered arguments were removed in pandas 0.25
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                     pd.CategoricalDtype(categories=SUBSETS, ordered=True))

        COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                   ('album', 'type'), ('album', 'information'),
                   ('artist', 'bio')]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype('category')

        return tracks


In [4]:
def read_features():
     return pd.read_csv("features.csv", index_col=0, header=[0, 1, 2])

In [5]:
data = read_tracks()#data = data.map(lambda line: line.split(",")).filter(lambda v: v[0]!="START")
track_genres = data['track','genre_top']
data = data[data['set','subset']=='small']
#data.iloc[:,0:0]
subset_ids = [i for i in data.index]

In [6]:
subset_ids
#features=read_features()
#features[small]

[2,
 5,
 10,
 140,
 141,
 148,
 182,
 190,
 193,
 194,
 197,
 200,
 203,
 204,
 207,
 210,
 211,
 212,
 213,
 255,
 256,
 368,
 424,
 459,
 534,
 540,
 546,
 574,
 602,
 615,
 620,
 621,
 625,
 666,
 667,
 676,
 690,
 694,
 695,
 704,
 705,
 706,
 707,
 708,
 709,
 714,
 715,
 716,
 718,
 777,
 814,
 821,
 822,
 825,
 853,
 890,
 892,
 897,
 993,
 995,
 997,
 998,
 1039,
 1040,
 1066,
 1069,
 1073,
 1075,
 1082,
 1083,
 1087,
 1102,
 1193,
 1195,
 1196,
 1197,
 1249,
 1259,
 1270,
 1276,
 1277,
 1278,
 1417,
 1427,
 1443,
 1482,
 1510,
 1544,
 1642,
 1644,
 1649,
 1661,
 1663,
 1666,
 1673,
 1680,
 1681,
 1682,
 1683,
 1684,
 1685,
 1686,
 1687,
 1688,
 1689,
 1701,
 1702,
 1703,
 1704,
 1706,
 1720,
 1732,
 1733,
 1735,
 1736,
 1883,
 1891,
 1893,
 1924,
 1925,
 1929,
 1930,
 2012,
 2096,
 2097,
 2099,
 3263,
 3264,
 3265,
 3266,
 3270,
 3271,
 3272,
 3273,
 3274,
 3492,
 3532,
 3533,
 3534,
 3535,
 3537,
 3538,
 3573,
 3598,
 3624,
 3707,
 3708,
 3720,
 3721,
 3722,
 3724,
 3725,
 37

In [7]:
feature_data = read_features()
feature_data

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.347620,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.071289,0.000000,2.089872,0.061448
3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,0.619185,...,0.063831,0.014212,0.017740,2.824694,0.466309,0.084578,0.063965,0.000000,1.716724,0.069330
5,0.527563,-0.077654,-0.279610,0.685883,1.937570,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.041504,0.000000,2.193303,0.044861
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.000000,3.542325,0.040800
20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.816410,0.043851,-0.804761,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,-0.490129,0.463834,2.321970,-0.084352,1.662914,2.115189,-0.237794,5.695442,0.830353,1.951819,...,0.128410,0.022547,0.019816,4.448255,0.172852,0.028773,0.028320,0.003906,0.955388,0.012385
155317,-0.461559,-0.229601,-0.496632,-0.422033,0.130612,-0.263825,-0.628103,-0.082687,-0.229483,-0.492753,...,0.132964,0.023548,0.026527,3.270612,0.196289,0.031116,0.027832,0.002441,1.283060,0.019059
155318,0.552473,-0.110498,-0.532014,0.263131,-0.224011,-0.530972,1.713526,1.418444,1.325197,0.120333,...,0.108324,0.017540,0.020471,2.356727,0.212891,0.038450,0.037109,0.003418,0.828569,0.017904
155319,-0.176901,0.187208,-0.050664,0.368843,0.066005,-0.857354,-0.780860,0.626281,-0.630938,-0.787229,...,0.088311,0.018328,0.017936,6.188604,0.167480,0.041480,0.038086,0.004883,1.818740,0.020133


In [8]:
#features = sc.textFile("features.csv")
#useless = features.take(4)
#useless


In [9]:
#filtered_features = features.filter(lambda a : not a in useless).map(lambda a : (int(a[0]),[float(i) for i in a.split(",")[1:]]))
#filtered_features.take(10)

In [10]:
pd_features = read_features()


In [11]:
pd_features
#pd_features = np.delete(pd_features.to_numpy(), mask, 1)

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.347620,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.071289,0.000000,2.089872,0.061448
3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,0.619185,...,0.063831,0.014212,0.017740,2.824694,0.466309,0.084578,0.063965,0.000000,1.716724,0.069330
5,0.527563,-0.077654,-0.279610,0.685883,1.937570,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.041504,0.000000,2.193303,0.044861
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.000000,3.542325,0.040800
20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.816410,0.043851,-0.804761,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,-0.490129,0.463834,2.321970,-0.084352,1.662914,2.115189,-0.237794,5.695442,0.830353,1.951819,...,0.128410,0.022547,0.019816,4.448255,0.172852,0.028773,0.028320,0.003906,0.955388,0.012385
155317,-0.461559,-0.229601,-0.496632,-0.422033,0.130612,-0.263825,-0.628103,-0.082687,-0.229483,-0.492753,...,0.132964,0.023548,0.026527,3.270612,0.196289,0.031116,0.027832,0.002441,1.283060,0.019059
155318,0.552473,-0.110498,-0.532014,0.263131,-0.224011,-0.530972,1.713526,1.418444,1.325197,0.120333,...,0.108324,0.017540,0.020471,2.356727,0.212891,0.038450,0.037109,0.003418,0.828569,0.017904
155319,-0.176901,0.187208,-0.050664,0.368843,0.066005,-0.857354,-0.780860,0.626281,-0.630938,-0.787229,...,0.088311,0.018328,0.017936,6.188604,0.167480,0.041480,0.038086,0.004883,1.818740,0.020133


In [12]:
# these were the features that cause errors in my experiments
mask = [ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191]
mask2 = [i-1 for i in mask]

pd_small_subset  = pd_features[pd_features.index.isin(subset_ids)]
pd_small_subset = np.delete(pd_small_subset.to_numpy(), mask, 1)

#pd_rest = pd_features[~pd_features.index.isin(subset_ids)]
#pd_rest = np.delete(pd_rest.to_numpy(), mask, 1)

In [13]:
pd_small_subset

array([[ 7.18065262e+00,  5.23030901e+00,  2.49320805e-01, ...,
         0.00000000e+00,  2.08987212e+00,  6.14481084e-02],
       [ 5.27562976e-01, -7.76543170e-02, -2.79610306e-01, ...,
         0.00000000e+00,  2.19330311e+00,  4.48606014e-02],
       [ 3.70224547e+00, -2.91193038e-01,  2.19674206e+00, ...,
         0.00000000e+00,  3.54232454e+00,  4.08004485e-02],
       ...,
       [-2.14509025e-01, -1.13046920e+00,  7.18534231e-01, ...,
         9.76562500e-04,  3.73664618e+00,  2.38210093e-02],
       [-4.87370938e-01, -9.23753798e-01, -2.83099025e-01, ...,
         3.41796875e-03,  3.99705172e+00,  4.57333475e-02],
       [ 4.42161486e-02, -3.00441355e-01, -2.17022225e-01, ...,
         0.00000000e+00,  9.68863487e+00,  3.07869967e-02]])

In [14]:
#pd_rest

In [15]:
"""

plt.figure(figsize=(20, 7))  
plt.title("Dendrograms")  

# Create dendrogram
spclust.dendrogram(spclust.linkage(pd_small_subset, method='ward'))

plt.title('Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Euclidean distance')
"""

'\n\nplt.figure(figsize=(20, 7))  \nplt.title("Dendrograms")  \n\n# Create dendrogram\nspclust.dendrogram(spclust.linkage(pd_small_subset, method=\'ward\'))\n\nplt.title(\'Dendrogram\')\nplt.xlabel(\'Sample index\')\nplt.ylabel(\'Euclidean distance\')\n'

In [16]:
def get_stats(all_entries,labels,k):
    all_clusters = get_all_clusters(all_entries, labels, k)
    p=0
    radius_list=[]
    density_list=[]
    for cluster in all_clusters:
      
        centroid = get_cluster_centroid(cluster)
        radius=get_cluster_radius(cluster,centroid)
        #diameter = get_cluster_diameter(cluster)
        density = len(cluster)/(radius**2)
        radius_list.append(radius)
        density_list.append(density)
        #print("cluster ",p,"\nradius: " + str(radius) + "\ndiameter: " + str(diameter) + "\ndensity: " + str(density))
        #print("cluster ",p,"\nradius: " + str(radius) +  "\ndensity: " + str(density))
        p+=1
    print("average radius: ",sum(radius_list)/len(radius_list))
    print("average density: ",sum(density_list)/len(density_list))

In [17]:
def get_clusters(entries, labels, i):
    cluster_i = np.array([entries[j] for j in range(len(labels)) if labels[j]==i])
    return cluster_i

    

In [18]:
def get_all_clusters(entries, labels, k):
    all_clusters = []
    for i in range(k):
        all_clusters.append(get_clusters(entries, labels, i))
    return all_clusters #np.array(all_clusters)

In [19]:
def get_cluster_centroid(values):
    return np.mean(values,axis=0)

In [20]:
def get_cluster_radius(values, centroid):
    
    max_dist=0
    for i in range(len(values)):
       #maybe vectorize
      
        dist = distance.euclidean(values[i],centroid)
        if dist > max_dist:
            max_dist = dist
    return max_dist

In [21]:
def get_cluster_diameter(values):
    max_dist=0
    for i in range(len(values)):
        for j in range(i+1,len(values)):
            dist = distance.euclidean(values[i],values[j])
            if dist > max_dist:
                max_dist = dist
    return max_dist

In [22]:
"""

min_clusters = 8
#max_clusters = min_clusters
max_clusters = 16
for k in range(min_clusters,max_clusters+1):
    cluster = AgglomerativeClustering(
        n_clusters=k, affinity='euclidean', linkage='ward')

    cluster.fit(pd_small_subset)
    labels = cluster.labels_
    print("for ",k," clusters");
    #print(labels)
    
    #k_clusters = pd_small_subset.copy()
    
    np_clusters = pd_small_subset#.to_numpy()    
 
    get_stats(np_clusters, labels, k)
"""

'\n\nmin_clusters = 8\n#max_clusters = min_clusters\nmax_clusters = 16\nfor k in range(min_clusters,max_clusters+1):\n    cluster = AgglomerativeClustering(\n        n_clusters=k, affinity=\'euclidean\', linkage=\'ward\')\n\n    cluster.fit(pd_small_subset)\n    labels = cluster.labels_\n    print("for ",k," clusters");\n    #print(labels)\n    \n    #k_clusters = pd_small_subset.copy()\n    \n    np_clusters = pd_small_subset#.to_numpy()    \n \n    get_stats(np_clusters, labels, k)\n'

In [23]:
k_initial = 9
threshold = sqrt(494)

In [24]:
def cluster_summary(entries, labels, i):
    cluster_i = np.array([entries[j] for j in range(len(labels)) if labels[j]==i])
    n = len(cluster_i)
    
    sum_v = np.sum(cluster_i,axis=0)
    sq_cluster_i = np.array([k**2 for k in cluster_i])
    sum_sq_v = np.sum(sq_cluster_i,axis=0)
    return (n,sum_v,sum_sq_v)

    

In [25]:
#print(filtered_features.count())
#print(len(subset_ids))
#filtered_features = filtered_features.filter(lambda a : not a[0] in subset_ids)
#print(filtered_features.count())

In [26]:
def mahalanobis(point,c_centroid,c_std_dev):
    #m = sum([(point[i]-c_centroid[i])/c_std_dev[i] for i in range(len(point))]) #assuming point is only values and no id
    m = [((point[i]-c_centroid[i])/c_std_dev[i])**2 for i in range(len(point))] #assuming point is only values and no id
    #for el in m:
    #    print(m)
    return sqrt(sum(m))
    

In [27]:
def cluster_std_deviation(cluster):
    std_dev = [sqrt((cluster[2][i]/cluster[0])-(cluster[1][i]/cluster[0])**2) for i in range(len(cluster[1]))]
    return std_dev

In [28]:
def cluster_centroid(cluster):
    
    centroid = [cluster[1][i]/cluster[0] for i in range(len(cluster[1]))]
    return centroid

In [29]:
initial_clusters = []
cluster = AgglomerativeClustering(
        n_clusters=k_initial, affinity='euclidean', linkage='ward')

cluster.fit(pd_small_subset)
labels = cluster.labels_
np_clusters = pd_small_subset#.to_numpy()  
for i in range(k_initial):
    initial_clusters.append(cluster_summary(np_clusters,labels,i))
    

print(initial_clusters)

[(527, array([ 4.12996771e+01,  2.72319464e+01,  8.39947401e+01,  3.12199200e+01,
        7.60747291e+01,  7.33772574e+01,  5.35941987e+01,  5.66668909e+01,
        5.19419962e+01,  7.39402543e+01,  4.53787632e+01,  5.18019305e+01,
        3.22810711e+02,  3.03639773e+02,  3.14979632e+02,  3.01984638e+02,
        3.14299235e+02,  3.08396149e+02,  3.03042518e+02,  3.18489643e+02,
        3.03549957e+02,  3.13147622e+02,  3.02389132e+02,  3.07868763e+02,
        1.39817147e+02,  1.34153031e+02,  1.31454195e+02,  1.34214424e+02,
        1.38645631e+02,  1.34752172e+02,  1.33817972e+02,  1.37513224e+02,
        1.35626793e+02,  1.31086456e+02,  1.27199909e+02,  1.27778915e+02,
        1.35015233e+02,  1.30152124e+02,  1.25801917e+02,  1.30659340e+02,
        1.34162778e+02,  1.30965484e+02,  1.30418164e+02,  1.32235022e+02,
        1.31584917e+02,  1.24928102e+02,  1.22700272e+02,  1.22411414e+02,
        9.77492290e+00,  9.43985514e+00,  8.95321417e+00,  8.52307289e+00,
        7.62461739

In [30]:
for c in initial_clusters:
    print(c[0], len(c[1]))

527 494
2125 494
372 494
845 494
324 494
1611 494
464 494
439 494
1293 494


In [31]:
cluster_rdd = sc.parallelize(initial_clusters)

##############broadcast cluster_rdd, for every point in new points check if it enters any cluster, if so map with such cluster id otherwise
##############use special key to indicate not discard set, at the end of every iteration, update clusters
#cluster_rdd.take(2)

In [32]:
#tracks_rdd = sc.parallelize(pd_rest)


In [33]:
#cluster_rdd.map(cluster_centroid).take(3)

In [34]:
def isSummarized2(point, centroids, cluster_stds):
    #print("lenpoint ", len(point))
    possible_centroids={}
    for i in range(len(centroids)):
       
        
        mahal = mahalanobis(point,centroids[i],cluster_stds[i]) 
        #print("cluster ", i, "with len ", len(centroids[i])," with std dev len " ,len(cluster_stds[i]),"with mahal ", mahal)
        if(mahal) <= threshold:
            possible_centroids[i]=mahal
    if len(possible_centroids)<1:
        return -1
    else:
        return min(possible_centroids, key=possible_centroids.get)
    

In [35]:
def isSummarized(point,centroids, std_deviations):
    possible_centroids={}
    for i in range(len(centroids)):
        mahal = mahalanobis(point,centroids,std_deviations) 
        if(mahal) <= threshold:
            possible_centroids[i]=mahal
    if len(possible_centroids)<1:
        return -1
    else:
        return min(possible_centroids, key=possible_centroids.get)

In [36]:
def add_to_cluster(cluster, points):
    
    new_n = cluster[0] + len(points)
    new_sum = cluster[1]
    new_sumsq = cluster[2]
    for point in points:
        new_sum = [new_sum[i] + point[i] for i in range(len(point))]
        new_sumsq = [new_sumsq[i] + point[i]**2 for i in range(len(point))]
    return (new_n,new_sum,new_sumsq)

In [37]:
def merge_clusters(cluster1,cluster2):
    new_n = cluster1[0] + cluster2[0]
    new_sum = [cluster1[1][i] + cluster2[1][i] for i in range(len(cluster1[1])) ]
    new_sumsq = [cluster1[2][i] + cluster2[2][i] for i in range(len(cluster1[2])) ]
    return (new_n,new_sum,new_sumsq)

In [38]:
#smaller_test = sc.parallelize(tracks_rdd.take(5000))
#print(smaller_test[:2])
#bfr(1,cluster_rdd,smaller_test)
#tracks_rdd.take(1)

In [39]:
def improv_bfr( ds_rdd, dataFile, batchSize):
    with open(dataFile, mode="r") as file:
        csvFile = csv.reader(file)
        std_deviations = ds_rdd.map(cluster_std_deviation).collect()
        centroids = ds_rdd.map(cluster_centroid).collect()
        skip=0
        rs_rdd=sc.parallelize([])
        cs_rdd=sc.parallelize([])
        for line in csvFile:
            if skip==3:
                break
            skip+=1
        done=False
        iteration=1
        while not done:
        #for i in range(5):
            print("ITERATION ", iteration)
            points = []
            amount=0
            for line in csvFile:
                if amount<batchSize:
                    if not int(line[0]) in subset_ids:
                        points.append(np.array(line[1:]).astype(float))
                        amount+=1
                else:
                    break
            if amount < batchSize:
                done=True
            points = np.asarray(points)        
            #print(points.shape)
            #print(mask)
            points = np.delete(points, mask, 1)
            print(points.shape)
            data = sc.parallelize(points)
            
            changes = bfr_iteration(ds_rdd,cs_rdd,rs_rdd,data)
            print("RETURNED")
            print("NEW CS SIZE", len(changes[1]))
            print("NEW RS SIZE", len(changes[2]))
            ds_rdd =sc.parallelize(changes[0])
            cs_rdd =sc.parallelize(changes[1])
            rs_rdd =sc.parallelize(changes[2])
        ###after its done merge compressed sets and rs to ds clusters
        finalDSList = ds_rdd.collect()
        DSCentroids = ds_rdd.map(cluster_centroid).collect()
       
        DSSTDD = ds_rdd.map(cluster_std_deviation).collect()
        CSCentroids = cs_rdd.map(cluster_centroid)
       
        finalUnassigned = CSCentroids.union(rs_rdd)
        finalUnassigned = finalUnassigned.map(lambda a: (a, closest_cluster(a, DSCentroids, DSSTDD)))
        finalClusters = []
        print("Done with last iteration, merging CS and RS to closest DS cluster")
        for i in range(len(finalDSList)):
      
            toAdd = finalUnassigned.filter(lambda a: a[1]==i).map(lambda a : a[0]).collect()
            newCluster = add_to_cluster(finalDSList[i],toAdd)
            finalClusters.append(newCluster)
            
        return finalClusters

In [40]:
def bfr_iteration( ds_rdd,cs_rdd, rs_rdd, data):
   
 
    cluster_list = ds_rdd.collect()
    cs = cs_rdd.collect()
    rs = rs_rdd.collect()
    centroids = ds_rdd.map(cluster_centroid).collect()
    std_deviations = ds_rdd.map(cluster_std_deviation).collect()
    #data = sc.parallelize(data)
    #prev_rs = sc.parallelize(rs)
    #prev_cs = sc.parallelize(cs)
    data = data.map(lambda a: (a,isSummarized2(a, centroids, std_deviations)))
    discard_set = data.filter(lambda a: a[1]!=-1)#.map(lambda a: a[0])
    leftover = data.filter(lambda a : a[1]==-1).keys().union(rs_rdd)
    newDS=[]
    newCS=[a for a in cs]
    #cluster_list = main_cluster_rdd.collect()
    #print(discard_set.take(10))
    

    ##########    INITIAL DS ASSIGNMENT
    for i in range(len(cluster_list)):
      
        toAdd = discard_set.filter(lambda a: a[1]==i).map(lambda a : a[0]).collect()
        newCluster = add_to_cluster(cluster_list[i],toAdd)
        newDS.append(newCluster)
        #print("elements in cluster", i ,"before bfr")
        #print(cluster_list[i][0])
        #print("elements in cluster", i ,"after bfr")
        #print(newCluster[0])
       
     
    print(leftover.count(), " points left to assign")
    nonDiscard = leftover.collect()
 
    bestnClusters=finding_best_k(nonDiscard)
    ######  CLUSTERING FOR BEST SIZE
 
    c_summary,c_centroids,c_std_dev = kmeans_clustering(nonDiscard, bestnClusters)
    newlyAssigned = leftover.map(lambda a: (a,isSummarized2(a, c_centroids, c_std_dev)))
    newlyAssignedSummary ={i:(0,[0 for _ in range(494)],[0 for _ in range(494)]) for i in range(bestnClusters)}
    checkMergeCentroids=[]
    checkMergeSTDD=[]
    for i in range(bestnClusters):
        ##add leftover to rs
        toAdd = newlyAssigned.filter(lambda a: a[1]==i).map(lambda a : a[0]).collect()
        if(len(toAdd)==0):
            continue
        #print(len(toAdd), " points for CS cluster ", i)
        newACluster = add_to_cluster(newlyAssignedSummary[i],toAdd)
        newlyAssignedSummary[i] = newACluster###added key
        checkMergeCentroids.append(cluster_centroid(newACluster))
        checkMergeSTDD.append(cluster_std_deviation(newACluster))
    ####check if can be merged with any old CS cluster
   
    prev_cs_centroids = cs_rdd.map(cluster_centroid).zipWithIndex()
    mergeCS = prev_cs_centroids.map(lambda a: (a[0],a[1],isSummarized2(a[0], checkMergeCentroids, checkMergeSTDD)))
    toMerge = mergeCS.filter(lambda a: a[2]!=-1).collect()
    #print(len(toMerge))
   
    mergedToCS = []
    for m in toMerge:
        if not m[1] in newCS:
            newCS[m[1]] = (0,[0 for _ in range(494)],[0 for _ in range(494)])

        newCS[m[1]] = merge_clusters(newCS[m[1]],newlyAssignedSummary[m[2]])
        mergedToCS.append(m[2])
    mergedToCS = set(mergedToCS)
    #print(mergedToCS)
    #print("MERGED TO CS ", len(mergedToCS))
    #print(newAssignedCSSummary)
    for CSC in newlyAssignedSummary:
        #print("new cs ", CSC , "has ", newlyAssignedSummary[CSC][0])
        if (not CSC in mergedToCS):
            newCS.append(newlyAssignedSummary[CSC])
    
    
    
    newCS = sc.parallelize(newCS).filter(lambda a: a[0]>0).collect()
    """
    
    currentDSCentroids = sc.parallelize(newDS).map(cluster_centroid).collect()
    currentDSSTDD = sc.parallelize(newDS).map(cluster_std_deviation).collect()
    """
    
    newRS = newlyAssigned.filter(lambda a: a[1]==-1).map(lambda a : a[0]).collect()
   
    
    return (newDS,newCS,newRS)
    

In [42]:
def finding_best_k(nonDiscard):
    bestnClusters=0
    bestAvgDensity=0
    ###### FINDING BEST CLUSTER SIZE FOR KMEANS
    for n_cs_clusters in range(1,5):
        
        kmeans = KMeans(n_clusters=n_cs_clusters).fit(nonDiscard)
        auxCS = {}
        for p in range(len(nonDiscard)):
            if not kmeans.labels_[p] in auxCS:
                auxCS[kmeans.labels_[p]]=[]
            auxCS.get(kmeans.labels_[p],[]).append(nonDiscard[p])
        avg_density=0
        for possibleCluster in auxCS:
            radius = get_cluster_radius(auxCS.get(possibleCluster),kmeans.cluster_centers_[possibleCluster])
           
            avg_density+=len(auxCS.get(possibleCluster,[]))/(radius**2)
        avg_density = avg_density/n_cs_clusters
        if avg_density>bestAvgDensity:
            bestAvgDensity=avg_density
            bestnClusters=n_cs_clusters
        #print("for ", n_cs_clusters, "avg density " ,avg_density)
        
    return bestnClusters

In [43]:
def kmeans_clustering(nonDiscard,bestnClusters):
    
    kmeans = KMeans(n_clusters=bestnClusters).fit(nonDiscard)
    c_summary = [cluster_summary(nonDiscard,kmeans.labels_,j) for j in range(bestnClusters)]
    c_centroids = [cluster_centroid(c) for c in c_summary]
    c_std_dev = [cluster_std_deviation(c) for c in c_summary]
    return c_summary, c_centroids, c_std_dev

In [44]:
def closest_cluster(point, centroids, cluster_stds):
    #print("lenpoint ", len(point))
    possible_centroids={}
    for i in range(len(centroids)):
       
        
        mahal = mahalanobis(point,centroids[i],cluster_stds[i]) 
        #print("cluster ", i, "with len ", len(centroids[i])," with std dev len " ,len(cluster_stds[i]),"with mahal ", mahal)
        
        possible_centroids[i]=mahal
    if len(possible_centroids)<1:
        return -1
    else:
        return min(possible_centroids, key=possible_centroids.get)
    

In [47]:
def final_closest_cluster(index,point, centroids, cluster_stds):
    #print("lenpoint ", len(point))
    possible_centroids={}
    for i in range(len(centroids)):
       
        
        mahal = mahalanobis(point,centroids[i],cluster_stds[i]) 
        #print("cluster ", i, "with len ", len(centroids[i])," with std dev len " ,len(cluster_stds[i]),"with mahal ", mahal)
        
        possible_centroids[i]=mahal
    if len(possible_centroids)<1:
        return -1
    else:
        return (index,min(possible_centroids, key=possible_centroids.get))
    

In [45]:
final_clusters = improv_bfr(cluster_rdd,"features.csv", 5000)

(5000, 494)
1411  points left to assign
0
RETURNED
NEW CS SIZE 4
NEW RS SIZE 326
(5000, 494)
1988  points left to assign
4
RETURNED
NEW CS SIZE 4
NEW RS SIZE 483
(5000, 494)
2725  points left to assign
4
RETURNED
NEW CS SIZE 5
NEW RS SIZE 639
(5000, 494)
2866  points left to assign
5
RETURNED
NEW CS SIZE 5
NEW RS SIZE 696
(5000, 494)
3138  points left to assign
5
RETURNED
NEW CS SIZE 5
NEW RS SIZE 819
(5000, 494)
3381  points left to assign
5
RETURNED
NEW CS SIZE 5
NEW RS SIZE 944
(5000, 494)
3346  points left to assign
5
RETURNED
NEW CS SIZE 5
NEW RS SIZE 1023
(5000, 494)
3610  points left to assign
5
RETURNED
NEW CS SIZE 6
NEW RS SIZE 1045
(5000, 494)
3565  points left to assign
6
RETURNED
NEW CS SIZE 6
NEW RS SIZE 1017
(5000, 494)
3681  points left to assign
6
RETURNED
NEW CS SIZE 6
NEW RS SIZE 1071
(5000, 494)
3995  points left to assign
6
RETURNED
NEW CS SIZE 6
NEW RS SIZE 1111
(5000, 494)
4048  points left to assign
6
RETURNED
NEW CS SIZE 6
NEW RS SIZE 1198
(5000, 494)
4162  poin

In [46]:
final_clusters_rdd = sc.parallelize(final_clusters)
final_centroids = final_clusters_rdd.map(cluster_centroid).collect()
final_stddev = final_clusters_rdd.map(cluster_std_deviation).collect()


In [91]:
def assign_points(filename, centroids, std_deviations, batchSize):
    with open(filename, mode="r") as file:
        csvFile = csv.reader(file)
        skip=0
        for line in csvFile:
            if skip==3:
                break
            skip+=1
        done=False
        results={}
        for i in range(len(centroids)):
            results[i] = []
        while not done:
        #for i in range(5):
            #print("ITERATION ", i)
            ids = []
            points = []
            #points = {}
            amount=0
            for line in csvFile:
                if amount<batchSize:
                    
                    #ids.append()    
                    #points[int(line[0])]
                    ids.append(int(line[0]))
                    p=np.array(line[1:]).astype(float)
                    points.append(p)
                    #p = np.delete(p, mask , 1)
                    #print(len(p))
                    amount+=1
                else:
                    break
            if amount < batchSize:
                done=True
            print(points)
            #for k in points:
            #    points[k] = np.delete(np.asarray(points[k]), mask, 1)
            #fpoints = [(p[0],np.delete(np.asarray(p[1]), mask, 1)) for p in points]  
            #print(fpoints)
            #print(points.shape)
            #print(mask)
            points = np.delete(points, mask, 1)
            dataMap = zip(ids, points)
            #print(points[0].shape)
            
            data = sc.parallelize(dataMap)
            print(data.take(2))
            batchResults = data.map(lambda a : final_closest_cluster(a[0],a[1], centroids, std_deviations)).map(lambda a: (a[1],a[0])).groupByKey().mapValues(list).collectAsMap()
            #done=True ########for testing
            for i in range(len(centroids)):
                results[i] = results[i] + batchResults[i]
    return results

In [92]:
res = assign_points("features.csv",final_centroids, final_stddev, 500 )

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



KeyError: 3

In [88]:
print(res)

[(0, [137, 138, 147, 151, 152, 156, 170, 172, 174, 175, 177, 246, 254, 256, 257, 273, 274, 276, 277, 281, 284, 285, 293, 294, 393, 394, 395, 399, 420, 427, 428, 429, 430, 431, 432, 433, 439, 441, 444, 446, 447, 449, 450, 452, 467, 474, 480, 550, 572, 574, 575, 578, 579, 580, 589, 606, 609, 640, 652, 692, 701, 712, 713, 721, 722, 729, 730, 731]), (1, [2, 5, 10, 20, 26, 30, 46, 48, 135, 136, 139, 140, 141, 142, 144, 145, 146, 148, 149, 150, 154, 155, 158, 159, 163, 164, 165, 166, 167, 171, 173, 176, 178, 182, 183, 184, 185, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 226, 236, 237, 238, 247, 248, 249, 250, 251, 252, 253, 255, 282, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 330, 331, 332, 333, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 36