In [1]:
import pandas as pd
import numpy as np
import random
import operator
import math
import matplotlib.pyplot as plt 
from scipy.stats import multivariate_normal

In [2]:
# Load the raw track table; the path is relative to the notebook's working directory.
df_full = pd.read_csv("Data/data.csv")

In [3]:
# Preview the first rows to check which columns the file provides.
df_full.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [4]:
# Every column name of the loaded frame (not referenced again in the cells visible here).
columns = list(df_full.columns)
# Numeric audio descriptors that form the feature space for clustering.
features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
# Keep only the feature columns; the clustering functions below read this global `df`.
df = df_full[features]
df.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,0.995,0.708,0.195,0.563,0.151,-12.428,0.0506,118.469,0.779
1,0.994,0.379,0.0135,0.901,0.0763,-28.454,0.0462,83.972,0.0767
2,0.604,0.749,0.22,0.0,0.119,-19.924,0.929,107.177,0.88
3,0.995,0.781,0.13,0.887,0.111,-14.734,0.0926,108.003,0.72
4,0.99,0.21,0.204,0.908,0.098,-16.829,0.0424,62.149,0.0693


In [5]:
# Number of Clusters
k = 10
# Maximum number of iterations
MAX_ITER = 100
# Number of data points (captured once here; re-run this cell if `df` changes)
n = len(df)
# Fuzzy parameter
m = 1.7 # Must be greater than 1: updateMembershipValue uses the exponent 2/(m-1), which is undefined at m == 1. Values close to 1 give nearly hard, k-means-like assignments.

In [6]:
def initializeMembershipMatrix(): # initializing the membership matrix
    """Build a random hard (one-hot) membership matrix.

    Reads the module-level ``n`` (number of data points) and ``k`` (number of
    clusters). For each point one cluster index is drawn with
    ``random.randint`` and that column is set to 1; all others stay 0.

    Returns:
        list[list[int]]: ``n`` rows of length ``k``, each containing a single 1.
    """
    rows = []
    for _ in range(n):
        chosen = random.randint(0, k - 1)
        rows.append([1 if column == chosen else 0 for column in range(k)])
    return rows

In [7]:
# Draw the starting memberships: one randomly chosen cluster per data point.
membership_mat = initializeMembershipMatrix()

In [17]:
def calculateClusterCenter(membership_mat):
    """Compute the fuzzy-weighted centroid of every cluster.

    For cluster ``j`` the center is ``sum_i(u_ij ** m * x_i) / sum_i(u_ij ** m)``,
    where ``u_ij`` is the membership of point ``i`` in cluster ``j``, ``x_i`` is
    row ``i`` of the module-level ``df`` and ``m`` is the module-level fuzzy
    parameter.

    The whole computation is a single weighted matrix product, so the data
    rows are extracted once instead of once per (cluster, point) pair.

    Args:
        membership_mat: 2-D sequence with one row per row of ``df`` and one
            column per cluster.

    Returns:
        list[list[float]]: one center (a list with one value per column of
        ``df``) per column of ``membership_mat``. A cluster whose weights sum
        to zero yields NaN coordinates, because its center is undefined.

    Raises:
        ValueError: if ``membership_mat`` is not 2-D or its row count differs
            from the number of rows in ``df``.
    """
    weights = np.asarray(membership_mat, dtype=float)
    points = df.to_numpy(dtype=float)
    if weights.ndim != 2 or weights.shape[0] != points.shape[0]:
        raise ValueError(
            "membership_mat must be 2-D with one row per data point "
            "(got shape %s for %d data points)" % (weights.shape, points.shape[0])
        )

    weights = weights ** m
    # (clusters x points) @ (points x features) -> (clusters x features)
    numerators = weights.T @ points
    denominators = weights.sum(axis=0)
    # An empty cluster gives 0/0; let it become NaN without a runtime warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        centers = numerators / denominators[:, np.newaxis]
    return centers.tolist()

In [18]:
# Centers implied by the random starting memberships.
cluster_centers = calculateClusterCenter(membership_mat)

In [10]:
# Display the starting centers: one list of per-feature values for each cluster.
cluster_centers

[[0.4888738785235016,
  0.5398564044810615,
  0.49198192279058844,
  0.1593861295625639,
  0.20815308517574574,
  -11.279581174796961,
  0.09210138699543573,
  116.43712251793006,
  0.5324571951870067],
 [0.49453880188557436,
  0.5368927205070049,
  0.486496698342924,
  0.16346735548404076,
  0.20534862492005354,
  -11.392864934007807,
  0.09304563637420797,
  116.88931949531926,
  0.5309076725390997],
 [0.4887527416397926,
  0.5376603606831486,
  0.49025350135554885,
  0.16220294617102643,
  0.20624992236952056,
  -11.415709005135483,
  0.09290981726979565,
  116.93659273856422,
  0.5322913985429358],
 [0.4949962671463926,
  0.5372795118212349,
  0.4873267117740676,
  0.1612280645928902,
  0.20862172041742869,
  -11.366008018395156,
  0.09275411827132876,
  116.76985519721715,
  0.5298975135310425],
 [0.4957845698480076,
  0.5378873245156036,
  0.48783799033530495,
  0.16481130645028474,
  0.20675175020303974,
  -11.359398712147547,
  0.09480116022740408,
  117.3399993038637,
  0.5333

In [11]:
def updateMembershipValue(membership_mat, cluster_centers):
    """Recompute every point's membership in every cluster, in place.

    Uses the fuzzy c-means update
    ``u_ij = 1 / sum_c (d_ij / d_ic) ** (2 / (m - 1))`` where ``d_ij`` is the
    Euclidean distance between row ``i`` of the module-level ``df`` and
    center ``j``, and ``m`` is the module-level fuzzy parameter.

    A point that lies exactly on one or more centers would make the formula
    evaluate 0/0 (NaN). Such a point instead receives its full membership
    split equally among the centers it coincides with, and 0 elsewhere.

    Args:
        membership_mat: list of per-point membership rows; each row is
            overwritten with the new values.
        cluster_centers: sequence of cluster centers, each with one value per
            column of ``df``.

    Returns:
        The same ``membership_mat`` object, updated.
    """
    p = float(2/(m-1))
    points = df.to_numpy(dtype=float)
    centers = np.asarray(cluster_centers, dtype=float)

    # One distance column per cluster keeps temporaries at (points x features).
    distances = np.column_stack(
        [np.linalg.norm(points - center, axis=1) for center in centers]
    )

    zero_mask = distances == 0
    on_center = zero_mask.any(axis=1)

    # Dividing each row by its smallest distance leaves the ratios d_ij / d_ic
    # unchanged but keeps the negative power from overflowing for tiny distances.
    row_min = distances.min(axis=1, keepdims=True)
    scale = np.where(on_center[:, np.newaxis], 1.0, row_min)
    with np.errstate(divide="ignore", invalid="ignore"):
        inverse = (distances / scale) ** (-p)
        updated = inverse / inverse.sum(axis=1, keepdims=True)

    # Zero-distance rows: full membership shared by the coinciding centers.
    hits = zero_mask[on_center]
    updated[on_center] = hits / hits.sum(axis=1, keepdims=True)

    # Write back into the caller's rows so the in-place contract is preserved.
    for row, new_row in zip(membership_mat, updated.tolist()):
        row[:len(new_row)] = new_row
    return membership_mat

In [12]:
def getClusters(membership_mat): # getting the clusters
    """Turn fuzzy memberships into one hard label per data point.

    For each of the first ``n`` rows (``n`` is the module-level point count)
    the column with the largest membership is chosen; when several columns
    tie, the highest column index wins.

    Returns:
        list[int]: the chosen cluster index for every point.
    """
    return [
        max(enumerate(membership_mat[point]), key=lambda pair: (pair[1], pair[0]))[0]
        for point in range(n)
    ]

In [25]:
def fuzzyCMeansClustering(tol=1e-6):
    """Run fuzzy c-means from a random start and persist the result.

    Alternates between computing cluster centers and updating memberships
    until the centers stop moving or ``MAX_ITER`` iterations have run.

    Convergence is tested with an absolute tolerance rather than exact
    float equality, which practically never holds and forced every run to
    use all ``MAX_ITER`` iterations. Hard labels are derived once after the
    loop, so they are defined even when the loop body never executes
    (``MAX_ITER == 0``).

    Args:
        tol: largest absolute change in any center coordinate between two
            consecutive iterations that still counts as "converged".

    Returns:
        tuple: ``(cluster_labels, cluster_centers)`` - the hard label of every
        data point and the last centers that were computed.

    Side effects:
        Prints the iteration counter and the final partition matrix, and
        writes ``output.csv`` (partition matrix) and ``labels.csv`` (hard
        labels) to the current working directory.
    """
    # Membership Matrix
    membership_mat = initializeMembershipMatrix()
    cluster_centers = None
    prev_centers = None
    curr = 0
    while curr < MAX_ITER:
        print(curr)
        cluster_centers = calculateClusterCenter(membership_mat)
        if prev_centers is not None and np.allclose(
            cluster_centers, prev_centers, rtol=0.0, atol=tol
        ):
            break
        prev_centers = cluster_centers
        membership_mat = updateMembershipValue(membership_mat, cluster_centers)
        curr += 1

    # No iteration ran: still report centers consistent with the memberships.
    if cluster_centers is None:
        cluster_centers = calculateClusterCenter(membership_mat)
    cluster_labels = getClusters(membership_mat)

    print("---------------------------")
    print("Partition matrix:")
    np_matrix = np.array(membership_mat)
    print(np_matrix)

    matrix_df = pd.DataFrame(data=np_matrix)
    matrix_df.to_csv("output.csv", index=False, header=False)
    clusters_df = pd.DataFrame(data=np.array(cluster_labels))
    clusters_df.to_csv("labels.csv", index=False, header=False)

    return cluster_labels, cluster_centers

In [26]:
# Run the clustering end to end; this also writes output.csv and labels.csv.
# NOTE(review): the traceback stored under this cell complains that `np_matrix` is
# unbound, which the definition above cannot produce (it assigns `np_matrix` before
# printing it) - the saved output appears to come from an earlier version of the
# function. Restart the kernel and run all cells to refresh it.
fuzzyCMeansClustering()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---------------------------
Partition matrix:


UnboundLocalError: local variable 'np_matrix' referenced before assignment