### Implementation of Stream Clustering
### Dataset: 300 Features and 400 Data Samples

In [1]:
# Imports

import pandas as pd
import numpy as np
import random
import math
import sys
import matplotlib.pyplot as plt
from pprint import pprint
import seaborn as sns
from scipy.spatial import distance
from sklearn.datasets import (make_blobs,make_circles,make_moons)
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [95]:
# To get demographics of data
# Parse the workbook a single time: passing a list of sheet names makes
# read_excel return a dict {sheet name: DataFrame}, so the file is not opened
# and parsed once per sheet.
sheets = pd.read_excel('Shared for Clustering.xlsx', ["Training", "Validation"], header=None)
train_data = sheets["Training"]
validation_data = sheets["Validation"]
print("Train Data: ", train_data.shape)
print("Validation Data: ", validation_data.shape)
# Export header-less, index-less CSV copies; the later cells read these files
# instead of the workbook.
train_data.to_csv("Train.csv", index=False, header=False)
validation_data.to_csv("Validation.csv", index=False, header=False)

Train Data:  (400, 300)
Validation Data:  (60, 300)


In [96]:
# Stream Generator function
# Total 40 chunks, each of size 10

def generateStream(chunk_size = 10, path = 'Train.csv'):
    """Yield the rows of a header-less CSV file as consecutive NumPy arrays.

    Parameters
    ----------
    chunk_size : int, default 10
        Maximum number of rows per yielded array. The last array is shorter
        when the row count is not a multiple of ``chunk_size``.
    path : str, default 'Train.csv'
        CSV file to stream. The default keeps the original behaviour of
        reading the training export.

    Yields
    ------
    numpy.ndarray
        A 2-D array holding the next block of rows, in file order.
    """
    # chunksize makes read_csv return an iterator of DataFrames, so the file
    # is consumed lazily, one chunk per iteration.
    for chunk in pd.read_csv(path, header=None, chunksize=chunk_size):
        yield chunk.values
        

In [97]:
# Define My Clustering Function
from sklearn.metrics.pairwise import pairwise_distances
from numpy.random import choice
from numpy.random import seed
from pyclustering.cluster.kmeans import kmeans

def KMeans(data, k=2):
    """Cluster ``data`` with pyclustering's k-means from a seeded random start.

    NOTE: this function shares its name with ``sklearn.cluster.KMeans``, which
    the imports cell also brings in; once this definition runs, the name
    ``KMeans`` refers to this function.

    Parameters
    ----------
    data : array-like indexable as ``data[rows, :]``
        Points to cluster, one per row.
    k : int, default 2
        Number of initial centers, i.e. the requested number of clusters.

    Returns
    -------
    The value of ``get_clusters()`` from the fitted pyclustering model
    (presumably one list of row indices per cluster -- confirm against the
    pyclustering documentation).
    """
    # Re-seed NumPy's global generator so the same k rows are picked on every
    # call for inputs of the same length.
    seed(1)
    start_rows = choice(len(data), size=k, replace=False)
    initial_centers = data[start_rows, :]

    model = kmeans(data, initial_centers)
    model.process()
    return model.get_clusters()

In [111]:
def calculateCentroids(data, cluster, k):
    """Compute the mean point of each of the first ``k`` clusters.

    Parameters
    ----------
    data : numpy.ndarray
        Points, one per row.
    cluster : sequence
        ``cluster[i]`` holds the row indices of ``data`` that belong to
        cluster ``i``.
    k : int
        Number of clusters to summarise; ``cluster`` must have at least ``k``
        entries.

    Returns
    -------
    list
        ``k`` arrays, the column-wise mean of the rows of each cluster.
    """
    return [np.mean(data[cluster[idx]], axis=0) for idx in range(k)]

In [112]:
# Clustering on stream data
#
# Bottom-up, three-level summarisation of the stream:
#   level 1: every chunk is clustered into k groups and its k centroids are buffered
#   level 2: after every 5 chunks the buffered level-1 centroids are clustered
#            into k centroids, which are buffered in turn
#   level 3: after every 4 level-2 rounds the buffered level-2 centroids are
#            clustered into k centroids, which are buffered in turn
# Once the stream is exhausted, the level-3 buffer is clustered one last time
# to obtain `final_centroids`.
#
# NOTE(review): a buffer is only consumed when its counter hits 5 (level 1) or
# 4 (level 2); whatever is left in level1/level2 when the stream ends is never
# used. With 400 rows in chunks of 10 (40 chunks) nothing is left over.
# NOTE(review): centroids at every level are plain means of the points passed
# up from the level below; cluster sizes are not taken into account.

## Getting stream data
sdata = generateStream(10)  # <- Yields chunks of 10 rows (40 chunks for the 400-row training set)

# initializing other parameters
k = 2  # number of clusters requested at every level
level1=[]  # centroids of the chunks seen since the last level-2 round
count1 = 0  # chunks seen since the last level-2 round
level2 = []  # centroids produced by level-2 rounds since the last level-3 round
count2 = 0  # level-2 rounds since the last level-3 round
level3 = []  # centroids produced by every level-3 round
count3 = 0  # number of level-3 rounds; incremented below but never read in this cell
cluster_weightage = [0, 0]  # never read or updated in this cell

for data in sdata:
    # Level 1: summarise the current chunk by its k centroids.
    cluster = KMeans(data, k)
    centroids = calculateCentroids(data, cluster, k)
    for tmp in range(0, k):
        level1.append(centroids[tmp])
    count1 = count1 + 1
    
    if count1==5:
        # Level 2: 5 chunks buffered (5 * k centroids) -> reduce them to k centroids.
        data2 = np.array(level1)
        cluster2 = KMeans(data2, k)
        centroids2 = calculateCentroids(data2, cluster2, k)
        for tmp2 in range(0, k):
            level2.append(centroids2[tmp2])
        # Start a fresh level-1 buffer for the next 5 chunks.
        count1 = 0
        count2 = count2 + 1
        level1 = []
        
        if count2==4:
            # Level 3: 4 level-2 rounds buffered (4 * k centroids) -> reduce to k centroids.
            data3 = np.array(level2)
            cluster3 = KMeans(data3, k)
            centroids3 = calculateCentroids(data3, cluster3, k)
            for tmp3 in range(0, k):
                level3.append(centroids3[tmp3])
            # Start a fresh level-2 buffer for the next 4 rounds.
            count2 = 0
            count3 = count3 + 1
            level2 = []
            

# Generating final cluster
# Cluster all level-3 centroids once more; the k resulting centroids are the
# model used by the validation cells further down.
data4 = np.array(level3)
cluster4 = KMeans(data4, k)
centroids4 = calculateCentroids(data4, cluster4, k)
final_centroids = []
for tmp4 in range(0, k):
    final_centroids.append(centroids4[tmp4])
        
# print("Final Centroids: ")
# pprint(final_centroids)

In [113]:
from scipy.spatial import distance
def calculateDistance(final_centroids, data_row):
    """Return the 1-based label of the centroid nearest to ``data_row``.

    Despite its name, this function returns a cluster label, not a distance.
    Every centroid that is passed in is considered; the function does not
    depend on the notebook-level variable ``k``.

    Parameters
    ----------
    final_centroids : sequence of 1-D array-likes
        Candidate centroids, each with the same length as ``data_row``.
    data_row : 1-D array-like
        The point to classify.

    Returns
    -------
    int
        ``i + 1`` where ``i`` is the position of the centroid with the
        smallest Euclidean distance (the first one wins a tie). ``0`` when no
        centroid has a distance below ``sys.float_info.max``, e.g. when
        ``final_centroids`` is empty.
    """
    op = -1
    min_dist = sys.float_info.max
    for row, centroid in enumerate(final_centroids):
        d = distance.euclidean(data_row, centroid)
        # Strict comparison keeps the earliest centroid on equal distances.
        if d < min_dist:
            min_dist = d
            op = row

    # Labels are 1-based, so position 0 is reported as class 1.
    return op+1
            

In [118]:
# Validation

def validate(final_centroids):
    """Label every row of ``Validation.csv`` and print the labels.

    Reads the header-less file ``Validation.csv`` from the working directory,
    prints its shape, passes each row together with ``final_centroids`` to
    ``calculateDistance`` and prints the list of returned labels.

    Parameters
    ----------
    final_centroids : sequence
        Centroids forwarded unchanged to ``calculateDistance``.

    Returns
    -------
    None
        The results are only printed.
    """
    frame = pd.read_csv("Validation.csv", header=None)
    print(frame.shape)

    rows = frame.values
    output_class = [
        calculateDistance(final_centroids, rows[idx, :])
        for idx in range(frame.shape[0])
    ]

    print("Output Class for Validation Data:")
    print(output_class)

In [119]:
# Validate: label every row of Validation.csv with the final centroids and
# print the resulting labels.
validate(final_centroids)

(60, 300)
Output Class for Validation Data:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [66]:
# Create Predict Function

def predict(final_centroids):
    """Placeholder for the prediction step; not implemented yet.

    The original cell contained only the signature and an unfinished comment,
    which is a syntax error and stops the notebook from running top to bottom.
    The intended behaviour is not specified anywhere in this notebook, so the
    function fails loudly instead of guessing.

    Parameters
    ----------
    final_centroids : sequence
        Centroids the prediction is meant to be based on (currently unused).

    Raises
    ------
    NotImplementedError
        Always.
    """
    # TODO: decide what input this should take and implement it.
    raise NotImplementedError("predict() has not been implemented yet")
    

array([[1, 2],
       [3, 4],
       [5, 6]])
<class 'numpy.ndarray'>


In [106]:
# Scratch check: len() of an inner list of a ragged nested list
# (the second inner list has two elements).
a = [[2], [3, 4]]
print(len(a[1]))

2
