In [None]:
"""Implementation of k-means clustering algorithm.
These functions are designed to work with Cartesian data points.
"""
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
def convert_to_2d_array(points):
    """
    Returns `points` as a numpy array, promoting a flat 1-D input to a
    single-column 2-D array. Inputs of any other rank are returned as the
    plain `np.array` copy, unchanged in shape.
    """
    arr = np.array(points)
    is_flat = arr.ndim == 1
    # A flat sequence of n values becomes an (n, 1) column.
    return arr[:, np.newaxis] if is_flat else arr
def visualize_clusters(clusters):
    """
    Draws a 2-D scatter plot of the first two coordinates of every cluster.

    Each cluster is drawn as its own series of round markers on one shared
    figure. Single-column data gets a zero-filled second coordinate.
    """
    plt.figure()
    for members in clusters:
        coords = convert_to_2d_array(members)
        n_dims = coords.shape[1]
        if n_dims < 2:
            # Pad with a zero column so there is something to use as y.
            coords = np.hstack([coords, np.zeros_like(coords)])
        xs, ys = coords[:, 0], coords[:, 1]
        plt.plot(xs, ys, 'o')
    plt.show()

def DTW(points,centroid):
    """
    Computes the DTW distance from every entry of `points` to `centroid`.

    Returns a 1-D float array holding one distance per entry, in the same
    order as `points`.
    """
    distances = np.zeros((len(points),))
    for idx, series in enumerate(points):
        distances[idx] = DTWDistance(series, centroid)
    return distances

def DTWDistance(s1, s2, w=None):
    """
    Calculates the dynamic time warping (DTW) distance between two
    sequences of scalars, using the squared difference as the local cost
    and returning the square root of the accumulated cost.

    Parameters:
        s1, s2: indexable sequences of numbers (they may differ in length).
        w: optional locality constraint. When given, element `i` of `s1`
           may only be aligned with elements `j` of `s2` such that
           `|i - j| <= w`. The window is widened to at least
           `abs(len(s1) - len(s2))` so that the two end points can always
           be connected. `None` (the default) means no constraint.

    Returns:
        The DTW distance as a float: `0.0` for two empty sequences and
        `inf` when exactly one of the sequences is empty.
    """
    n, m = len(s1), len(s2)
    if w is not None:
        w = max(w, abs(n - m))

    # cost[i, j] is the cheapest accumulated cost of aligning the first i
    # elements of s1 with the first j elements of s2. Row/column 0 is the
    # "nothing consumed yet" border, and cells outside the window stay at inf.
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0

    for i in range(1, n + 1):
        if w is None:
            j_first, j_last = 1, m
        else:
            # Inclusive on both sides so the band is symmetric around the
            # diagonal; an exclusive upper bound would leave cost[n, m]
            # unreachable whenever s2 is longer than s1.
            j_first, j_last = max(1, i - w), min(m, i + w)
        for j in range(j_first, j_last + 1):
            dist = (s1[i - 1] - s2[j - 1]) ** 2
            cost[i, j] = dist + min(cost[i - 1, j],
                                    cost[i, j - 1],
                                    cost[i - 1, j - 1])
    return np.sqrt(cost[n, m])

def SSE(points):
    """
    Calculates the sum of squared errors for the given list of data points.

    The error of a point is its Euclidean distance to the centroid (the
    mean along axis 0) of `points`; the result is the sum of the squares
    of those distances.
    """
    points = convert_to_2d_array(points)
    centroid = np.mean(points, 0)
    residuals = points - centroid
    # Square the residuals: summing plain Euclidean norms would give a sum
    # of absolute errors, not the SSE this function is named for.
    return np.sum(residuals ** 2)




def kmeans(points, k=2, epochs=1, max_iter=5, verbose=False):
    """
    Clusters the list of points into `k` clusters using k-means clustering
    algorithm. Points are assigned to the centroid with the smallest DTW
    distance; centroids are updated to the mean of their members.

    Parameters:
        points: data points; converted with `convert_to_2d_array`.
        k: number of clusters.
        epochs: number of restarts. The initial centroids are always the
            first `k` points (no shuffling), so every epoch currently
            starts from the same state.
        max_iter: maximum number of assign/update iterations per epoch.
        verbose: when true, print progress and the SSE of every iteration.

    Returns:
        A list of length `k` with the clustering that had the lowest total
        SSE. Each entry is a 2-D array of member points, or `None` for a
        cluster that received no points.

    Raises:
        AssertionError: if there are fewer than `k` points.
        ValueError: if `epochs` or `max_iter` is smaller than 1.
    """
    points = convert_to_2d_array(points)
    assert len(points) >= k, "Number of data points can't be less than k"
    if epochs < 1 or max_iter < 1:
        raise ValueError("epochs and max_iter must both be at least 1")

    best_clusters = None
    best_sse = np.inf
    for ep in range(epochs):
        if verbose:
            print("epochs", ep)
        # Deterministic initialisation from the first k points.
        centroids = list(points[:k])
        last_sse = np.inf
        for it in range(max_iter):
            if verbose:
                print("iter", it)
            # Cluster assignment: gather members in lists and stack once,
            # instead of re-allocating the cluster array for every point.
            members = [[] for _ in range(k)]
            for p in points:
                nearest = int(np.argmin(DTW(centroids, p)))
                members[nearest].append(p)
            clusters = [np.vstack(m) if m else None for m in members]

            # Centroid update. An empty cluster keeps its previous centroid
            # so that the number of centroids stays at k and the cluster
            # can pick up points again in a later iteration.
            centroids = [
                previous if c is None else np.mean(c, 0)
                for c, previous in zip(clusters, centroids)
            ]

            # SSE calculation over the non-empty clusters.
            sse = np.sum([SSE(c) for c in clusters if c is not None])
            gain = last_sse - sse
            if verbose:
                print((f'Epoch: {ep:3d}, Iter: {it:4d}, '
                       f'SSE: {sse:12.4f}, Gain: {gain:12.4f}'))

            # Check for improvement. The very first result is always kept
            # so that something is returned even if the SSE is NaN.
            if best_clusters is None or sse < best_sse:
                best_clusters, best_sse = clusters, sse

            # Epoch termination condition
            if np.isclose(gain, 0, atol=0.00001):
                break
            last_sse = sse
    return best_clusters
def bisecting_kmeans(points, k, epochs=10, max_iter=100, verbose=False):
    """
    Clusters the list of points into `k` clusters using bisecting k-means
    clustering algorithm. Internally, it uses the standard k-means with k=2 in
    each iteration, always splitting the cluster with the largest SSE.

    Parameters:
        points: data points; converted with `convert_to_2d_array`.
        k: desired number of clusters.
        epochs, max_iter, verbose: forwarded to `kmeans` for every split.

    Returns:
        A list of 2-D arrays, one per cluster. The list has fewer than `k`
        entries when the data cannot be split any further (for example
        when there are fewer than `k` points, or when the cluster chosen
        for splitting cannot be separated into two non-empty halves).
    """
    points = convert_to_2d_array(points)
    clusters = [points]
    while len(clusters) < k:
        # kmeans needs at least two points to produce two clusters.
        splittable = [i for i, c in enumerate(clusters) if len(c) >= 2]
        if not splittable:
            break
        scores = [SSE(clusters[i]) for i in splittable]
        max_sse_i = splittable[int(np.argmax(scores))]
        cluster = clusters.pop(max_sse_i)
        if verbose:
            print("len", len(clusters))
        two_clusters = kmeans(cluster, 2, epochs=epochs, max_iter=max_iter, verbose=verbose)
        # kmeans reports an empty cluster as None; never store those.
        halves = [c for c in two_clusters if c is not None]
        if len(halves) < 2:
            # The worst cluster could not be split: put it back and stop
            # rather than loop forever or hand None to the caller.
            clusters.append(cluster)
            break
        clusters.extend(halves)
    return clusters


In [None]:
import os
from numpy import nan
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
# Counts the directories yielded by os.walk in the loop below; the value 0
# marks the first yielded directory, which that loop skips.
i=0
# NOTE(review): `ccc` is not referenced anywhere in the visible code.
ccc=0
def normalise(X):
    """
    Min-max scales every column of the 2-D array `X` to the range [0, 1].

    The scaling is done in place and `X` is also returned. A column whose
    values are all equal has no range to scale by; it is set to 0 instead
    of dividing by zero.
    """
    col_min = X.min(axis=0)
    col_range = X.max(axis=0) - col_min
    # Replace a zero range by 1 so constant columns become 0 rather than
    # NaN/inf (value - min is already 0 for every entry of such a column).
    safe_range = np.where(col_range == 0, 1, col_range)
    X[:] = (X - col_min) / safe_range
    return X

# Per-directory results, appended in os.walk order:
#   user_data                -> {file index: 0/1 "understood" label}
#   user_level_understanding -> {file index: attentive fraction of the file}
user_data = []
user_level_understanding = []
# Walk the data set. The first directory yielded by os.walk (the root itself)
# is skipped; every other directory is reported as one "user" and every file
# in it as one "video" (presumably one sub-directory per user -- TODO confirm).
for subdir, dirs, files in os.walk('DataSet_name'):
    print("user",i)
    j=0
    if i==0:
        i+=1
        continue
    labels = {}
    levels_und = {}
    attentive_sec = 0
    for file in files:
        print("video ",j)
        filepath = os.path.join(subdir, file)
        # Load the comma-separated file as a numeric array.
        X = np.genfromtxt(filepath, delimiter=',')#,skip_header=None)
        
        # Column-wise min-max scaling.
        X = normalise(X)
       
        y=0 # 0- not understood, 1 - understood
        # Silhouette coefficient to find the optimal k value: fit sklearn's
        # KMeans for each candidate k and keep the k with the highest score.
        range_n_clusters = [2, 3, 4, 5, 6]
        optimalK = -1
        maxS = -1
        for n_clusters in range_n_clusters:

            clusterer = KMeans(n_clusters=n_clusters, random_state=10)
            cluster_labels = clusterer.fit_predict(X)
            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters =", n_clusters,
                  "The average silhouette_score is :", silhouette_avg)

            if(silhouette_avg>maxS):
                maxS=silhouette_avg
                optimalK=n_clusters

        # Cluster the rows with the DTW-based bisecting k-means defined in
        # this file (plain `kmeans` is the alternative).
        #algorithm = kmeans
        algorithm = bisecting_kmeans
        k = optimalK
        verbose = False
        max_iter = 3
        epochs = 1
        clusters = algorithm(X, optimalK, verbose=verbose, max_iter=max_iter, epochs=epochs)
        
        
        # Mean of every non-empty cluster.
        # NOTE(review): `centroids` is not read again in the visible code.
        centroids=[]
        for c in clusters:
            if c is not None:
                centroids.append(np.mean(c,0))
        # Number of rows in the file; captured here because X is rebound to
        # the individual clusters in the loop below.
        duration = len(X)      
        print("Duration of the video:",len(X),duration)
        print("no.of clusters::",len(clusters))
        print("sizes of clusters:")
        total=0
        # Number of rows that belong to clusters classified as attentive.
        attentive_sec=0
        for cluster in clusters:
            points = convert_to_2d_array(cluster)
            
            # NOTE: this rebinds X from "the whole file" to "this cluster".
            X = points
            fs = 512  # Sampling rate (512 Hz)

            # Get real amplitudes of FFT (only in positive frequencies).
            # rfft returns only the non-negative-frequency half of the
            # spectrum (the other half is its complex conjugate for real
            # input) and transforms along the LAST axis by default.
            # NOTE(review): fft_freq below is built from len(X) (the number
            # of rows) and fft_vals[freq_ix] indexes the FIRST axis, so the
            # frequency axis and the FFT axis do not line up for 2-D input
            # -- confirm which axis is time and make the two agree.
            fft_vals = np.absolute(np.fft.rfft(X))

            # Get frequencies for amplitudes in Hz: window length len(X) and
            # sample spacing 1/fs (the inverse of the sampling rate).
            fft_freq = np.fft.rfftfreq(len(X), 1.0 / fs)

            # Define EEG bands as (low, high) limits in Hz.
            eeg_bands = {'Delta': (0, 4),
                        'Theta': (4, 8),
                        'Alpha': (8, 12),
                        'Beta': (12, 30),
                        'Gamma': (30, 45)}

            # Take the mean of the fft amplitude for each EEG band. Both
            # limits are inclusive, so a frequency exactly on a boundary is
            # counted in two adjacent bands.
            eeg_band_fft5 = dict()
            for band in eeg_bands:
                # Indices into fft_freq whose frequency lies inside the band.
                freq_ix = np.where((fft_freq >= eeg_bands[band][0]) &
                            (fft_freq <= eeg_bands[band][1]))[0]
                # np.mean of an empty selection is nan; handled below for
                # Beta and Alpha.
                eeg_band_fft5[band] = np.mean(fft_vals[freq_ix])
    
            # Report the band means before and after the nan replacement.
            print(".................ans.........................................")
            print("Beta",eeg_band_fft5['Beta'])
            print("Alpha",eeg_band_fft5['Alpha'])
            print("Theta",eeg_band_fft5['Theta'])
            
            # Replace a nan band mean by 1 so the Beta/Alpha ratio below is
            # defined. Theta is only printed, so it is left as is.
            if np.isnan(eeg_band_fft5['Beta']):
                eeg_band_fft5['Beta'] =1 
            if np.isnan(eeg_band_fft5['Alpha']) :
                eeg_band_fft5['Alpha'] =1 
            print("Beta",eeg_band_fft5['Beta'])
            print("Alpha",eeg_band_fft5['Alpha'])
            print("Theta",eeg_band_fft5['Theta'])    
            print("....................ans......................................")
            # Despite the "File_name" label, the value printed is the
            # Beta/Alpha ratio; the "duration" printed next is the size of
            # this cluster, because X was rebound above.
            print("File_name",(eeg_band_fft5['Beta']/eeg_band_fft5['Alpha']))
            print("Duration of the video:",len(X))
            print("..........................................................")
            # NOTE(review): `attention_value` is not read again in the
            # visible code; the ratio is recomputed in the condition.
            attention_value= (eeg_band_fft5['Beta']/eeg_band_fft5['Alpha'])
            # A cluster whose Beta/Alpha ratio exceeds 1 counts as attentive
            # and contributes all of its rows.
            if ((eeg_band_fft5['Beta']/eeg_band_fft5['Alpha'])>1): 
                attentive_sec+= len(points)
                #total = total + attention_value * len(points)
        print("attentive_sec",attentive_sec)
               
        # The file is labelled "understood" when at least 3/5 of its rows
        # fall in attentive clusters.
        if (attentive_sec >= (duration*(3/5))):
            y=1
        labels[j] = y
        print(duration)
        # Attentive fraction of the file.
        level_und = attentive_sec/duration
        print(level_und)
        levels_und[j] = level_und
        visualize_clusters(clusters)     

        j+=1
    i=i+1
    user_data.append(labels)
    user_level_understanding.append(levels_und)

# Report the per-file labels of every user (numbered from 1), followed by
# the per-file understanding levels of all users.
u = 1
for per_video_labels in user_data:
    print(f"student {u}")
    print(per_video_labels)
    u += 1
print("understanding level")
print(user_level_understanding)

In [None]:
import numpy as np
# Write `with_cluster_B_1sec` to a comma-separated file.
# NOTE(review): `with_cluster_B_1sec` is not defined in any of the visible
# cells; presumably it comes from earlier notebook state -- confirm before
# running this cell on its own.
np.savetxt('All_with_cluster_B_1sec.csv', with_cluster_B_1sec, delimiter=",")