In [26]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_distances
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import silhouette_score
from collections import defaultdict
import mnist_reader
from collections import Counter

In [2]:
def dbscan(data, eps, minpts=3,euclidean=True):
    labels=[False for i in range(len(data))]
    clusterIndex=1
    
    if(euclidean):
        mat = euclidean_distances(data)
    else:
        mat = cosine_distances(data)
        
    for p in range(0,len(data)):                                          # Iterate over every point
        if p%1000==0: print p
        if not labels[p]:                                                 # Skip processed points
            Neighbors = np.ix_(mat[p]<eps)[0].tolist()                    # Find initial neighbors
            
            if(len(Neighbors)<minpts):                                    # Non-core points are noise
                labels[p]=-1
            else:                                  
                labels[p] = clusterIndex
                i=0
                while i < len(Neighbors):
                    if labels[Neighbors[i]] == -1:
                        labels[Neighbors[i]] = clusterIndex
                    elif not labels[Neighbors[i]]:
                        labels[Neighbors[i]] = clusterIndex
                        ExtraNeighbors = np.ix_(mat[Neighbors[i]]<eps)[0].tolist()    # Expand neighborhood
                        if(len(ExtraNeighbors)>=minpts):
                            Neighbors = np.append(Neighbors,ExtraNeighbors)
                    i=i+1
                    
                clusterIndex +=1                                                  # Start a new cluster

    return labels

In [3]:
def performance_calculator(cluster,categories):
    confusion_matrix=[]
    for i in cluster:
        countDict=defaultdict(int)
        for j in cluster[i]:
            countDict[j]+=1
    
        confusion_matrix.append([countDict[i] for i in range(categories)])
    
    
    numerator = 0
    denominator = 0
    for i in confusion_matrix:
        numerator+=max(i)
        denominator+=sum(i)
    
    purity = float(numerator)/float(denominator)
    
    gini_indexes=[]
    total=0
    for i in confusion_matrix:
        gi=1
        for j in i:
            if(sum(i)==0 and j==0):
                gi=0
            else:
                gi-=((float(j)/float(sum(i)))**2)
        gini_indexes.append(gi*sum(i))
        total+=sum(i)
        
    gini = float(sum(gini_indexes))/float(total)
    
    return (purity,gini)

In [4]:
def form_clusters(predlabel,origlabel):
    cluster=defaultdict(list)
    for i in range(len(predlabel)):
        cluster[predlabel[i]].append(origlabel[i])
    return cluster

In [5]:
def merge_clusters(cluster):
    newCluster = defaultdict(list)
    for i in cluster:
        if(len(cluster[i])>0):
            lab,_ = Counter(cluster[i]).most_common()[0]
            newCluster[lab] = newCluster[lab] + cluster[i]
    return newCluster

In [60]:
def evaluate_dbscan(data,labels,eps,minpts=3,euclidean=True, sil = False):
   
    y_pred = dbscan(data,eps,minpts,euclidean)
    
    if(sil):
        s = silhouette_score(data,y_pred)
        cluster=form_clusters(y_pred,np.array(data))
        print "Silhouette:",s
        return cluster
    else:
        cluster=form_clusters(y_pred,labels)

        noise = float(len(cluster[-1]))/float(len(data))
        del cluster[-1]
    
        p,g = performance_calculator(cluster,20)
        p,g = performance_calculator(merge_clusters(cluster),20)
        print "Noise: ",noise," Purity: ",p," Gini: ",g

# NG

In [6]:
ng = fetch_20newsgroups(subset='all')
ng_X = ng.data
ng_y = ng.target

del(ng)
count_vect = CountVectorizer(stop_words="english",min_df=3,max_df=0.5)
ng_X = count_vect.fit_transform(ng_X)

tfidf_transformer = TfidfTransformer(use_idf=True)
ng_X = tfidf_transformer.fit_transform(ng_X)

ng_X = np.asarray(ng_X.todense())

In [12]:
evaluate_dbscan(ng_X,ng_y,1.1)

Noise:  0.315504616364  Purity:  0.384418604651  Gini:  0.674553341888


In [13]:
evaluate_dbscan(ng_X,ng_y,1.2)

Noise:  0.117637695002  Purity:  0.105538517049  Gini:  0.90968598542


In [21]:
del (ng_X)
del (ng_y)

# FASHION

In [95]:
X_train, y_train = mnist_reader.load_mnist('/Users/sasankauppu/DataMining/Assignment3/data/', kind='train')
X_test, y_test = mnist_reader.load_mnist('/Users/sasankauppu/DataMining/Assignment3/data/', kind='t10k')

fashion_X = np.concatenate((X_train,X_test))
fashion_y = np.concatenate((y_train,y_test))

del(X_train)
del(X_test)
del(y_train)
del(y_test)

In [96]:
def sample_data(data,label):
    dfn=pd.DataFrame(data)
    dfn["class"]=label
    dfarr=[]
    for i in range(10):
        dfarr.append(dfn[dfn["class"]==i].sample(2000))

    dfn = pd.concat(dfarr)
    
    y_actual = dfn["class"]
    del dfn["class"]
    
    return dfn,y_actual

In [97]:
sf_X,sf_y = sample_data(fashion_X,fashion_y)

In [98]:
evaluate_dbscan(sf_X,np.array(sf_y),0.06,3,False)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
Noise:  0.3479  Purity:  0.268747124674  Gini:  0.774375048463


In [13]:
evaluate_dbscan(sf_X,np.array(sf_y),0.09,3,False)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
Noise:  0.2206  Purity:  0.129907621247  Gini:  0.88344157119


In [15]:
evaluate_dbscan(sf_X,np.array(sf_y),0.1,3,False)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
Noise:  0.19165  Purity:  0.125997402115  Gini:  0.884499928277


In [18]:
del (sf_X)
del (sf_y)
del (fashion_X)
del (fashion_y)

# Household

In [68]:
df = pd.read_csv("/Users/sasankauppu/Desktop/Data Mining CS6220/DataMining/household_power_consumption.txt",delimiter=';')

del df["Date"]
del df["Time"]

df = df.drop(df.index[np.where(np.any(np.isnan(df.convert_objects(convert_numeric=True)),axis=1))])

  


In [69]:
df = df.sample(int(0.01*len(df)))

In [70]:
df = preprocessing.scale(df, with_mean=True, with_std=True, copy=True)

In [91]:
ct=evaluate_dbscan(df,[],0.6,3,True,True)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
Silhouette: 0.155422431895


In [94]:
print "Clusters:",len(ct)

Clusters: 60


In [99]:
ct=evaluate_dbscan(df,[],0.9,3,True,True)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
Silhouette: 0.281242216427


In [100]:
print "Clusters:",len(ct)

Clusters: 32
