In [1]:
import os
import csv
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import numpy as np

In [2]:
def get_data(file_name = None):
    """Read a ';'-separated table holding one vector of float values per ego.

    The first row is a header. Its first cell is ignored; for every other
    cell the text after the last underscore is parsed as an int and used as
    the key of that column. Every following non-empty row holds an ego
    identifier followed by one float per header key.

    Identifiers that occur more than once are made unique by appending
    '_1', '_2', ... to the repeated occurrences (the first one keeps its
    name), so no row silently overwrites another.

    Parameters
    ----------
    file_name : str, optional
        Path of the file to read. When omitted (or falsy) the path
        '../Data/<FILE_NAME>.csv' is built from the module-level FILE_NAME.

    Returns
    -------
    egos : list of str
        Unique ego identifiers, in file order.
    typo_vector : dict
        Maps each identifier in ``egos`` to a dict {column key (int): float}.
    keys_typo : list of int
        Column keys, in header order.

    Raises
    ------
    StopIteration
        If the file is empty (no header row).
    IndexError
        If a data row has fewer values than the header has keys.
    ValueError
        If a header suffix is not an int or a value is not a float.
    """
    if not file_name:
        file_in = '../Data/%s.csv' % FILE_NAME
    else:
        file_in = file_name
    egos, typo_vector = [], {}
    # base identifier -> highest numeric suffix handed out so far
    nb_par_ego = {}
    with open(file_in, 'r') as to_read:
        csv_r = csv.reader(to_read, delimiter = ';')

        # The next() builtin works on Python 2.6+ as well as Python 3.
        first_line = next(csv_r)
        keys_typo = [int(x.split('_')[-1]) for x in first_line[1:]]

        for line in csv_r:
            if not line:
                # csv.reader yields [] for blank lines; nothing to load.
                continue
            base = line[0]
            ego = base
            # typo_vector holds exactly the identifiers stored so far, so the
            # dict lookup replaces a linear scan of the egos list.
            if ego in typo_vector:
                # The counter is keyed by the *base* name so that every
                # repetition gets a fresh suffix; the loop additionally skips
                # suffixed names that already appear literally in the file.
                suffix = nb_par_ego.get(base, 0)
                while ego in typo_vector:
                    suffix += 1
                    ego = '%s_%s' % (base, suffix)
                nb_par_ego[base] = suffix
            egos.append(ego)
            typo_vector[ego] = {key : float(line[i + 1]) for i, key in enumerate(keys_typo)}

    return egos, typo_vector, keys_typo


def myPCA(df):
    """Fit a PCA on ``df`` and export its principal axes.

    A PCA with ``n_components='mle'`` is fitted on ``df.values``. One row per
    principal axis is written to '<MAIN_DIR>/axes_pca.csv' (';'-separated):
    the explained variance of the axis in percent, rounded to 1 decimal,
    followed by the axis coefficients rounded to 2 decimals. The raw
    component matrix is also printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; its columns are the features the PCA is computed on.

    Returns
    -------
    pandas.DataFrame
        The transposed component matrix: one row per input feature, one
        column per principal axis. Rows are labelled 10, 11, 12, ... in
        feature order (presumably the graphlet ids; with 21 features this is
        the 10..30 range the original code hard-coded).
    """
    pca = PCA(n_components = 'mle')
    # Only the fitted attributes are needed; the projected data is not used.
    pca.fit(df.values)

    variance_ratio = pca.explained_variance_ratio_
    components = pca.components_

    with open('%s/axes_pca.csv' % MAIN_DIR, 'w') as to_write:
        csv_w = csv.writer(to_write, delimiter = ';')
        for ratio, axe in zip(variance_ratio, components):
            csv_w.writerow([round(100*ratio, 1)] + [round(x, 2) for x in axe])

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(components)

    # Size the row labels from the data instead of assuming 21 features, so
    # the constructor does not fail on inputs of another width.
    n_features = components.shape[1]
    return pd.DataFrame(components.T, index = range(10, 10 + n_features))


def graphlets_clusters(data):
    for k_value in range(4,7):
        smax = 0
        clusteringmax = []
        for j in range(100):
            kmeans = KMeans(n_clusters = k_value)
            kmeans.fit(data)
            s = silhouette_score(data, kmeans.labels_)
            if s > smax:
                smax = s
                clusteringmax = kmeans.labels_
        print smax
        for label in range(k_value):
            for i in range(10,31):
                if clusteringmax[i-10] == label:
                    print '\motif{%s}' % (i),
            if label < k_value -1:
                print ' - ',
        print
        #print kmeans.labels_
        #print type(kmeans.labels_)
        #print smax
        #print clusteringmax

In [33]:
# Run configuration: which corpus to load and the k used in its file name.
corpus = 'facebook'
k = 5

FILE_NAME = '%s_k%s' % (corpus, k)
MAIN_DIR = '../Results/%s' % FILE_NAME
# Single-argument print(...) gives the same output on Python 2 and 3.
print(MAIN_DIR)

# Create the output directory if needed. makedirs also creates a missing
# '../Results' parent, which listdir + mkdir could not cope with.
if not os.path.isdir(MAIN_DIR):
    os.makedirs(MAIN_DIR)

EGOS, TYPO_VECTOR, KEYS_TYPO = get_data()
data = TYPO_VECTOR

# Rescale every value: x is kept when x < 1 and replaced by 2 - 1/x
# otherwise, so values of 1 and above are mapped into [1, 2).
new_data = {}
for ego, values in data.items():
    new_data[ego] = {graphlet : values[graphlet] if values[graphlet] < 1 else \
                         2-1/values[graphlet] for graphlet in KEYS_TYPO}

# One row per ego, one column per graphlet; the transpose makes the graphlets
# the items that get clustered.
DATA_CALC = pd.DataFrame.from_dict(new_data, orient = 'index')
graphlets_clusters(DATA_CALC.T)
# Variant on the raw (unscaled) values:
#graphlets_clusters(pd.DataFrame.from_dict(data, orient = 'index').T)

../Results/facebook_k5
0.281095047459
\motif{11} \motif{15}  -  \motif{16} \motif{17} \motif{21} \motif{22} \motif{26}  -  \motif{18} \motif{19} \motif{20} \motif{23} \motif{24} \motif{25} \motif{27} \motif{28} \motif{29} \motif{30}  -  \motif{10} \motif{12} \motif{13} \motif{14}
0.264982552532
\motif{16} \motif{17} \motif{21} \motif{22} \motif{26}  -  \motif{14} \motif{18} \motif{19} \motif{20}  -  \motif{23} \motif{24} \motif{25} \motif{27} \motif{28} \motif{29} \motif{30}  -  \motif{11} \motif{15}  -  \motif{10} \motif{12} \motif{13}
0.284455492635
\motif{14} \motif{18} \motif{19} \motif{20}  -  \motif{11} \motif{15}  -  \motif{23} \motif{24} \motif{25} \motif{27} \motif{28} \motif{29} \motif{30}  -  \motif{16} \motif{17} \motif{21} \motif{22} \motif{26}  -  \motif{10} \motif{13}  -  \motif{12}


In [None]:
../Results/facebook_k5
0.281095047459
\motif{18} \motif{19} \motif{20} \motif{23} \motif{24} \motif{25} \motif{27} \motif{28} \motif{29} \motif{30}  -  \motif{10} \motif{12} \motif{13} \motif{14}  -  \motif{16} \motif{17} \motif{21} \motif{22} \motif{26}  -  \motif{11} \motif{15}  - 
0.264982552532
\motif{16} \motif{17} \motif{21} \motif{22} \motif{26}  -  \motif{10} \motif{12} \motif{13}  -  \motif{23} \motif{24} \motif{25} \motif{27} \motif{28} \motif{29} \motif{30}  -  \motif{11} \motif{15}  -  \motif{14} \motif{18} \motif{19} \motif{20}  - 
0.266193995491
\motif{11} \motif{15}  -  \motif{24} \motif{25} \motif{27} \motif{28} \motif{29}  -  \motif{16} \motif{17} \motif{21} \motif{22} \motif{26}  -  \motif{10} \motif{12} \motif{13} \motif{14}  -  \motif{30}  -  \motif{18} \motif{19} \motif{20} \motif{23}  - 
