In [1]:
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import csv
import pandas
from os import mkdir, path

In [3]:
import numpy as np
import random

#Bauckhage C. Numpy/scipy Recipes for Data Science: k-Medoids Clustering[R]. Technical Report, University of Bonn, 2015.

def kMedoids(D, k, tmax=10000):
    # determine dimensions of distance matrix D
    m, n = D.shape

    if k > n:
        raise Exception('too many medoids')
    # randomly initialize an array of k medoid indices
    M = np.arange(n)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # create a copy of the array of medoid indices
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    for t in xrange(tmax):
        # determine clusters, i. e. arrays of data indices
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]
        # update cluster medoids
        for kappa in range(k):
            J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
            j = np.argmin(J)
            Mnew[kappa] = C[kappa][j]
        np.sort(Mnew)
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # final update of cluster memberships
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]

    # return results
    return M, C

In [2]:
def get_data(file_name = None):
    file_in = '../Data/%s.csv' % file_name
    egos, typo_vector = [], {}
    nb_par_ego = {}
    with open(file_in, 'r') as to_read:
        csv_r = csv.reader(to_read, delimiter = ';')
        
        first_line = csv_r.next()
        keys_typo = [int(x.split('_')[-1]) for x in first_line[1:len(first_line)]]
        
        for line in csv_r:
            ego = line[0]
            if ego in egos:
                ego = ego + '_%s' % (nb_par_ego.get(ego, 0) + 1)
                nb_par_ego[ego] = 1 + nb_par_ego.get(ego, 0)
            egos.append(ego)
            typo_vector[ego] = {int(keys_typo[i]) : float(line[i+1]) for i in range(len(keys_typo))}
                
    return egos, typo_vector, keys_typo

def get_overall_representativity():
    s = {x : 0 for x in INDICS} 
    
    for ego in EGOS:    
        for graphlet in DATA_MOTIFS[ego]:
            s[graphlet] += int(DATA_MOTIFS[ego][graphlet])
    
    all_graphlets = sum(s.values())
    return {x : s[x]/float(all_graphlets) for x in s}


def get_classe_representativity(list_of_egos):
    s = {x : 0 for x in INDICS}    
    for ego in list_of_egos:
        for graphlet in DATA_MOTIFS[ego]:
            s[graphlet] += int(DATA_MOTIFS[ego][graphlet])

    all_graphlets = float(sum(s.values()))
    local_representativities = {x : s[x]/all_graphlets for x in s}
    
    return {x : local_representativities[x] / float(OVERALL_REPR[x]) \
                if local_representativities[x] / float(OVERALL_REPR[x]) < 1
                else
                    2 - 1 / (local_representativities[x] / float(OVERALL_REPR[x]))
                for x in local_representativities}

In [5]:
metric = 'gcd58'
metric = 'gcd15'
metric = 'rgf'
metric = 'gdda'
metric = 'gcd11'
metric = 'gcd73'

EGOS, DATA_MOTIFS, INDICS = get_data(file_name = 'motifs_facebook_k5')

data = pandas.read_csv('../Data/Yaveroglu_indics/%s.txt' % metric, sep = '\t', index_col = 0)

#CSA restriction

EGOS, DATA_MOTIFS, INDICS = get_data(file_name = 'motifs_facebook_csa_k5')
corpus_per_ego = {}
with open('../Data/corpus_per_ego.csv', 'r') as to_read:
    csv_r = csv.reader(to_read, delimiter = ';')
    for line in csv_r:
        corpus_per_ego[line[0]] = line[1]
        
list_csa = [ego for ego in corpus_per_ego if corpus_per_ego[ego] == 'csa']

data_csa = pandas.DataFrame(index = ['../Data/Yaveroglu_files/%s' % ego for ego in list_csa],
                           columns = ['../Data/Yaveroglu_files/%s' % ego for ego in list_csa])
for ego1 in list_csa:
    df_ego1 = '../Data/Yaveroglu_files/%s' % ego1
    for ego2 in list_csa:
        df_ego2 = '../Data/Yaveroglu_files/%s' % ego2
        data_csa[df_ego1][df_ego2] = data[df_ego1][df_ego2]
        
#data = data_csa
#metric = '%s_csa' % metric

#FIN CSA restriction
    

#this_range = range(2, 8)
this_range = [5]

for nb_clusters in this_range:
    ac = AgglomerativeClustering(n_clusters = nb_clusters)
    clusters = ac.fit(data)
    
matrix_clusters = []
for i in range(nb_clusters):
    matrix_clusters.append([])

i = 0
for label in clusters.labels_:
    matrix_clusters[label].append(data.index[i].split('/')[-1])
    i += 1

i = 0    
if not path.isdir('../Results/Yaveroglu/%s' % metric):
    mkdir('../Results/Yaveroglu/%s' % metric)
    
for cluster in matrix_clusters:
    with open('../Results/Yaveroglu/%s/egos_classe_%s.csv' % (metric, i), 'w') as to_write:
        for ego in cluster:
            to_write.write(ego+'\n')
            EGOS.append(ego)
        i += 1
    
    
OVERALL_REPR = get_overall_representativity()
with open('../Results/Yaveroglu/%s.csv' % metric, 'w') as to_write:
    csv_w = csv.writer(to_write, delimiter = ';')
    csv_w.writerow(['classe', 'nb']+['']+INDICS)
    i = 0
    for cluster in matrix_clusters:
        classe_repr = get_classe_representativity(cluster)
        csv_w.writerow([i, len(cluster)]+['']+[classe_repr[g] for g in INDICS])
        i += 1


In [23]:
metric = 'gcd11'
metric = 'gcd58'
metric = 'gdda'
metric = 'rgf'

data = pandas.read_csv('../Data/Yaveroglu_indics/%s.txt' % metric, sep = '\t', index_col = 0)
M, C = kMedoids(np.array(data), 6)

OVERALL_REPR = get_overall_representativity()

matrix_clusters = []
for cluster in C.values():
    matrix_clusters.append([data.index[i].split('/')[-1] for i in cluster])
    
with open('../Results/Yaveroglu/%s.csv' % metric, 'w') as to_write:
    csv_w = csv.writer(to_write, delimiter = ';')
    csv_w.writerow(['classe', 'nb']+['']+INDICS)
    i = 0
    for cluster in matrix_clusters:
        classe_repr = get_classe_representativity(cluster)
        csv_w.writerow([i, len(cluster)]+['']+[classe_repr[g] for g in INDICS])
        i += 1
        

KeyboardInterrupt: 

In [8]:
sum_per_c = {}

for i in range(5):
    sum_per_c[i] = {j : 0 for j in range(10,31)}
    with open('../Results/Yaveroglu/gcd73_csa/egos_classe_%s.csv' % i, 'r') as to_read:
        csv_r = csv.reader(to_read, delimiter = ';')
        for line in csv_r:
            for j in range(10,31):
                sum_per_c[i][j] += DATA_MOTIFS[line[0]][j]

In [11]:
prop_per_c = {}
for i in range(5):
    prop_per_c[i] = {j : 0 for j in range(10,31)}
    for j in range(10,31):
        prop_per_c[i][j] = sum_per_c[j] / float(sum(sum_per_c[i][k] for k in range(10,31)))

prop_per_c

KeyError: 10

In [57]:
g

{10: 0.06412812795768298,
 11: 0.04820899807006561,
 12: 0.11394199176106903,
 13: 0.07596825045232677,
 14: 0.07571021019960403,
 15: 0.0892004398794107,
 16: 0.002001147227282755,
 17: 0.02114854107604261,
 18: 0.06109252703977707,
 19: 0.09164958738790016,
 20: 0.021479556231607785,
 21: 0.015274831559327557,
 22: 0.0015515306919780326,
 23: 0.07022461736107595,
 24: 0.051325786414754875,
 25: 0.015906073305004376,
 26: 0.00839980380081857,
 27: 0.010485276442677518,
 28: 0.0735600876178066,
 29: 0.05315993288826496,
 30: 0.03558268263552205}

In [21]:
def prop_graphlets(list_egos):
    with open('../Data/motifs_facebook_k5.csv', 'r') as to_read:
        csv_r = csv.reader(to_read, delimiter = ';')
        list_graphlets = [int(x.split('_')[2]) for x in csv_r.next()[1:]]
        nb_per_graphlet = {x : 0 for x in list_graphlets}
        for line in csv_r:
            if not line[0] in list_egos:
                continue
            for i in list_graphlets:
                nb_per_graphlet[i] += int(line[i-9])
    sum_nb_graphlets = float(sum(nb_per_graphlet.values()))
    return {x : nb_per_graphlet[x]/sum_nb_graphlets for x in list_graphlets}

for cluster in range(5):
    list_egos = []
    with open('../Results/Yaveroglu/gcd73/egos_classe_%s.csv' % cluster) as to_read:
        csv_r = csv.reader(to_read, delimiter = ';')
        for line in csv_r:
            list_egos.append(line[0])
    print prop_graphlets(list_egos).values()
    print
        

[0.0586707207582327, 0.06093408592370396, 0.11567430227621736, 0.07935425546666801, 0.07473416668394475, 0.1033948000706457, 0.0017062772876947606, 0.01918472348672884, 0.060828860703243184, 0.09518728815191496, 0.02467372661281968, 0.01409983836842904, 0.0014104021604991524, 0.07250676736735046, 0.04926874798939924, 0.015935499394641152, 0.007537868943640883, 0.008927557604429034, 0.06695584544534312, 0.04270866868854809, 0.026305596615905934]

[0.0674648396503298, 0.06456785253141764, 0.10982063310580552, 0.08210706574334992, 0.08200789489594734, 0.09317199072223142, 0.002804905893162619, 0.02593959155988318, 0.06875213960779283, 0.09297426651060249, 0.022498167237512075, 0.020140784182136252, 0.0018279407828755113, 0.0630127707031951, 0.05581591280521593, 0.01458282573611637, 0.009875324581746817, 0.010426798044115178, 0.06148296756895265, 0.03534379934791185, 0.015381528789699509]

[0.08618471061830406, 0.04647994820838314, 0.14368435983824565, 0.08671152662115245, 0.08812949551962