# Analyse Cluster

The aim of this notebook is to analyse the clustering results in more detail and to compare the clusters with each other.

### imports and preprocessing

In [2]:
import json
import pickle
from collections import Counter

import numpy as np
import pandas as pd


In [3]:
# Load the pickled specs, a mapping keyed by API name.
# NOTE(review): pickle.load can run arbitrary code while unpickling, so this
# must only ever be pointed at a file produced by this project.
with open('obj/specjson210518.pkl', 'rb') as handle:
    specs = pickle.load(handle)

# List the API names and report how many specs were loaded.
keys = list(specs.keys())
print(keys)
len(keys)

['1forge.com', '6-dot-authentiqio.appspot.com', 'adafruit.com', 'adobe.com:aem', 'adyen.com:CheckoutService', 'adyen.com:PaymentService', 'adyen.com:RecurringService', 'afterbanks.com', 'agco-ats.com', 'airport-web.appspot.com', 'amadeus.com', 'amazonaws.com:AWSMigrationHub', 'amazonaws.com:acm', 'amazonaws.com:acm-pca', 'amazonaws.com:alexaforbusiness', 'amazonaws.com:apigateway', 'amazonaws.com:application-autoscaling', 'amazonaws.com:appstream', 'amazonaws.com:appsync', 'amazonaws.com:athena', 'amazonaws.com:autoscaling', 'amazonaws.com:autoscaling-plans', 'amazonaws.com:batch', 'amazonaws.com:budgets', 'amazonaws.com:ce', 'amazonaws.com:cloud9', 'amazonaws.com:clouddirectory', 'amazonaws.com:cloudformation', 'amazonaws.com:cloudfront', 'amazonaws.com:cloudhsm', 'amazonaws.com:cloudhsmv2', 'amazonaws.com:cloudsearch', 'amazonaws.com:cloudsearchdomain', 'amazonaws.com:cloudtrail', 'amazonaws.com:codebuild', 'amazonaws.com:codecommit', 'amazonaws.com:codedeploy', 'amazonaws.com:codepi

1034

## length of the specs

In [21]:
# Serialise every spec to its JSON text so that its size can be measured.
specs_string = {name: json.dumps(content) for name, content in specs.items()}
print(len(specs_string))

1034


In [24]:
# Cache the JSON-serialised specs on disk.
with open('obj/specs_string.pkl', mode='wb') as out_file:
    pickle.dump(specs_string, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
# Length of every spec in characters of its JSON text, in the iteration
# order of specs_string.
spec_len = [len(text) for text in specs_string.values()]
print(len(spec_len))

1034


In [32]:
# Length of the shortest spec
min(spec_len)

1362

In [33]:
# Length of the longest spec
max(spec_len)

3259941

In [37]:
# Arithmetic mean of the spec lengths
np.mean(spec_len)

77842.50580270794

In [38]:
# Median of the spec lengths
np.median(spec_len)

30538.5

In [39]:
# Mode of the spec lengths, i.e. the length that occurs most often.
# Counter tallies all lengths in a single pass, whereas calling
# spec_len.count for each distinct value rescans the whole list every time.
# If several lengths share the highest count, the one seen first wins.
Counter(spec_len).most_common(1)[0][0]

2883

In [34]:
# Cache the list of spec lengths on disk.
with open('obj/spec_len.pkl', mode='wb') as out_file:
    pickle.dump(spec_len, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## prepare dataframes for extensiveness and structure

In [40]:
# Path and operation counts per API plus the assigned cluster, indexed by API name.
pathdf = pd.read_csv('PAMpath0518.csv', index_col='X')
# Drop the first remaining column (presumably the row numbers of the
# clustering export -- TODO confirm).
pathdf = pathdf.iloc[:, 1:]

# Attach the spec length by API name rather than by position: a plain list is
# assigned row by row and would silently mislabel every spec if the CSV rows
# were ordered differently from the pickled specs.
spec_len_by_api = pd.Series(spec_len, index=list(specs_string))
missing = pathdf.index.difference(spec_len_by_api.index)
assert missing.empty, "no spec length for: {}".format(list(missing))
pathdf['spec_len'] = spec_len_by_api
pathdf.head()

Unnamed: 0_level_0,paths,operations,pam_fit$clustering,spec_len
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1forge.com,2,2,1,1362
6-dot-authentiqio.appspot.com,5,14,2,13452
adafruit.com,34,69,3,65937
adobe.com:aem,27,31,4,22375
adyen.com:CheckoutService,2,2,1,32474


In [48]:
# Structure indicators per API plus the assigned cluster, indexed by API name.
structdf = pd.read_csv('PAMcomplete0518.csv', index_col='X')
# Drop the first remaining column (presumably the row numbers of the
# clustering export -- TODO confirm).
structdf = structdf.iloc[:, 1:]

# Attach the spec length by API name rather than by position: a plain list is
# assigned row by row and would silently mislabel every spec if the CSV rows
# were ordered differently from the pickled specs.
spec_len_by_api = pd.Series(spec_len, index=list(specs_string))
missing = structdf.index.difference(spec_len_by_api.index)
assert missing.empty, "no spec length for: {}".format(list(missing))
structdf['spec_len'] = spec_len_by_api
structdf.head()

Unnamed: 0_level_0,swagger,info,host,basePath,schemes,consumes,produces,paths,definitions,parameters,...,tags,externalDocs,title,description,termsOfService,contact,license,version,pam_fit$clustering,spec_len
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1forge.com,1,1,1,1,1,0,1,1,0,0,...,0,0,1,1,0,1,0,1,1,1362
6-dot-authentiqio.appspot.com,1,1,1,1,1,0,0,1,1,1,...,0,0,1,1,1,1,1,1,2,13452
adafruit.com,1,1,1,1,1,0,1,1,1,1,...,0,0,1,1,0,0,0,1,3,65937
adobe.com:aem,1,1,1,1,1,0,0,1,1,0,...,0,0,1,1,0,1,0,1,3,22375
adyen.com:CheckoutService,0,1,0,0,0,0,0,1,0,0,...,0,0,1,1,0,1,0,1,1,32474


## Paths Clustering Results

In [6]:
# Distinct cluster labels assigned by the paths clustering
pathdf['pam_fit$clustering'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18], dtype=int64)

Plot of Cluster:

![Plot of 18 Cluster](PlotPath0518.png)

Similarity: the first two rows are the most similar pair of specs, the last two rows are the most dissimilar pair.

In [49]:
# Load the similarity examples for the paths clustering and drop the first
# column that follows the index.
SimPathdf = pd.read_csv('Pathsimilar.csv', index_col='X').iloc[:, 1:]
SimPathdf.head()

Unnamed: 0_level_0,paths,operations
X,Unnamed: 1_level_1,Unnamed: 2_level_1
googleapis.com:games,52,105
amazonaws.com:autoscaling,52,104
kubernetes.io,488,1381
amazonaws.com:entitlement.marketplace,1,2


Distribution of clusters across the dataset:

In [50]:
# Number of specs in each paths cluster
pathdf['pam_fit$clustering'].value_counts()

8     102
1      91
4      87
9      86
13     82
12     72
2      70
3      64
15     62
5      62
14     57
6      48
17     43
11     39
10     36
7      18
16     14
18      1
Name: pam_fit$clustering, dtype: int64

In [51]:
#TODO: Add Charts of correlation


In [52]:
# Per-cluster minimum of every column
pathdf.groupby('pam_fit$clustering').min()

Unnamed: 0_level_0,paths,operations,spec_len
pam_fit$clustering,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2,1362
2,5,13,8700
3,22,49,29567
4,9,24,10787
5,3,7,4410
6,1,3,3210
7,87,116,97302
8,1,1,1446
9,5,18,10124
10,32,70,34509


In [11]:
# Per-cluster minimum, median, mean and maximum of every column.
# The aggregations are named by string: recent pandas versions emit a
# FutureWarning when np.median, np.mean or the builtin max are passed as
# callables, and the string names produce the same column labels.
pathdf.groupby('pam_fit$clustering').aggregate(['min', 'median', 'mean', 'max'])

Unnamed: 0_level_0,paths,paths,paths,paths,operations,operations,operations,operations
Unnamed: 0_level_1,min,median,mean,max,min,median,mean,max
pam_fit$clustering,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,1,2.0,1.714286,2,2,2.0,2.0,2
2,5,11.0,10.857143,17,13,15.0,15.142857,17
3,22,34.0,35.375,68,49,62.5,62.921875,85
4,9,16.0,18.597701,32,24,29.0,29.16092,35
5,3,5.5,5.725806,9,7,8.0,7.983871,9
6,1,3.0,2.666667,3,3,3.0,3.0,3
7,87,116.0,116.833333,157,116,190.5,202.333333,321
8,1,1.0,1.0,1,1,1.0,1.0,1
9,5,12.0,12.72093,23,18,20.0,20.267442,25
10,32,44.5,47.333333,67,70,95.5,93.888889,132


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) per cluster
pathdf.groupby('pam_fit$clustering').describe()

Unnamed: 0_level_0,operations,operations,operations,operations,operations,operations,operations,operations,paths,paths,paths,paths,paths,paths,paths,paths
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
pam_fit$clustering,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
1,91.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0,91.0,1.714286,0.454257,1.0,1.0,2.0,2.0,2.0
2,70.0,15.142857,1.332816,13.0,14.0,15.0,16.0,17.0,70.0,10.857143,3.696005,5.0,8.0,11.0,14.0,17.0
3,64.0,62.921875,7.820008,49.0,57.0,62.5,68.0,85.0,64.0,35.375,10.00714,22.0,28.75,34.0,39.25,68.0
4,87.0,29.16092,2.880865,24.0,27.0,29.0,31.5,35.0,87.0,18.597701,6.204761,9.0,14.0,16.0,24.0,32.0
5,62.0,7.983871,0.819675,7.0,7.0,8.0,9.0,9.0,62.0,5.725806,1.757134,3.0,4.0,5.5,7.0,9.0
6,48.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0,48.0,2.666667,0.595491,1.0,2.0,3.0,3.0,3.0
7,18.0,202.333333,57.619441,116.0,172.75,190.5,237.75,321.0,18.0,116.833333,20.855949,87.0,101.0,116.0,123.75,157.0
8,102.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,102.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
9,86.0,20.267442,1.830618,18.0,19.0,20.0,22.0,25.0,86.0,12.72093,4.608563,5.0,9.0,12.0,16.0,23.0
10,36.0,93.888889,13.698337,70.0,83.5,95.5,100.0,132.0,36.0,47.333333,9.183837,32.0,40.0,44.5,52.25,67.0


In [81]:
# Mean number of operations and paths per cluster.
# aggfunc is given as the string 'mean' because recent pandas versions emit a
# FutureWarning when the NumPy callable np.mean is passed instead.
impute_grps = pathdf.pivot_table(values=["operations", "paths"], index=["pam_fit$clustering"], aggfunc="mean")
print(impute_grps)

                     operations       paths
pam_fit$clustering                         
1                      2.000000    1.714286
2                     15.142857   10.857143
3                     62.921875   35.375000
4                     29.160920   18.597701
5                      7.983871    5.725806
6                      3.000000    2.666667
7                    202.333333  116.833333
8                      1.000000    1.000000
9                     20.267442   12.720930
10                    93.888889   47.333333
11                   128.871795   76.128205
12                    40.347222   24.750000
13                    11.000000    7.158537
14                     6.070175    3.912281
15                     4.000000    2.870968
16                   346.785714  226.285714
17                     5.000000    3.348837
18                  1381.000000  488.000000


In [82]:
# Clusters ordered by their mean number of operations, smallest first
impute_grps.sort_values('operations')

Unnamed: 0_level_0,operations,paths
pam_fit$clustering,Unnamed: 1_level_1,Unnamed: 2_level_1
8,1.0,1.0
1,2.0,1.714286
6,3.0,2.666667
15,4.0,2.870968
17,5.0,3.348837
14,6.070175,3.912281
5,7.983871,5.725806
13,11.0,7.158537
2,15.142857,10.857143
9,20.267442,12.72093


In [83]:
# Mean spec length per cluster, with the fractional part cut off by astype(int).
mean_len = (
    pathdf
    .groupby('pam_fit$clustering')[["spec_len"]]
    .mean()
    .astype(int)
)
mean_len

Unnamed: 0_level_0,spec_len
pam_fit$clustering,Unnamed: 1_level_1
1,10162
2,42860
3,123248
4,70847
5,29687
6,29879
7,417302
8,13946
9,56065
10,190372


In [87]:
# Median spec length per cluster.
med_len = (
    pathdf
    .groupby('pam_fit$clustering')[["spec_len"]]
    .median()
)
med_len

Unnamed: 0_level_0,spec_len
pam_fit$clustering,Unnamed: 1_level_1
1,7054.0
2,31438.5
3,111714.0
4,62838.0
5,22973.5
6,10685.0
7,454998.0
8,4049.5
9,42639.0
10,179097.0


In [88]:
# One summary row per cluster, ordered by the mean number of paths, extended
# with the cluster size and the mean / median spec length.
impute_grps = impute_grps.sort_values('paths').assign(
    count=pathdf['pam_fit$clustering'].value_counts(),
    length=mean_len,
    length_med=med_len,
)
impute_grps

Unnamed: 0_level_0,operations,paths,count,length,length_med
pam_fit$clustering,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,1.0,1.0,102,13946,4049.5
1,2.0,1.714286,91,10162,7054.0
6,3.0,2.666667,48,29879,10685.0
15,4.0,2.870968,62,16386,12979.0
17,5.0,3.348837,43,25400,17029.0
14,6.070175,3.912281,57,21770,19657.0
5,7.983871,5.725806,62,29687,22973.5
13,11.0,7.158537,82,44507,29489.5
2,15.142857,10.857143,70,42860,31438.5
9,20.267442,12.72093,86,56065,42639.0


Now let's look into certain clusters:

In [86]:
# Show the specs that the paths clustering assigned to cluster 8.
cluster_n = pathdf['pam_fit$clustering'].eq(8)
pathdf.loc[cluster_n]

Unnamed: 0_level_0,paths,operations,pam_fit$clustering,spec_len
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
airport-web.appspot.com,1,1,8,1446
apimatic.io,1,1,8,2483
apitore.com:2chMatomeFeedsApis,1,1,8,2906
apitore.com:certificateFeedsApis,1,1,8,2946
apitore.com:designFeedsApis,1,1,8,2901
apitore.com:documentFrequencyApis,1,1,8,2258
apitore.com:gameFeedsApis,1,1,8,2883
apitore.com:japaneseWordnetApis,1,1,8,2765
apitore.com:kmeansClusteringByWord2vec,1,1,8,3164
apitore.com:newsFeedsApis,1,1,8,2883


## Structure Clustering Results

In [6]:
# Distinct cluster labels assigned by the structure clustering
structdf['pam_fit$clustering'].unique()

array([1, 2, 3, 4, 5, 6, 7, 8], dtype=int64)

Plot of Clusters:

K = 8
![Plot of 8 Cluster](PlotStruct0518.png)

K = 4
![Plot of 4 Cluster](plotStruct40518.png)

K = 20
![Plot of 20 Cluster](plotStruct200518.png)

Similarity: the first two rows are the most similar pair of specs, the last two rows are the most dissimilar pair.

In [93]:
# Load the similarity examples for the structure clustering and drop the
# first column that follows the index.
SimStructdf = pd.read_csv('Structsimilar.csv', index_col='X').iloc[:, 1:]
SimStructdf.head()

Unnamed: 0_level_0,swagger,info,host,basePath,schemes,consumes,produces,paths,definitions,parameters,...,securityDefinitions,security,tags,externalDocs,title,description,termsOfService,contact,license,version
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
landregistry.gov.uk:deed,1,1,1,1,1,0,1,1,1,0,...,0,0,0,0,1,1,0,1,0,1
1forge.com,1,1,1,1,1,0,1,1,0,0,...,0,0,0,0,1,1,0,1,0,1
brex.io,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,1
amazonaws.com:kinesisanalytics,1,1,1,1,1,1,1,1,1,1,...,1,1,0,1,1,0,1,1,1,1


Distribution of clusters across the dataset:

In [94]:
# Number of specs in each structure cluster
structdf['pam_fit$clustering'].value_counts()

7    222
5    175
3    149
8    125
4    109
1    101
6     98
2     55
Name: pam_fit$clustering, dtype: int64

In [95]:
#TODO: Add Charts of correlation


In [108]:
# Per-cluster median of every column
display(structdf.groupby('pam_fit$clustering').median())

Unnamed: 0_level_0,swagger,info,host,basePath,schemes,consumes,produces,paths,definitions,parameters,...,security,tags,externalDocs,title,description,termsOfService,contact,license,version,spec_len
pam_fit$clustering,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,24779.0
2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29518.0
3,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,20280.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,8700.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,85534.0
6,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,26579.0
7,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,24208.5
8,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,46612.0


In [97]:
# Per-cluster minimum of every column
structdf.groupby('pam_fit$clustering').min()

Unnamed: 0_level_0,swagger,info,host,basePath,schemes,consumes,produces,paths,definitions,parameters,...,security,tags,externalDocs,title,description,termsOfService,contact,license,version,spec_len
pam_fit$clustering,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,1362
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,1,0,0,0,1,3636
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,1540
4,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,1435
5,1,1,1,0,1,1,1,1,0,0,...,0,0,0,1,0,0,0,0,1,2483
6,1,1,1,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,1529
7,1,1,1,0,1,0,0,1,1,0,...,0,0,0,1,1,0,0,0,1,3147
8,1,1,1,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,1526


In [106]:
# Median spec length per structure cluster.
stat_struct = (
    structdf
    .groupby('pam_fit$clustering')[["spec_len"]]
    .median()
)
stat_struct

Unnamed: 0_level_0,spec_len
pam_fit$clustering,Unnamed: 1_level_1
1,24779.0
2,29518.0
3,20280.0
4,8700.0
5,85534.0
6,26579.0
7,24208.5
8,46612.0


In [115]:
# Medoid table read from medoidscomplete0518.csv, indexed by API name; the
# first column that follows the index is dropped.
medoidpathdf = pd.read_csv('medoidscomplete0518.csv', index_col='X').iloc[:, 1:]
# Lift the row limit and allow up to 30 columns so the whole table is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', 30):
    print(medoidpathdf)


                                 swagger  info  host  basePath  schemes  \
X                                                                         
weatherbit.io                          1     1     1         1        1   
googleapis.com:adexchangebuyer2        1     1     1         1        1   
voodoomfg.com                          1     1     1         1        1   
neowsapp.com                           1     1     1         1        1   
amazonaws.com:xray                     1     1     1         1        1   
azure.com:automation-account           1     1     1         0        1   
windows.net:graphrbac                  1     1     1         0        1   
googleapis.com:youtubereporting        1     1     1         1        1   

                                 consumes  produces  paths  definitions  \
X                                                                         
weatherbit.io                           0         1      1            1   
googleapis.com:adexchang

In [119]:
# Names of the medoid specs, taken from the index of the medoid table.
medoids = medoidpathdf.index.tolist()
print(type(medoids))
medoids

<class 'list'>


['weatherbit.io',
 'googleapis.com:adexchangebuyer2',
 'voodoomfg.com',
 'neowsapp.com',
 'amazonaws.com:xray',
 'azure.com:automation-account',
 'windows.net:graphrbac',
 'googleapis.com:youtubereporting']

In [124]:
# Cluster label and spec length of each medoid, selected in a single .loc call.
medoids_df = structdf.loc[medoids, ["pam_fit$clustering", "spec_len"]]
medoids_df

Unnamed: 0_level_0,pam_fit$clustering,spec_len
X,Unnamed: 1_level_1,Unnamed: 2_level_1
weatherbit.io,1,121837
googleapis.com:adexchangebuyer2,2,84758
voodoomfg.com,3,23647
neowsapp.com,4,9342
amazonaws.com:xray,5,38236
azure.com:automation-account,6,67102
windows.net:graphrbac,7,61023
googleapis.com:youtubereporting,8,25702


In [128]:
# Mean and median spec length of every structure cluster, collected as
# [cluster, mean, median] rows.
# The cluster labels come from the data instead of a hard-coded range(1, 9),
# so the cell keeps working when the clustering is re-run with a different
# number of clusters. groupby yields the clusters in ascending label order.
lengths_struct = []
for label, clust_df in structdf.groupby('pam_fit$clustering'):
    # Plain Python numbers keep the printed text and the list repr stable;
    # int() drops the fractional part of the mean.
    i = int(label)
    mean = int(clust_df["spec_len"].mean())
    median = float(clust_df["spec_len"].median())
    print("Cluster {} has a spec length mean of {} and a median of {}".format(i, mean, median))
    lengths_struct.append([i, mean, median])

Cluster 1 has a spec length mean of 79640 and a median of 24779.0
Cluster 2 has a spec length mean of 69359 and a median of 29518.0
Cluster 3 has a spec length mean of 69641 and a median of 20280.0
Cluster 4 has a spec length mean of 65690 and a median of 8700.0
Cluster 5 has a spec length mean of 132920 and a median of 85534.0
Cluster 6 has a spec length mean of 62940 and a median of 26579.0
Cluster 7 has a spec length mean of 48941 and a median of 24208.5
Cluster 8 has a spec length mean of 86398 and a median of 46612.0


In [129]:
# Rows of [cluster, mean spec length, median spec length]
lengths_struct

[[1, 79640, 24779.0],
 [2, 69359, 29518.0],
 [3, 69641, 20280.0],
 [4, 65690, 8700.0],
 [5, 132920, 85534.0],
 [6, 62940, 26579.0],
 [7, 48941, 24208.5],
 [8, 86398, 46612.0]]

In [138]:
# Turn the [cluster, mean, median] rows into a table indexed by cluster.
len_df = pd.DataFrame.from_records(
    lengths_struct, columns=['cluster', 'mean', 'median']
).set_index('cluster')
len_df

Unnamed: 0_level_0,mean,median
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
1,79640,24779.0
2,69359,29518.0
3,69641,20280.0
4,65690,8700.0
5,132920,85534.0
6,62940,26579.0
7,48941,24208.5
8,86398,46612.0


Now let's look into certain clusters:

In [143]:
# Show the specs that the structure clustering assigned to cluster 5.
cluster_n = structdf['pam_fit$clustering'].eq(5)
structdf.loc[cluster_n]

Unnamed: 0_level_0,swagger,info,host,basePath,schemes,consumes,produces,paths,definitions,parameters,...,tags,externalDocs,title,description,termsOfService,contact,license,version,pam_fit$clustering,spec_len
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
amazonaws.com:AWSMigrationHub,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,56011
amazonaws.com:acm,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,61774
amazonaws.com:acm-pca,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,68471
amazonaws.com:alexaforbusiness,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,103083
amazonaws.com:apigateway,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,326282
amazonaws.com:application-autoscaling,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,108975
amazonaws.com:appstream,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,93997
amazonaws.com:appsync,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,60180
amazonaws.com:athena,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,36270
amazonaws.com:autoscaling,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,5,195066
