# Creation of a data set containing labels of classes for the data samples

In this notebook we create two distinct data sets that contain the labels for each sample of our main data set.
The labels are stored in their own data sets so that they are easier to use when training a new model.

In [1]:
import json
import os
import pickle

import numpy as np
import pandas as pd


## labels of path clusters

In [2]:
# Read the path-clustering export with the 'X' column as index, then discard
# the first remaining column (presumably a leftover row counter -- verify
# against the CSV) in the same expression.
pathdf = pd.read_csv('PAMpath0518.csv', index_col='X').iloc[:, 1:]
pathdf.head()

Unnamed: 0_level_0,paths,operations,pam_fit$clustering
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1forge.com,2,2,1
6-dot-authentiqio.appspot.com,5,14,2
adafruit.com,34,69,3
adobe.com:aem,27,31,4
adyen.com:CheckoutService,2,2,1


In [3]:
# Build the path-cluster label table from pathdf: remove the feature columns,
# rename the cluster assignment column to 'label' and name the index 'API'.
# `columns=` replaces the positional axis argument (`drop(..., 1)`), which was
# deprecated in pandas 1.3 and removed in pandas 2.0. Every step returns a new
# frame, so pathdf is left untouched and the cell can be re-run safely.
path_labels = (
    pathdf
    .drop(columns=['paths', 'operations'])
    .rename(columns={'pam_fit$clustering': 'label'})
    .rename_axis('API')
)
path_labels


Unnamed: 0_level_0,label
API,Unnamed: 1_level_1
1forge.com,1
6-dot-authentiqio.appspot.com,2
adafruit.com,3
adobe.com:aem,4
adyen.com:CheckoutService,1
adyen.com:PaymentService,5
adyen.com:RecurringService,1
afterbanks.com,6
agco-ats.com,7
airport-web.appspot.com,8


In [4]:
# Write the path labels (index included) as a CSV file in the working directory.
path_labels.to_csv(path_or_buf='path_labels0518.csv', encoding='utf8')

## labels of structure clusters

In [14]:
# Read the structure-clustering export with the 'X' column as index, then
# discard the first remaining column (presumably a leftover row counter --
# verify against the CSV) in the same expression.
pamcompdf = pd.read_csv('PAMcomplete0518.csv', index_col='X').iloc[:, 1:]
pamcompdf.head()

Unnamed: 0_level_0,swagger,info,host,basePath,schemes,consumes,produces,paths,definitions,parameters,...,security,tags,externalDocs,title,description,termsOfService,contact,license,version,pam_fit$clustering
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1forge.com,1,1,1,1,1,0,1,1,0,0,...,0,0,0,1,1,0,1,0,1,1
6-dot-authentiqio.appspot.com,1,1,1,1,1,0,0,1,1,1,...,0,0,0,1,1,1,1,1,1,2
adafruit.com,1,1,1,1,1,0,1,1,1,1,...,1,0,0,1,1,0,0,0,1,3
adobe.com:aem,1,1,1,1,1,0,0,1,1,0,...,1,0,0,1,1,0,1,0,1,3
afterbanks.com,1,1,1,1,1,0,1,1,1,0,...,0,0,0,1,1,0,0,0,1,1


In [15]:
# Column names of the structure feature table that are removed again when the
# label table is built; everything except the cluster assignment column.
spec_key_complete = [
    'swagger',
    'info',
    'host',
    'basePath',
    'schemes',
    'consumes',
    'produces',
    'paths',
    'definitions',
    'parameters',
    'responses',
    'securityDefinitions',
    'security',
    'tags',
    'title',
    'description',
    'termsOfService',
    'contact',
    'license',
    'version',
    'externalDocs',
]

In [16]:
# Build the structure-cluster label table from pamcompdf: remove every feature
# column listed in spec_key_complete, rename the cluster assignment column to
# 'label' and name the index 'API'.
# `columns=` replaces the positional axis argument (`drop(..., 1)`), which was
# deprecated in pandas 1.3 and removed in pandas 2.0. Every step returns a new
# frame, so pamcompdf is left untouched and the cell can be re-run safely.
structure_labels = (
    pamcompdf
    .drop(columns=spec_key_complete)
    .rename(columns={'pam_fit$clustering': 'label'})
    .rename_axis('API')
)
structure_labels.head()


Unnamed: 0_level_0,label
API,Unnamed: 1_level_1
1forge.com,1
6-dot-authentiqio.appspot.com,2
adafruit.com,3
adobe.com:aem,3
afterbanks.com,1


In [17]:
# Write the structure labels (index included) as a CSV file in the working directory.
structure_labels.to_csv(path_or_buf='structure_labels0518.csv', encoding='utf8')

In [5]:
# Pickle the path labels for later reuse from Python.
# The target folder is created first so the cell also works on a fresh
# checkout where 'obj/' does not exist yet (open() would otherwise raise
# FileNotFoundError).
os.makedirs('obj', exist_ok=True)
with open('obj/' + 'path_labels0518' + '.pkl', 'wb') as f:
    pickle.dump(path_labels, f, pickle.HIGHEST_PROTOCOL)

In [18]:
# Pickle the structure labels for later reuse from Python.
# The target folder is created first so the cell also works on a fresh
# checkout where 'obj/' does not exist yet (open() would otherwise raise
# FileNotFoundError).
os.makedirs('obj', exist_ok=True)
with open('obj/' + 'structure_labels0518' + '.pkl', 'wb') as f:
    pickle.dump(structure_labels, f, pickle.HIGHEST_PROTOCOL)

In [11]:
import sys
!{sys.executable} -m pip install azure



thinc 6.10.2 requires pathlib<2.0.0,>=1.0.0, which is not installed.
spacy 2.0.11 requires pathlib, which is not installed.
mkl-random 1.0.1 requires cython, which is not installed.
mkl-fft 1.0.0 requires cython, which is not installed.
spacy 2.0.11 has requirement regex==2017.4.5, but you'll have regex 2017.11.9 which is incompatible.
You are using pip version 10.0.0, however version 10.0.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


# Upload data set to blob storage
- https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python

- https://github.com/Azure-Samples/storage-blobs-python-quickstart/blob/master/example.py

In [19]:
import os, uuid, sys
from azure.storage.blob import BlockBlobService, PublicAccess

In [20]:
# Create the blob service client with credentials taken from the environment,
# so that no secret is stored in the notebook:
#   AZURE_STORAGE_ACCOUNT  storage account name (defaults to 'openapilp')
#   AZURE_STORAGE_KEY      storage account key (required; KeyError if unset)
# SECURITY: the account key that used to be hardcoded in this cell has been
# exposed with the notebook and must be rotated.
block_blob_service = BlockBlobService(
    account_name=os.environ.get('AZURE_STORAGE_ACCOUNT', 'openapilp'),
    account_key=os.environ['AZURE_STORAGE_KEY'],
)

In [21]:
# Name of the blob container that receives the label files.
container_name ='labelsdataset'
# Ask the service to create the container. In the recorded run it already
# existed: the service answered with HTTP 409 and the call returned False.
block_blob_service.create_container(container_name)

Client-Request-ID=18c30db4-5dd5-11e8-9eff-c69ded18204b Retry policy did not allow for a retry: Server-Timestamp=Tue, 22 May 2018 15:30:45 GMT, Server-Request-ID=93dd1f47-401e-00b3-78e1-f13469000000, HTTP status code=409, Exception=The specified container already exists.ErrorCode: ContainerAlreadyExists<?xml version="1.0" encoding="utf-8"?><Error><Code>ContainerAlreadyExists</Code><Message>The specified container already exists.RequestId:93dd1f47-401e-00b3-78e1-f13469000000Time:2018-05-22T15:30:46.5267917Z</Message></Error>.


False

In [22]:
# Set the container's public access level to PublicAccess.Container.
# NOTE(review): this presumably makes the container and its blobs readable
# without authentication -- confirm that public exposure is intended.
block_blob_service.set_container_acl(container_name, public_access=PublicAccess.Container)

<azure.storage.blob.models.ResourceProperties at 0x16b9422c908>

In [23]:
def uploadFile(container_name, local_file_name, full_path_to_file):
    """Upload one local file through the module-level block_blob_service.

    Passes container_name, local_file_name and full_path_to_file, in that
    order, to block_blob_service.create_blob_from_path and prints the string
    form of whatever that call returns. Nothing is returned to the caller.
    """
    upload_result = block_blob_service.create_blob_from_path(
        container_name, local_file_name, full_path_to_file
    )
    print(str(upload_result))


In [11]:
# Blob name under which the path label CSV is uploaded.
path_filename = 'path_labels0518.csv'

In [24]:
# Blob name under which the structure label CSV is uploaded.
struct_filename = 'structure_labels0518.csv'

In [12]:
# Upload the path label CSV that this notebook wrote to the working directory.
# The local path is resolved from the file name instead of a hardcoded,
# machine-specific absolute path, so the cell runs on any checkout.
block_blob_service.create_blob_from_path(container_name, path_filename, os.path.abspath(path_filename))


<azure.storage.blob.models.ResourceProperties at 0x16b941f8898>

In [25]:
# Upload the structure label CSV that this notebook wrote to the working
# directory. The local path is resolved from the file name instead of a
# hardcoded, machine-specific absolute path, so the cell runs on any checkout.
block_blob_service.create_blob_from_path(container_name, struct_filename, os.path.abspath(struct_filename))


<azure.storage.blob.models.ResourceProperties at 0x16b941f8390>

List the created blobs

In [26]:
# Print a header, then one line per blob currently stored in the container.
print("\nList blobs in the container")
for blob in block_blob_service.list_blobs(container_name):
    print("\t Blob name: " + blob.name)


List blobs in the container
	 Blob name: path_labels.csv
	 Blob name: path_labels0518.csv
	 Blob name: structure_labels.csv
	 Blob name: structure_labels0518.csv


You can download either of the two uploaded blobs with the following snippet:
```
block_blob_service.get_blob_to_path(container_name, local_file_name, full_path_to_file2)
```