#### 1. Installing the required python packages

In [0]:
!python -m pip install --upgrade pip



In [0]:
!pip install -r requirements38.txt

Collecting absl-py==0.7.1
  Downloading absl-py-0.7.1.tar.gz (99 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/99.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/99.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m61.4/99.9 kB[0m [31m827.5 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.9/99.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting astor==0.8.0
  Downloading astor-0.8.0-py2.py3-none-any.whl (27 kB)
Collecting boto==2.49.0
  Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [0]:
!pip install azure-storage-file-datalake
!pip install adlfs
!pip install fsspec

Collecting azure-storage-file-datalake
  Downloading azure_storage_file_datalake-12.9.1-py3-none-any.whl (238 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/238.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/238.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/238.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/238.8 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m235.5/238.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m235.5/238.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.8/238.8 kB[0m [31m1.3 M

#### 2. Importing the required python packages

In [0]:
from datetime import datetime
import os
import sys
import time
import pickle

import numpy as np
import pandas as pd
import json
import traceback
import ast

from sherlock.functional import extract_features_to_csv
from sherlock.helpers import download_data
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (extract_features,convert_string_lists_to_lists,prepare_feature_extraction,load_parquet_values)
from sherlock.features.word_embeddings import initialise_word_embeddings



#### 3. Loading the required model objects for the feature creation

In [0]:
# Directory passed to initialise_pretrained_model as the location of the paragraph-vector model
# (the cell output shows it resolving to /dbfs//par_vec_trained_model_400.pkl).
par_vec_path = '/dbfs/'

# prepare_feature_extraction()
# Initialise the resources used by the sherlock feature extractors: the word embeddings,
# the 400-dimension Doc2Vec model and NLTK (the cell output shows punkt and stopwords being downloaded).
initialise_word_embeddings()
initialise_pretrained_model(path=par_vec_path,dim=400)
initialise_nltk()

Initialising word embeddings
Initialise Word Embeddings process took 0:00:06.227651 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:05.764275 seconds. (filename = /dbfs//par_vec_trained_model_400.pkl)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Initialised NLTK, process took 0:00:01.414576 seconds.


#### 4. Connecting to the ADLS for reading all the files for feature creation

In [0]:
import os, uuid, sys
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
from azure.storage.blob import BlobServiceClient

In [0]:
def initialize_storage_account(storage_account_name, storage_account_key):
    """Create the Data Lake service client and publish it as the global ``service_client``.

    Args:
        storage_account_name: Name of the storage account; it is inserted into the
            ``https://<name>.dfs.core.windows.net`` account URL.
        storage_account_key: Credential handed to ``DataLakeServiceClient``.

    Returns:
        None. If building the client raises, the exception is printed (not re-raised)
        and ``service_client`` is not assigned by this call.
    """
    global service_client

    url_template = "{}://{}.dfs.core.windows.net"
    try:
        endpoint = url_template.format("https", storage_account_name)
        service_client = DataLakeServiceClient(
            account_url=endpoint,
            credential=storage_account_key,
        )
    except Exception as err:
        print(err)

In [0]:
def list_directory_contents(container_name,my_dir):
    """List the names of the paths found under ``my_dir`` in a file system (container).

    Uses the global ``service_client``.

    Args:
        container_name: File system (container) to look in.
        my_dir: Path inside the container whose contents are listed.

    Returns:
        list: The ``name`` of every path yielded by ``get_paths``. If an exception occurs it
        is printed and whatever was collected up to that point (possibly nothing) is returned.
    """
    collected_names = []
    try:
        container_fs = service_client.get_file_system_client(file_system=container_name)
        for entry in container_fs.get_paths(path=my_dir):
            collected_names.append(entry.name)
    except Exception as err:
        print(err)

    return collected_names

#### 5. Credentials to connect to the Azure Data Storage

In [0]:
storage_account = "legoaistorage"
# SECURITY: the account key must never be stored in the notebook. It is read from the
# AZURE_STORAGE_ACCOUNT_KEY environment variable (on Databricks a secret scope read through
# dbutils.secrets.get is an alternative). The key that used to be hard-coded on this line
# has to be treated as leaked and rotated in the storage account.
storage_account_key = os.environ["AZURE_STORAGE_ACCOUNT_KEY"]
initialize_storage_account(storage_account,storage_account_key)

In [0]:
### Spark Configuration
# Options applied to the Spark session, in order: the account key for the storage
# account's dfs endpoint, then the Arrow PySpark execution flag set to "true".
spark_settings = {
    "fs.azure.account.key."+ storage_account +".dfs.core.windows.net": storage_account_key,
    "spark.sql.execution.arrow.pyspark.enabled": "true",
}
for option_name, option_value in spark_settings.items():
    spark.conf.set(option_name, option_value)

#### 6. Parsing and extracting file information

In [0]:
### Getting the data and other required information from each source
container_name = 'datascience-dataset'
# Every path found under the Source_Data folder of the container
data_content = list_directory_contents(container_name=container_name, my_dir='Source_Data')
# Connection string used later to build the BlobServiceClient
connect_str = 'DefaultEndpointsProtocol=https;AccountName={};AccountKey={}'.format(
    storage_account,
    storage_account_key,
)

#### 7. Iterating through the data and collecting the required JSON file info

In [0]:
### Subset data which is of json file format
json_data_content = list(filter(lambda path: path.endswith('.json'), data_content))

# One [source, filename, filepath] record per JSON file: the source is the second path
# segment and the filename is the last one.
json_meta = []
for json_path in json_data_content:
    path_parts = json_path.split('/')
    json_meta.append([path_parts[1], path_parts[-1], json_path])

json_meta_df = pd.DataFrame(
    json_meta, columns=['source','filename','filepath']
).reset_index(drop=True)

In [0]:
# Keep only the files of the 'dipanjan' source whose path contains 'large'
is_target_source = json_meta_df['source'] == 'dipanjan'
is_large_file = json_meta_df['filepath'].str.contains('large')
json_meta_df = json_meta_df[is_target_source & is_large_file].reset_index(drop=True)
json_meta_df.shape

Out[21]: (145, 3)

In [0]:
def data_conversion(filename, filepath,
                    base_uri="abfss://datascience-dataset@legoaistorage.dfs.core.windows.net/"):
    """Read one file through Spark and return its content as a pandas DataFrame.

    The reader is chosen from the extension of ``filename`` (text after the last '.'):
    ``json`` (multiline), ``parquet``, ``csv`` (PERMISSIVE mode) or ``txt``.

    Every supported branch now returns a pandas DataFrame. Previously only the JSON branch
    did and the others returned Spark DataFrames, which the calling loop cannot handle
    because it reads ``.shape`` and applies pandas operations.

    Args:
        filename: File name, used only to determine the extension.
        filepath: Path of the file relative to ``base_uri``.
        base_uri: Prefix prepended to ``filepath``; defaults to the container URI this
            notebook has always used.

    Returns:
        pandas.DataFrame: The file content, or an empty frame with the columns
        ``id``, ``table_name``, ``column_name`` and ``values`` when the extension is
        unsupported or the file name has no extension at all.
    """
    ## Identify the file extension
    # rpartition leaves the separator empty when there is no '.', so a name without an
    # extension falls into the "unsupported" branch instead of raising IndexError.
    _, dot, file_extension = filename.rpartition('.')
    if not dot:
        file_extension = ''
    full_filepath = base_uri + filepath

    ## Read the data based on extension
    if file_extension == 'json':
        spark_df = spark.read.option("multiline", "true").json(full_filepath)

    elif file_extension == 'parquet':
        spark_df = spark.read.parquet(full_filepath)

    elif file_extension == 'csv':
        spark_df = spark.read.format("csv").option("mode", "PERMISSIVE").load(full_filepath)

    elif file_extension == 'txt':
        spark_df = spark.read.text(full_filepath)

    else:
        return pd.DataFrame(columns=['id','table_name','column_name','values'])

    return spark_df.toPandas()

In [0]:
def meta_information_check(data_df,filename):
    """Normalise a raw column-level frame so it carries the columns the pipeline needs.

    Steps, in order:
      1. ``id``: kept when present and unique; regenerated as 0..n-1 when present but not
         unique; copied from ``column_id`` when only that exists; otherwise created from
         the row position.
      2. ``column_name``: copied from ``type`` when missing, else set to ''.
      3. ``table_name``: set to ``filename`` without its extension when missing.
      4. ``column_values`` / ``value`` are renamed to ``values`` and every entry is
         converted to a ``list``.
      5. ``master_id`` is built as ``dataset_name$$##$$table_name$$##$$column_name``
         and must be unique per row.

    The input frame is copied first, so the caller's object is never modified.

    Args:
        data_df: pandas DataFrame with a ``dataset_name`` column and one of
            ``values`` / ``column_values`` / ``value``.
        filename: Source file name, used as fallback for ``table_name``.

    Returns:
        pandas.DataFrame: The normalised copy.

    Raises:
        KeyError: If ``dataset_name`` or the values column is missing.
        AssertionError: If a value is not a list after conversion, or if ``master_id``
            is not unique across rows.
    """
    # Copy so that none of the column assignments below leak into the caller's frame.
    data_df = data_df.copy()

    ### ID creation for each df
    if 'id' in data_df.columns:

        if data_df['id'].nunique() != data_df.shape[0]:
            data_df['id'] = list(range(len(data_df)))

    elif 'column_id' in data_df.columns:
        data_df['id'] = data_df['column_id']
    else:
        data_df = data_df.reset_index(drop=True).reset_index()
        data_df = data_df.rename(columns={'index':'id'})

    #### Mandatory column checks
    if 'column_name' not in data_df.columns:

        if 'type' in data_df.columns:
            data_df['column_name'] = data_df['type']
        else:
            # TODO: the right fallback for a missing column name is still to be confirmed
            data_df['column_name'] = ''

    if 'table_name' not in data_df.columns:
        data_df['table_name'] = filename.rsplit('.',1)[0]

    data_df = data_df.rename(columns={'column_values':'values','value':'values'})

    # Column-wise conversion; unlike a row-wise apply it also works on an empty frame.
    data_df['values'] = [list(val) for val in data_df['values']]
    value_types = list(set(type(val) for val in data_df['values'].tolist()))
    print('Unique Type of values:', value_types)

    # Check every observed type (not just an arbitrary first one) and raise explicitly so
    # the validation is not stripped when Python runs with -O.
    if any(value_type is not list for value_type in value_types):
        raise AssertionError('values column must contain only lists, found: {}'.format(value_types))

    data_df['master_id'] = (
        data_df['dataset_name'] + '$$##$$' + data_df['table_name'] + '$$##$$' + data_df['column_name']
    )
    if data_df['master_id'].nunique() != data_df.shape[0]:
        raise AssertionError('master_id is not unique for every row')

    return data_df

#### 8. Storing the Information to Azure Blob Storage

In [0]:
### Blob storage based configurations
# Build the blob service client from the connection string, then keep a container
# client in the global `container_client`, which save_df_to_blob uses for its uploads.
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
# NOTE(review): the name passed here carries a '/Features' suffix after the container
# name, presumably so uploads land in that folder; confirm this is the intended usage.
container_client = blob_service_client.get_container_client('datascience-dataset/Features')

In [0]:
def save_df_to_blob(df,output_file):
    """Serialise a DataFrame to CSV and upload it through the global ``container_client``.

    Args:
        df: pandas DataFrame to upload (written without the index).
        output_file: Blob name to upload to.

    Returns:
        int: 1 when the upload succeeded, 0 when an exception occurred. The traceback of
        the failure is printed so the reason is visible instead of being lost.

    Note:
        Only ``Exception`` subclasses are turned into a 0 status; KeyboardInterrupt and
        SystemExit propagate so a running notebook cell can still be interrupted.
        NOTE(review): ``upload_blob`` is called without an overwrite flag; confirm whether
        re-uploading to an existing blob name should be allowed.
    """
    try:
        csv_payload = df.to_csv(index=False, encoding = "utf-8")

        # Instantiate a new BlobClient
        blob_client = container_client.get_blob_client(output_file)

        # upload data
        blob_client.upload_blob(csv_payload, blob_type="BlockBlob")

    except Exception:
        print(traceback.format_exc())
        return 0

    return 1

In [0]:
# Run the pipeline (read -> metadata check -> feature extraction -> upload) for every row of
# json_meta_df. A failure in one file is reported and the loop carries on with the next one.
for row, (filename, filepath) in enumerate(zip(json_meta_df['filename'], json_meta_df['filepath'])):

    status = 0  # value returned by save_df_to_blob: 1 = uploaded, 0 = not uploaded
    try:

        print('Feature Creation Started!!')

        ### Extracting data from json
        json_df = data_conversion(filename,filepath)

        # Nothing to extract from an empty file; move on without the summary prints
        if json_df.shape[0] ==0:
            continue

        ### Extracting meta data
        meta_json_df = meta_information_check(json_df,filename)
        print('Meta Data Row Count: ',meta_json_df.shape)

        ### Extracting features from data
        df = extract_features_to_csv(meta_json_df)

        ### Saving the output to the folder
        full_filepath = filename.replace('.json','.csv').replace('.parquet','.csv').replace('.xlsx','.csv').replace('.txt','.csv').replace('.csv','_feats.csv')

        status = save_df_to_blob(df,full_filepath)

    except Exception:
        # format_exc already contains the exception message, so it is printed only once
        print(traceback.format_exc())
        print('Feature Creation Failed!!')
    else:
        # Only claim completion when no step raised
        print('Feature Creation Completed!!')

    print(row,status)

#### 9. Iterating through the data and combining features

In [0]:
### Getting the data and other required information from each source
container_name = 'datascience-dataset'
# Every path found under the Features folder of the container
data_content = list_directory_contents(container_name=container_name, my_dir='Features')

In [0]:
# Read every listed feature file and tag its rows with the file it came from. The frames
# are collected in a list and concatenated once, because concatenating inside the loop
# copies all previously loaded rows on every iteration.
feature_frames = []
for i,filename in enumerate(data_content):
    full_filepath = "abfss://datascience-dataset@legoaistorage.dfs.core.windows.net/"+filename
    feats_csv = spark.read.format("csv").option("header","true").load(full_filepath)
    feats_pandas = feats_csv.toPandas()
    # File name without folder and '.csv' suffix, e.g. '<name>_feats'
    feats_pandas['file_name'] = filename.split('/')[1].replace('.csv','')
    feature_frames.append(feats_pandas)
    # Countdown of the files still to be processed
    print(len(data_content)-i)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was listed
features_df = pd.concat(feature_frames) if feature_frames else pd.DataFrame()

In [0]:
### Blob storage based configurations
# Target blob name for the combined feature table
filepath = "model_feats_data.csv"
# Re-point the global container client used by save_df_to_blob before uploading
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client('datascience-dataset/Model_Data')
save_df_to_blob(df=features_df, output_file=filepath)

#### 10. Evaluating Execution Performance

In [0]:
# Parse execution_time into timedeltas, renumber the rows, then express the
# duration as a number of seconds in a separate column.
one_second = np.timedelta64(1, 's')
features_df = features_df.assign(
    execution_time=pd.to_timedelta(features_df['execution_time'])
).reset_index(drop=True)
features_df['execution_seconds'] = features_df['execution_time'] / one_second

In [0]:
# Distinct (repo, file, execution time) combinations with a fresh 0..n-1 index
timing_columns = ['repo_name','file_name','execution_time','execution_seconds']
new_features_df = features_df[timing_columns].drop_duplicates().reset_index(drop=True)

In [0]:
### Blob storage based configurations
# Target blob name for the execution-time summary
filepath = "feature_execution_data.csv"
# Re-point the global container client used by save_df_to_blob before uploading
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client('datascience-dataset/Model_Data')
save_df_to_blob(df=new_features_df, output_file=filepath)

In [0]:
### Getting the data and other required information from each source
container_name = 'datascience-dataset'
data_content = list_directory_contents(container_name=container_name, my_dir='Source_Data')

# For each source JSON file, the file_name its feature file is expected to carry
# ('<name>.json' -> '<name>_feats')
expected_feature_names = []
for source_path in data_content:
    if source_path.endswith('.json'):
        expected_feature_names.append(source_path.split('/')[-1].replace('.json','_feats'))
all_files = pd.DataFrame(expected_feature_names, columns=['file_name'])

In [0]:
# Left-join the expected files with the ones present in features_df; rows without a
# repo_name are the files for which no features were found.
merged_df = all_files.merge(features_df[['repo_name','file_name']], how='left')
merged_df.loc[merged_df['repo_name'].isnull(), 'file_name'].tolist()

#### Appendix

In [0]:
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

STOPWORDS_ENGLISH = stopwords.words("english")

def tokenise(values):
    """Turn a collection of strings into a list of lower-cased, stopword-free tokens.

    Non-empty entries are joined with spaces; every character that is not alphanumeric,
    whitespace or an apostrophe is dropped; the text is lower-cased and split with
    ``nltk.word_tokenize``; tokens listed in ``STOPWORDS_ENGLISH`` are removed.

    Args:
        values: Iterable of strings.

    Returns:
        list: The remaining tokens, in their original order.
    """
    non_empty = [item for item in values if len(item) >= 1]
    text = " ".join(non_empty)

    # stopwords need apostrophe
    kept_chars = [ch for ch in text if ch.isalnum() or ch.isspace() or ch == "'"]
    normalised = "".join(kept_chars).lower()

    tokens = []
    for candidate in nltk.word_tokenize(normalised):
        if len(candidate) < 1 or candidate in STOPWORDS_ENGLISH:
            continue
        tokens.append(candidate)
    return tokens

In [0]:
# Path of the pre-trained 400-dimension paragraph-vector model. A plain string is used
# because the literal has no placeholders to interpolate.
filename = "/dbfs/par_vec_trained_model_400.pkl"
# NOTE(review): this is a .pkl file; only load model files that come from a trusted source.
model = Doc2Vec.load(filename)

In [0]:
# Seed the model's random state (presumably to make the inference below repeatable).
model.random.seed(13)
# Values of one column, picked by its hard-coded id, from meta_json_df, the frame left
# behind by the last iteration of the feature-creation loop. The [0] fails with
# IndexError when that id is not present in the frame.
col_values = meta_json_df[meta_json_df['id']=='nov2019_deliveries_wide_runs_11']['values'].tolist()[0]
tokens = tokenise(col_values)
# NOTE(review): `steps` is a keyword of older gensim releases; confirm it is accepted by
# the installed version.
inferred = model.infer_vector(tokens, steps=20, alpha=0.025)

In [0]:
def alphaAndNumericMatch(value):
    """Classify a value by the kinds of characters in its string form.

    Args:
        value: Any object; it is converted with ``str()`` before inspection.

    Returns:
        str: One of
            'alphanumeric' - at least one digit together with at least one ASCII letter
                             or one of the listed special characters,
            'numeric'      - digits but neither letters nor listed special characters,
            'alpha'        - ASCII letters and no digits,
            'others'       - anything else (including the empty string and text made of
                             special characters only).
    """
    # Imported here because the notebook never imports `re`; without it the function
    # fails with NameError on first use.
    import re

    value = str(value)
    charCount = len(re.findall(r'[a-zA-Z]', value))
    numCount = len(re.findall(r'\d', value))
    # Same character set as before, written as a raw string with the '+' to '/' range
    # spelled out (+ , - . /) and the brackets escaped, so no invalid-escape or
    # nested-set warnings are emitted.
    specialCharCount = len(re.findall(r"[!#&'()*+,\-./:;<=>?@\[\]^_`{|}~]", value))

    if (charCount >0 or specialCharCount) and numCount>0:
        return 'alphanumeric'
    elif numCount > 0:
        return 'numeric'
    elif charCount > 0:
        return 'alpha'
    else:
        return 'others'
