In [0]:
!python -m pip install --upgrade pip

In [0]:
!pip install -r requirements38.txt

In [0]:
!pip install azure-storage-file-datalake
!pip install adlfs
!pip install fsspec

In [0]:
!pip install trieregex

In [0]:
import multiprocessing as mp
import sys
from datetime import datetime
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ast import literal_eval
import string

In [0]:
# Log the wall-clock time at which this notebook run started.
print(f'Started at {datetime.now()}')

#### Connect to Azure Data Storage

In [0]:
import os, uuid, sys
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
from azure.storage.blob import BlobServiceClient

In [0]:
def initialize_storage_account(storage_account_name, storage_account_key):
    """Create the module-level ``service_client`` for an Azure Data Lake account.

    The client is built against ``https://<account>.dfs.core.windows.net`` using
    the given account key as credential and stored in the global
    ``service_client``.

    Any exception raised while building the client is printed, not re-raised;
    in that case ``service_client`` is not (re)assigned.
    """
    global service_client

    try:
        account_url = "{}://{}.dfs.core.windows.net".format("https", storage_account_name)
        service_client = DataLakeServiceClient(
            account_url=account_url,
            credential=storage_account_key,
        )
    except Exception as err:
        print(err)

In [0]:
def list_directory_contents(container_name,my_dir):
    """Return the names of all paths found under ``my_dir`` in a container.

    Uses the global ``service_client`` to open the file system named
    ``container_name`` and iterates over ``get_paths(path=my_dir)``.

    Errors are printed rather than raised; whatever names were collected before
    the error (possibly none) are returned.
    """
    collected_names = []

    try:
        container_client = service_client.get_file_system_client(file_system=container_name)
        for entry in container_client.get_paths(path=my_dir):
            collected_names.append(entry.name)
    except Exception as err:
        print(err)

    return collected_names

In [0]:
# Storage account used by the whole notebook.
storage_account = "legoaistorage"

# SECURITY: the account key must never live in the notebook source. It is read
# from the AZURE_STORAGE_KEY environment variable instead (a secret scope /
# key vault lookup works equally well). The key that used to be hard-coded in
# this cell has been exposed and should be rotated in Azure.
storage_account_key = os.environ.get("AZURE_STORAGE_KEY")
if not storage_account_key:
    raise RuntimeError(
        "AZURE_STORAGE_KEY is not set; provide the storage account key via the "
        "environment (or a secret store) before running this notebook."
    )

initialize_storage_account(storage_account,storage_account_key)

In [0]:
### Spark Configuration
# Register the storage account key in the Spark session config under the
# account's dfs.core.windows.net endpoint (used for the abfss:// reads below).
spark.conf.set("fs.azure.account.key."+ storage_account +".dfs.core.windows.net", storage_account_key)
# Enable the Arrow setting for PySpark — presumably to speed up the
# Spark -> pandas conversions (toPandas) used later; see the Spark docs.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [0]:
# Root abfss:// URI of the 'datascience-dataset' container on the storage account.
prefix_path = "abfss://datascience-dataset@legoaistorage.dfs.core.windows.net/"
# Connection string built from the account name and key.
# It embeds the secret account key, so do not print or log it.
connect_str = 'DefaultEndpointsProtocol=https;AccountName={};AccountKey={}'.format(storage_account,storage_account_key)

In [0]:
### Getting the data and other required information from each source
container_name = 'datascience-dataset'
# All path names found under 'Source_Data' in the container.
# list_directory_contents prints (does not raise) on failure, so this can be
# an empty or partial list.
data_content = list_directory_contents(container_name,'Source_Data')

In [0]:
### Keep only the paths that point at .json files
json_data_content = list(filter(lambda content: content.endswith('.json'), data_content))

# One metadata record per JSON file, derived from the '/'-separated path:
#   source   -> 2nd path component
#   location -> 3rd component from the end
#   reponame -> 2nd component from the end (parent folder)
#   filename -> last component
json_meta = []
for cont in json_data_content:
    parts = cont.split('/')
    json_meta.append([parts[1], parts[-3], parts[-2], parts[-1], cont])

json_meta_df = pd.DataFrame(
    json_meta,
    columns=['source', 'location', 'reponame', 'filename', 'filepath'],
).reset_index(drop=True)

In [0]:
def data_conversion(filename,filepath):
    """Load one source file from the data lake, choosing the reader by extension.

    Parameters
    ----------
    filename : str
        Base name of the file; the text after its last '.' selects the reader.
    filepath : str
        Path of the file inside the ``datascience-dataset`` container.

    Returns
    -------
    DataFrame
        * ``json``    -> pandas DataFrame (Spark multiline JSON read, then ``toPandas()``)
        * ``parquet`` / ``csv`` / ``txt`` -> whatever the Spark reader returns
        * anything else, including a name with no extension -> empty pandas
          DataFrame with columns ``id, table_name, column_name, values``

    NOTE(review): the parquet/csv/txt branches return the Spark reader result
    without ``toPandas()``, unlike the json and fallback branches. Callers that
    use pandas attributes such as ``.shape`` will only work for json — confirm
    whether those branches should be converted as well.
    """
    # A name without any '.' has no extension: route it to the empty-frame
    # fallback instead of failing with IndexError on the split result.
    _, dot, file_extension = filename.rpartition('.')
    if not dot:
        file_extension = ''

    full_filepath = "abfss://datascience-dataset@legoaistorage.dfs.core.windows.net/"+filepath

    ## Read the data based on extension
    if file_extension == 'json':
        data_dict = spark.read.option("multiline", "true").json(full_filepath)
        data = data_dict.toPandas()

    elif file_extension == 'parquet':
        data = spark.read.parquet(full_filepath)

    elif file_extension == 'csv':
        data = spark.read.format("csv").option("mode", "PERMISSIVE").load(full_filepath)

    elif file_extension == 'txt':
        data = spark.read.text(full_filepath)

    else:
        # Unsupported or missing extension: empty frame with the expected schema.
        data = pd.DataFrame(columns=['id','table_name','column_name','values'])

    return data

In [0]:
def meta_information_check(data_df,filename,reponame):
    """Normalise the metadata columns of one source table and build ``master_id``.

    The input frame is copied first, so the caller's DataFrame is never
    modified as a side effect.

    Steps
    -----
    1. ``id``: keep an existing column if its values are unique (otherwise
       renumber 0..n-1); else copy ``column_id``; else use the row position.
    2. ``column_name``: copied from ``type`` when missing, else ''.
       ``table_name``: ``filename`` without its last extension when missing.
    3. ``column_values`` / ``value`` are renamed to ``values`` and every entry
       is converted to a ``list``.
    4. ``repo_name``: the ``dataset_name`` column when ``reponame`` is
       'swastik', otherwise ``reponame``.
    5. ``master_id`` = repo_name + '$$##$$' + table_name + '$$##$$' + column_name.

    Parameters
    ----------
    data_df : pandas.DataFrame
    filename : str
    reponame : str

    Returns
    -------
    pandas.DataFrame
        New frame with ``id``, ``column_name``, ``table_name``, ``values``,
        ``repo_name`` and ``master_id`` populated.

    Raises
    ------
    KeyError
        If no ``values`` / ``column_values`` / ``value`` column is present.
    AssertionError
        If the resulting ``master_id`` values are not unique.
    """
    # Work on a copy: several steps below assign columns in place.
    data_df = data_df.copy()

    ### ID creation for each df
    if 'id' in data_df.columns:
        if data_df['id'].nunique() != data_df.shape[0]:
            # Existing ids are not unique: replace them with row positions.
            data_df['id'] = list(range(len(data_df)))
    elif 'column_id' in data_df.columns:
        data_df['id'] = data_df['column_id']
    else:
        data_df = data_df.reset_index(drop=True).reset_index()
        data_df = data_df.rename(columns={'index':'id'})

    #### Mandatory column checks
    if 'column_name' not in data_df.columns:
        if 'type' in data_df.columns:
            data_df['column_name'] = data_df['type']
        else:
            # TODO: fallback for sources with neither column_name nor type is still open.
            data_df['column_name'] = ''

    if 'table_name' not in data_df.columns:
        data_df['table_name'] = filename.rsplit('.',1)[0]

    data_df = data_df.rename(columns={'column_values':'values','value':'values'})
    if 'values' not in data_df.columns:
        raise KeyError("values: expected one of 'values', 'column_values' or 'value' columns")

    # Materialise every entry as a plain list (tuples, arrays, ... included).
    data_df['values'] = [list(entry) for entry in data_df['values']]
    value_types = list(set([type(val) for val in data_df['values'].tolist()]))
    print('Unique Type of values:', value_types)

    # Check every entry, not just one arbitrary element of the type set.
    assert all(isinstance(val, list) for val in data_df['values'].tolist())

    if reponame == 'swastik':
        data_df['repo_name'] = data_df['dataset_name']
    else:
        data_df['repo_name'] = reponame

    separator = '$$##$$'
    data_df['master_id'] = [
        repo + separator + table + separator + column
        for repo, table, column in zip(
            data_df['repo_name'], data_df['table_name'], data_df['column_name']
        )
    ]
    assert data_df['master_id'].nunique()==data_df.shape[0]

    return data_df

In [0]:
# The error handler below needs `traceback`; it is not imported by the cells
# above, so without this import any failure turned into a NameError that hid
# the real exception.
import traceback

# Build the metadata frame for the JSON source files.
# Currently limited to the first file; the commented bound processes all of them.
for row in range(1): #len(json_meta_df)): 
    
    # NOTE(review): status is initialised and printed but never updated —
    # confirm what it is meant to report.
    status = 0
    try:
        
        print('Feature Creation Started!!')
        ## Required variables
        reponame,filename,filepath = json_meta_df[['reponame','filename','filepath']].iloc[row].tolist()

        ### Extracting data from json    
        json_df = data_conversion(filename,filepath)

        # Nothing to process for an empty file: move on to the next one.
        if json_df.shape[0] ==0:
            continue
        
        ### Extracting meta data    
        meta_json_df = meta_information_check(json_df,filename,reponame)
        print('Meta Data Row Count: ',meta_json_df.shape)
        
    except Exception as e:
        # Best-effort: report the failure with its full traceback and continue.
        print(traceback.format_exc())
        print(e)

    print('Feature Creation Completed!!')
    print(row,status)

In [0]:
def special_token_repl(text: str, suffix: str):
    """Replace punctuation in ``text`` with spaces and collapse repeated spaces.

    Parameters
    ----------
    text : str
        Name to clean (e.g. a table or column name).
    suffix : str
        Appended to ``'unknown'`` to form the placeholder for blank names.

    Returns
    -------
    str
        The cleaned text, or ``'unknown' + suffix`` when nothing but
        whitespace is left (empty input, or a name made only of punctuation
        and/or whitespace).
    """
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    replaced_text = re.sub(' +', ' ', text.translate(punctuation_to_space))

    # A punctuation-only name collapses to a single space, not to '', so test
    # for "blank" rather than for the empty string.
    if replaced_text.strip() == '':
        replaced_text = 'unknown' + suffix

    return replaced_text

In [0]:
def normalise_whitespace(data):
    """Trim a string and squeeze each run of 2+ whitespace characters to one space.

    Non-string inputs are returned unchanged. A single whitespace character
    (e.g. one tab) inside the text is left as is.
    """
    if not isinstance(data, str):
        return data

    trimmed = data.strip()
    return re.sub(r"\s{2,}", " ", trimmed)

In [0]:
# Lower-cased tokens that stand for "no value": bare, double-quoted and
# single-quoted variants of each token.
ignoreList = [
    '#na', '#n/a', 'na', 'n/a', 'none', 'nan', 'blank', 'blanks', 'nil', 'n.a.', 'n.a',
    '"#na"', '"#n/a"', '"na"', '"n/a"', '"none"', '"nan"', '"blank"', '"blanks"', '"nil"', '"n.a."', '"n.a"',
    "'#na'", "'#n/a'", "'na'", "'n/a'", "'none'", "'nan'", "'blank'", "'blanks'", "'nil'", "'n.a.'", "'n.a'",
]

def additional_processing(value):
    """Clean a single raw value.

    Returns '' when the value is ``None``, null according to ``pd.isnull``, or
    its lower-cased string form is one of the tokens in ``ignoreList``.
    Otherwise the value is converted to ``str``, non-breaking spaces become
    regular spaces, surrounding whitespace is stripped, and the result is
    passed through ``removeASCII``.
    """
    is_missing = value is None or pd.isnull(value) or str(value).lower() in ignoreList
    if is_missing:
        return ''

    text = str(value).replace('\xa0', ' ').strip()
    return removeASCII(text)

In [0]:
def normalise_string_whitespace(col_values):
    """Normalise whitespace in a row's values and drop values that repeat its names.

    ``col_values`` is laid out as
    ``[master_id, id, dataset_name, table_name, column_name, value, value, ...]``.
    Every value after the five leading slots goes through
    ``normalise_whitespace``; values whose lower-cased string form equals the
    lower-cased dataset, table or column name are removed.

    Returns a new list with the same five leading slots followed by the kept
    values.
    """
    master_id = col_values[0]
    row_id = col_values[1]
    dataset_name = col_values[2]
    table_name = col_values[3]
    column_name = col_values[4]

    kept_values = []
    for cleaned in list(map(normalise_whitespace, col_values[5:])):
        # Skip entries that merely repeat the dataset/table/column name.
        if str(cleaned).lower() not in [dataset_name.lower(), table_name.lower(), column_name.lower()]:
            kept_values.append(cleaned)

    return [master_id, row_id, dataset_name, table_name, column_name] + kept_values

In [0]:
#### Keep only ASCII characters (despite the name, NON-ASCII characters are removed)
def removeASCII(strs):
    """Return ``str(strs)`` with every character whose code point is >= 128 dropped."""
    return str(strs).encode('ascii', 'ignore').decode('ascii')

In [0]:
def cleaning_data(col_values):
    """Clean the values of one row.

    ``col_values`` is laid out as
    ``[master_id, id, dataset_name, table_name, column_name, value, value, ...]``.
    Each value after the five leading slots is passed through
    ``additional_processing``.

    Returns only the list of cleaned values; the five leading slots are not
    part of the result.
    """
    master_id, row_id, dataset_name = col_values[0], col_values[1], col_values[2]
    table_name, column_name = col_values[3], col_values[4]
    raw_values = col_values[5:]

    # Cleaned names are computed here but are not part of the returned result.
    table_name_clean = special_token_repl(table_name, suffix='_table_name')
    column_name_clean = special_token_repl(column_name, suffix='_column_name')

    return list(map(additional_processing, raw_values))

In [0]:
def checkInt(strs):
    """Return 1 if ``strs`` is an integer or can be parsed as one, else 0.

    * ``int`` instances (including ``bool``, which is an ``int`` subclass) -> 1
    * ``float`` instances -> 0, even when they hold a whole number
    * anything else -> 1 if ``int(strs)`` succeeds, 0 if it raises

    Only ordinary exceptions are treated as "not an integer";
    ``KeyboardInterrupt`` / ``SystemExit`` are no longer swallowed.
    """
    if isinstance(strs, int):
        return 1
    if isinstance(strs, float):
        return 0

    try:
        int(strs)
    except Exception:
        return 0
    return 1

In [0]:
### Check whether the data is of float type
def checkFloat(strs):
    """Return 1 if ``strs`` is a float or parses as a finite non-integer number, else 0.

    * ``float`` instances -> 1 (decided by type alone)
    * values accepted by ``checkInt`` -> 0 (they count as integers)
    * anything else -> 1 if ``float(strs)`` succeeds and the result is finite;
      0 if parsing fails or yields ``inf``, ``-inf`` or ``nan``

    Only ordinary exceptions are treated as "not a float";
    ``KeyboardInterrupt`` / ``SystemExit`` are not swallowed.
    """
    if isinstance(strs, float):
        return 1

    try:
        if checkInt(strs):
            return 0
        parsed = float(strs)
    except Exception:
        return 0

    # Reject every non-finite parse result, not just positive infinity:
    # strings such as '-inf' or 'nan' are not usable float values.
    return 1 if np.isfinite(parsed) else 0

In [0]:
def featureCreation(values):
    """Print the share of integer-like and float-like entries in ``values``.

    Each entry is scored with ``checkInt`` / ``checkFloat`` (1 or 0) and the
    mean of each score list is printed as ``int_ratio float_ratio``.
    Nothing is returned.
    """
    # Distinct / total counts are computed but not used in the output yet.
    uniq_clean_values = list(set(values))
    total_vals, uniq_vals = len(values), len(uniq_clean_values)

    int_flags = list(map(checkInt, values))
    int_ratio = np.mean(int_flags)
    float_flags = list(map(checkFloat, values))
    float_ratio = np.mean(float_flags)

    print(int_ratio,float_ratio)

In [0]:
# Clean the values of each metadata row and print its type ratios.
# Currently limited to the first row; the commented bound processes all rows.
for eachRow in range(1):#meta_json_df.shape[0]):
    meta_row = meta_json_df.iloc[eachRow]

    # cleaning_data expects the five metadata slots
    # [master_id, id, dataset_name, table_name, column_name] in front of the
    # values. Passing the bare values list made it treat the first five values
    # as metadata and silently drop them.
    row_with_meta = [
        meta_row['master_id'],
        meta_row['id'],
        meta_row['repo_name'],
        meta_row['table_name'],
        meta_row['column_name'],
    ] + list(meta_row['values'])

    clean_row_values = cleaning_data(row_with_meta)
    featureCreation(clean_row_values)

In [0]:
import re
from trieregex import TrieRegEx as TRE

# Experiment: feed a date regex to TrieRegEx.
# NOTE(review): TrieRegEx builds a pattern from literal words and, per its
# README, presumably escapes regex metacharacters — so this "word" would be
# matched literally rather than as a date pattern. Confirm; the plain `re`
# call in the next cell is the direct way to match the pattern.
words = [r'\d{4}-[0-1][0-9]-[0-3][0-9]']  # raw string: '\d' is an invalid escape in a normal literal

# Build the trie from the word list (word(s) can be added upon instance creation, or after)
tre = TRE(*words)

# Regex pattern generated from the trie
print(tre.regex())

# Add boundary context and compile for matching
pattern = re.compile(f'\\b{tre.regex()}\\b')  # OR rf'\b{tre.regex()}\b'
print(pattern)
pattern.findall("check in January 2022-01-01")

In [0]:
# Match the date pattern directly with `re`, without going through the trie.
re.compile(words[0]).findall('check in January 2022-01-01')