In [1]:
import datrie
import string
import pandas as pd
import numpy as np
import numba as nb
import os
import warnings

from numba import njit,jit
from numba.typed import Dict, List
from numba import types
from collections import namedtuple

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# ========== 3.1 Tag Name to Tag Master Name Mapping ==========

- Load all dataset

In [2]:
df_list = []
for dirpath, dirnames, filenames in os.walk('data'):
    for filename in filenames:
        df = pd.read_csv(dirpath+'/'+filename, low_memory=False, header=1)
        df = df[['Date', 'AccountId', 'AccountName', 'DepartmentId', 'DepartmentName', 'InstanceId', 'ResourceGroup', 'Tags']]
        df['ResourceName'] = df['InstanceId'].str.split('/').apply(lambda x: x[-1])
        df_list.append(df.drop(columns=['InstanceId']))
raw_data = pd.concat(df_list)

In [3]:
data = raw_data.dropna(subset=['Tags', 'ResourceGroup']).reset_index()

In [5]:
x = data.iloc[0]['Tags']
x = x.replace('"', '')
x = x[3:-1]
x.split(',  ')

['Client: Internal', 'Environment: Production', 'Type: Shared']

In [4]:
# all_tags_list = data['Tags'].str.replace(' ', '').tolist()
# all_tags_list = df['Tags'].apply(lambda x: json.loads(x)).tolist()
# all_tags_list = data['Tags'].apply(lambda x: namedtuple('Tags', json.loads(x).keys())._make(json.loads(x))).tolist()
# all_tags_list = data['Tags'].apply(lambda x: )

In [6]:
# from numba import guvectorize
# numba_dict = Dict.empty(key_type=types.int64, value_type=types.ListType(types.string))
numba_dict = Dict.empty(key_type=types.int64, value_type=types.UniTuple(types.string, 2))
all_tags_list = data['Tags'].tolist()
# numba_list = List()
# numba_list.append('asd')
# numba_list.pop()
# for tags in all_tags_list:
#     numba_list.append(tags)

@jit(nopython=True, nogil=True, parallel=True)
# @guvectorize("numba.types.DictType(int64, UniTuple(unicode_type, 2))(ListType(unicode_type),DictType(int64, UniTuple(unicode_type, 2)))", target='cpu')
def get_all_tags_typed_dict(all_tags_list, numba_dict):
    index = np.int64(0)
    for tags in all_tags_list:
        tags = tags.replace('"', '')
        tags = tags[3:-1].split(',  ')
        for tag in tags:
            tmp = tag.split(': ')
            numba_dict[index] = (tmp[0], tmp[1])
            index += 1

    return numba_dict


all_tags_typed_dict = get_all_tags_typed_dict(all_tags_list, numba_dict)

In [7]:
# all_tags_dict = {}
# for key in all_tags_typed_dict.keys():
#     all_tags_dict[key] = np.array(all_tags_typed_dict[key]).astype('<U64')
# all_tags_dict = pd.DataFrame(all_tags_dict)
# all_tags_dict = pd.DataFrame.from_dict(all_tags_dict, orient='index').T
all_tags_dict = pd.DataFrame.from_dict(all_tags_typed_dict, orient='index')
all_tags_dict.columns = ['Tag Name', 'Tag Value']

In [8]:
tag_names = all_tags_dict.groupby('Tag Name')['Tag Value'].count()
tag_names = tag_names[tag_names > 5].to_dict()  # only use the Tag Names appeared more than 5 times
tag_values = all_tags_dict.groupby('Tag Value')['Tag Name'].count()
tag_values = tag_values[tag_values > 5].to_dict()  # only use the Tag Names appeared more than 5 times
# typed_tag_names = Dict.empty(types.unicode_type, types.int64)
# for key in tag_names.keys():
#     typed_tag_names[key] = np.int64(tag_names[key])

In [9]:
def Levenshtein_Dynamic(str1, str2):
    str1, str2 = ''.join(filter(str.isalnum, str1.lower())), ''.join(filter(str.isalnum, str2.lower()))
    if str1 in str2 or str2 in str1:
        return 0

    len_str1 = len(str1) + 1
    len_str2 = len(str2) + 1
    # 创建矩阵
    matrix = [0 for n in range(len_str1 * len_str2)]
    #矩阵的第一行
    for i in range(len_str1):
        matrix[i] = i
    # 矩阵的第一列
    for j in range(0, len(matrix), len_str1):
        if j % len_str1 == 0:
            matrix[j] = j // len_str1
    # 根据状态转移方程逐步得到编辑距离
    for i in range(1, len_str1):
        for j in range(1, len_str2):
            if str1[i - 1] == str2[j - 1]:
                cost = 0
            else:
                cost = 1
            matrix[j * len_str1 + i] = min(matrix[(j - 1) * len_str1 + i] + 1, matrix[j * len_str1 + (i - 1)] + 1,
                                           matrix[(j - 1) * len_str1 + (i - 1)] + cost)
    if matrix[-1] > min(len(str1), len(str2)) / 2:
        return 1000
    return matrix[-1]  # 返回矩阵的最后一个值，也就是编辑距离

In [69]:
import numpy as np
import sklearn.cluster

# words = "Client clien client Shared".split(" ") #Replace this line
words = list(tag_names.keys())
words = np.asarray(words)  #So that indexing with a list will work
lev_similarity = [[-Levenshtein_Dynamic(w1, w2) for w2 in words] for w1 in words]
x = list(tag_names.values())
x_sum = np.sum(x)
# y = np.argsort(x)
# preference = [0] * len(x)
# j = len(x)
# for i in y:
#     preference[i] = -j
#     j -= 1
preference = [-(1 - i / x_sum)*3 for i in x]

master_names_dict = {}
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5, preference=preference, max_iter=1000, convergence_iter=20)
affprop.fit(lev_similarity)
tag_master_names = {}
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
    tag_master_names[exemplar] = cluster.tolist()
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))
    master_names_dict[exemplar] = cluster
    # for c in cluster:
    #     print(Levenshtein_Dynamic(exemplar, c), end=', ')
    # print()

Converged after 29 iterations.
 - *Client:* Clien, Client, client, x_Client
 - *ClusterId:* ClusterId
 - *ClusterName:* ClusterName
 - *Creator:* Creator, DatabricksInstancePoolCreatorId, x_Creator
 - *DataFactoryEntityResourceId:* DataFactoryEntityResourceId
 - *Databricks-ElasticDisk:* Databricks-ElasticDisk
 - *DatabricksInstanceGroupId:* DatabricksInstanceGroupId
 - *DatabricksInstancePoolId:* DatabricksInstancePoolId
 - *Environment:* DatabricksEnvironment, Enviornment, Environment, databricks-environment, environment
 - *JobId:* JobId
 - *MappingDataflowRunId:* MappingDataflowRunId
 - *Product:* Product
 - *ResourceClass:* ResourceClass
 - *RunName:* RunName
 - *Service:* Service, x_Service
 - *SqlEndpointId:* SqlEndpointId
 - *Type:* Type, type, x_Type
 - *Vendor:* Vendor
 - *application:* application
 - *createdBy:* createdBy
 - *databricks-instance-name:* databricks-instance-name
 - *dbsql-channel:* dbsql-channel
 - *hidden-link:/subscriptions/f6650bec-0252-45ce-b7ce-8fff251c4

In [105]:
len(x[0])

28

In [33]:
master_names_df = pd.DataFrame.from_dict(master_names_dict, orient='index')
master_names_df.to_csv('Tag Master Names.csv', sep=',', header=False, index=True)

In [108]:
test_values = all_tags_dict[all_tags_dict['Tag Name'].str.contains('|'.join(tag_master_names['Environment']))].dropna()
test_values = test_values.groupby('Tag Value')['Tag Name'].count()
if '' in test_values.keys():
    test_values.pop('')
test_values = test_values.to_dict()
test_values

{'Development': 24628,
 'PoV': 21485,
 'Production': 149541,
 'Shared': 785,
 'workerenv-1002506118669650': 420,
 'workerenv-1007669364905573': 413,
 'workerenv-1273686276464348': 121,
 'workerenv-128941113832063': 477,
 'workerenv-1310124120340684': 313,
 'workerenv-1363511407098515': 428,
 'workerenv-1423586859234944': 371,
 'workerenv-158615000393787': 422,
 'workerenv-1713528652783932': 406,
 'workerenv-1778456081878316': 454,
 'workerenv-1810343225777445': 156,
 'workerenv-1850437238891750': 366,
 'workerenv-1934208234354281': 458,
 'workerenv-204037881721143': 448,
 'workerenv-2040919574862727': 291,
 'workerenv-2140633197113253': 432,
 'workerenv-2151962545592826': 518,
 'workerenv-2163878406020497': 402,
 'workerenv-2177263479227743': 484,
 'workerenv-2227347226153301': 121,
 'workerenv-2233824289521111': 482,
 'workerenv-2395893230463898': 140,
 'workerenv-2438246694412658': 467,
 'workerenv-2516954196764457': 438,
 'workerenv-2577205424225732': 489,
 'workerenv-26577386533924

In [61]:
words = list(test_values.keys())
words = np.asarray(words)  #So that indexing with a list will work
lev_similarity = [[-Levenshtein_Dynamic(w1.lower(), w2.lower()) for w2 in words] for w1 in words]
x = list(test_values.values())
x_sum = np.sum(x)
y = np.argsort(x)
# preference = [0] * len(x)
# j = len(x)
# for i in y:
#     preference[i] = -j
#     j -= 1
preference = [-(1-i/x_sum) for i in x]

master_values_dict = {}
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed",
                                              damping=0.5,
                                              verbose=True,
                                              preference=preference,
                                              max_iter=1000,
                                              convergence_iter=50)
affprop.fit(lev_similarity)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))
    master_values_dict[exemplar] = cluster
    # for c in cluster:
    #     print(jaro.jaro_winkler_metric(exemplar.lower(), c.lower()), end=', ')
    # print()

Converged after 50 iterations.
 - *Cloud:* Cloud
 - *Standard 2B Model:* Standard 2B Model
 - *Standard F2B Model:* Standard F2B Model
 - *Standard Model- F2B Model:* Standard Model- F2B Model
 - *atabricks:* atabricks


In [49]:
master_values_df = pd.DataFrame.from_dict(master_values_dict, orient='index')
master_values_df.to_csv('Tag Master Values.csv', sep=',', header=False, index=True)

In [166]:
str1 = "acbcbcef"
str2 = "abcbced"
M = [[0 for i in range(len(str1) + 1)] for i in range(len(str2) + 1)]
xmax = 0
xindex = []
for i in range(1, len(str2) + 1):
    for j in range(1, len(str1) + 1):
        if (str2[i - 1] == str1[j - 1]):
            M[i][j] = M[i - 1][j - 1] + 1
            if M[i][j] > xmax:
                xmax = M[i][j]
                xindex = [(i, j)]
            elif M[i][j] == xmax:
                xindex.append((i, j))
lcstr = ",".join([str1[i - xmax:i] for i in xindex])
print(lcstr)

TypeError: unsupported operand type(s) for -: 'tuple' and 'int'

In [16]:
Levenshtein_Dynamic('DatabricksInstancePoolId'.lower(), 'databricks-instance-name')

7

In [10]:
from Trie import Node,add, search
def generate_tree_hash(tag_names):
    n = Node("")
    for tag in tag_names:
        add(n, tag, weight=tag_names[tag])
    return n

n = generate_tree_hash(tag_values)
print("search clien: ")
for key, node in search(n, u'ABSA', is_case_sensitive=True):
    print(key, node.weight)


search clien: 
ABSACLOUDQA 2310
ABSARBB 1651
ABSACF 1588
ABSACLOUD313 409
ABSA 228


In [147]:
def generate_tree(tag_names, file_name, save):
    trie = datrie.Trie(string.ascii_letters+string.digits+' -_')
    for tag in tag_names:
        trie[tag] = tag_names[tag]
    if save:
        trie.save(file_name)
    return trie

# trie_tag_names = generate_tree(tag_names, 'Tag Name Trie.txt', True)
trie_tag_values = generate_tree(test_values, 'Tag Value Trie.txt', True)

In [149]:
from Get_Similar_Words import search

# trie_tag_names = datrie.Trie.load('Tag Name Trie.txt')
# state_tag_names = datrie.State(trie_tag_names)
# state_last_tag_names = datrie.State(trie_tag_names)
trie_tag_values = datrie.Trie.load('Tag Value Trie.txt')
state_tag_values = datrie.State(trie_tag_values)
state_last_tag_values = datrie.State(trie_tag_values)

# tag_name = 'clien'
# result_tag_names = search(trie_tag_names, state_last_tag_names, state_tag_names, tag_name, tag_name, '', [], {}, 0)
# print(tag_name, result_tag_names)
tag_value = 'ABSACLOUD313'
result_tag_values = search(trie_tag_values, state_last_tag_values, state_tag_values, tag_value, tag_value, '', [], {}, 0)
print(tag_value, result_tag_values)

ABSACLOUD313 [['ABSA', 1000, 228], ['ABSACLOUD313', 0, 409], ['ABSACLOUDQA', 3, 2310], ['ABSACF', 1000, 1588], ['ARM', 1000, 2567], ['Absa', 1000, 11034], ['HC', 1000, 453], ['RMB', 1000, 4038]]


In [13]:
# trie_tag_values = datrie.Trie.load('Tag Value Trie.txt')
# trie_tag_values.prefix_items(tag_value)

In [107]:
import Levenshtein
import jaro

Levenshtein.distance('Absa'.lower(), 'RMB'.lower())

4