# **Install Required Third-Party Dependencies**

In [2]:
try:
  !pip install -q fuzzywuzzy metaphone whoosh jellyfish
except Exception:
    pass
#
# import tensorflow as tf
# !wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
# !unzip ngrok-stable-linux-amd64.zip
# !pip install tensorboardcolab
# !wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
# !unzip ngrok-stable-linux-amd64.zip
# LOG_DIR = './log'
# get_ipython().system_raw('tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'.format(LOG_DIR))
# get_ipython().system_raw('./ngrok http 6006 &')
# ! curl -s http://localhost:4040/api/tunnels | python3 -c "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"
# # **Load the Data set**
# from google.colab import files
# data =  files.upload()

# **Load Required Libraries**

In [3]:
import io
import datetime
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from tensorflow.keras.preprocessing.text import Tokenizer, hashing_trick, one_hot, text_to_word_sequence

from tensorflow.keras.optimizers import RMSprop
from jellyfish import jaro_winkler, levenshtein_distance, soundex
from whoosh.analysis import StandardAnalyzer
from metaphone import doublemetaphone
from fuzzywuzzy import fuzz



# **Taxonomy Declaration**

In [4]:
# Storage-related setting names and fragments. The '.blob.core.windows.net'
# suffix and 'fs.azure.account.key.' prefix are Azure Blob Storage identifiers.
# None of these storage constants is referenced by the code visible in this
# notebook.
ACCOUNT_NAME = 'accountName'
ACCOUNT_KEY = 'accountKey'
CONTAINER_NAME = 'containerName'
ANALYTICS_EXTERNAL = 'analytics-external'
BLOB_END_SUFFIX = '.blob.core.windows.net'
BLOB_PREFIX = 'fs.azure.account.key.'
ANALYTICS_INTERNAL = 'analytics-internal'
# Algorithm identifiers. applyAlgorithms compares the keys of its `algorithms`
# mapping against these values to choose the similarity metric.
LEVENSHTEIN = 'levenshtein'
SOUNDEX = 'soundex'
OVERLAP = 'overlap'
OVERLAPLEVENSHTEIN = 'overlapLevenshtein'
JACCARD = 'jaccard'
JACCARDLEVENSHTEIN = 'jaccardLevenshtein'

In [5]:
np.set_printoptions(precision=4)

# NOTE(review): absolute, machine-specific path; adjust it before running
# this notebook elsewhere.
df = pd.read_csv("/home/sachin/#Riversand/Match_Non_Match/BigData.csv")

# Business numbers are identifiers, so compare them as text. Each column is
# converted from ITSELF: the previous code filled column _12 from column _11,
# which made the two sides of the pair identical on every row.
# NOTE(review): astype(str) turns missing values into the literal string
# 'nan' - confirm that is acceptable for the similarity features.
df['system_universalbusinessnumber_11'] = df[
    'system_universalbusinessnumber_11'].astype(str)

df['system_universalbusinessnumber_12'] = df[
    'system_universalbusinessnumber_12'].astype(str)

# Drop the feature_14 .. feature_21 columns shipped in the CSV; the similarity
# features used for training are computed later by applyAlgorithms.
df = df.drop([
    'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18',
    'feature_19', 'feature_20', 'feature_21'
],
             axis=1)

In [7]:
# Maps an algorithm identifier to the attribute-name stems it is applied to.
# applyAlgorithms only has branches for the identifiers declared in the
# taxonomy cell; it has no branch for 'jaro_winkler' or 'exact_match', and
# both lists are empty here, so those two entries produce no feature columns.
algos = {
    'jaro_winkler': [],
    'exact_match': [],
    'overlapLevenshtein': [
        'system_businessname', 'system_address', 'system_alternatename',
        'system_phonenumber', 'system_universalbusinessnumber'
    ],
    'soundex': ['system_businessname', 'system_alternatename']
}

# Column list handed to applyAlgorithms. Each attribute stem appears twice
# (e.g. system_businessname_1 / system_businessname_2); applyAlgorithms picks
# the first two columns whose name contains the stem. The length of this list
# (14) is also the starting number for the generated feature-column suffixes.
cols = [
    'id', 'system_businessname_1', 'system_businessname_2',
    'system_alternatename_3', 'system_alternatename_4', 'system_address_5',
    'system_address_6', 'system_phonenumber_7', 'system_phonenumber_8',
    'system_dateofinception_9', 'system_dateofinception_10',
    'system_universalbusinessnumber_11', 'system_universalbusinessnumber_12',
    'LABEL'
]

# **Data Preprocessing Steps**

In [8]:
def tokenize(text):
    """Split ``text`` into tokens with a fresh ``StandardAnalyzer``.

    Returns the ``text`` attribute of every token the analyzer yields, in
    order, as a list (duplicates are kept).
    """
    tokens = []
    for token in StandardAnalyzer()(text):
        # Read ``.text`` while iterating, as the original comprehension did.
        tokens.append(token.text)
    return tokens


def overlap(A, B):
    """Overlap coefficient between the token sets of two strings.

    Computes ``|tokens(A) & tokens(B)| / min(|tokens(A)|, |tokens(B)|)``.

    Args:
        A, B: the two values to compare.

    Returns:
        A float in [0, 1]. 0.0 is returned when either value is not a
        non-empty string (None, NaN, numbers) or yields no tokens.
    """
    # Missing DataFrame cells arrive as float NaN rather than None, so check
    # the type explicitly instead of hiding the resulting TypeError.
    if not isinstance(A, str) or not isinstance(B, str) or not A or not B:
        return 0.0
    # tokenize() returns a list; convert to sets before intersecting.
    tokens_a = set(tokenize(A))
    tokens_b = set(tokenize(B))
    smaller = min(len(tokens_a), len(tokens_b))
    if smaller == 0:
        return 0.0
    return float(len(tokens_a & tokens_b)) / smaller


def jaccard(A, B):
    """Jaccard similarity between the token sets of two strings.

    Computes ``|tokens(A) & tokens(B)| / |tokens(A) | tokens(B)|``.

    Args:
        A, B: the two values to compare.

    Returns:
        A float in [0, 1]. 0.0 is returned when either value is not a
        non-empty string (None, NaN, numbers) or when neither side yields
        any token.
    """
    if not isinstance(A, str) or not isinstance(B, str) or not A or not B:
        return 0.0
    # tokenize() returns a list; convert to sets before intersecting.
    tokens_a = set(tokenize(A))
    tokens_b = set(tokenize(B))
    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        return 0.0
    return float(len(tokens_a & tokens_b)) / union_size


def overlap_levenshtein(A, B):
    """Overlap coefficient with fuzzy (Levenshtein) credit for near matches.

    NOTE(review): this definition is dead code. A second
    ``def overlap_levenshtein`` later in this same cell rebinds the name, so
    every call uses that later version. This copy is a candidate for removal.

    Returns 0.0 for None/empty inputs or when the smaller token set is empty.
    Any exception is swallowed by the bare ``except`` and the function then
    returns None implicitly.
    """
    try:
        if A is None or B is None or len(A) == 0 or len(B) == 0:
            return 0.0
        setA = set(tokenize(A))
        setB = set(tokenize(B))
        # Start from the number of exactly shared tokens.
        num_intersection = len(setA.intersection(setB))
        max_dist = 0
        for wordA in setA:
            # Best fuzzy score for this word of A, reset per word.
            max_dist = 0
            for wordB in setB:
                # Only pair words that are not exact matches already; note
                # the intersection is recomputed for every (wordA, wordB).
                if wordA not in list(
                        setA.intersection(setB)) and wordB not in list(
                            setA.intersection(setB)):
                    dist = levenshtein_distance_metric(wordA, wordB)
                    # Keep the best similarity, but only if above 0.75.
                    if dist > max_dist and dist > 0.75:
                        max_dist = dist
            num_intersection = num_intersection + max_dist
        # Normalise by the size of the smaller token set.
        min_len = len(setA)
        if (min_len > len(setB)):
            min_len = len(setB)
        if min_len == 0:
            return 0.0
        return float(num_intersection) / min_len
    except:
        pass


def jaccard_levenshtein(A, B):
    """Jaccard similarity with fuzzy (Levenshtein) credit for near matches.

    The "soft" intersection is the number of exactly shared tokens plus, for
    every token of A that has no exact match, its best normalised Levenshtein
    similarity to an unmatched token of B, counted only when that similarity
    exceeds 0.75. The result is
    ``soft / (|tokens(A)| + |tokens(B)| - soft)``.

    This mirrors the fuzzy-credit rule of ``overlap_levenshtein`` so the two
    metrics treat near matches the same way.

    Args:
        A, B: the two values to compare.

    Returns:
        A float; 0.0 when either value is not a non-empty string (None, NaN,
        numbers) or when the denominator would not be positive.
    """
    if not isinstance(A, str) or not isinstance(B, str) or not A or not B:
        return 0.0
    # tokenize() returns a list; set operations need sets.
    tokens_a = set(tokenize(A))
    tokens_b = set(tokenize(B))
    shared = tokens_a & tokens_b
    soft_intersection = float(len(shared))
    unmatched_b = tokens_b - shared
    for word_a in tokens_a - shared:
        best = 0.0
        for word_b in unmatched_b:
            similarity = levenshtein_distance_metric(word_a, word_b)
            # Test the candidate itself against the threshold. The previous
            # code tested the running maximum, which started at 0 and so
            # could never be updated.
            if similarity > 0.75 and similarity > best:
                best = similarity
        # Add the best score once per word of A, after scanning all of B.
        soft_intersection += best
    denominator = len(tokens_a) + len(tokens_b) - soft_intersection
    if denominator <= 0:
        return 0.0
    return soft_intersection / denominator


def levenshtein_distance_metric(A, B):
    """Normalised Levenshtein similarity of two strings.

    Computes ``1 - levenshtein_distance(A, B) / max(len(A), len(B))``, so
    identical strings score 1.0.

    Args:
        A, B: the two values to compare.

    Returns:
        A float; 0.0 when either value is not a non-empty string. Missing
        DataFrame cells (float NaN) and numeric cells previously raised
        inside a bare ``except`` and came back as None; they now score 0.0
        like the other "missing value" cases.
    """
    if not isinstance(A, str) or not isinstance(B, str) or not A or not B:
        return 0.0
    longest = max(len(A), len(B))
    return 1 - float(levenshtein_distance(A, B)) / longest


def jaro_winkler_metric(A, B):
    """Case-insensitive Jaro-Winkler score of two strings.

    Both values are lower-cased before being passed to ``jaro_winkler``.

    Args:
        A, B: the two values to compare.

    Returns:
        The score as a float; 0.0 when either value is not a non-empty
        string. Missing DataFrame cells (float NaN) previously raised a
        TypeError on ``len``; they now score 0.0 like None.
    """
    if not isinstance(A, str) or not isinstance(B, str) or not A or not B:
        return 0.0

    return float(jaro_winkler(A.lower(), B.lower()))


def exact_metric(A, B):
    """Score an exact match between two values.

    Returns 0.25 when both values are present (not None) and compare equal,
    and 0.0 in every other case.
    """
    both_present = A is not None and B is not None
    return 0.25 if both_present and A == B else 0.0


def overlap_levenshtein(A, B):
    """Overlap coefficient with fuzzy (Levenshtein) credit for near matches.

    The "soft" intersection is the number of exactly shared tokens plus, for
    every token of A that has no exact match, its best normalised Levenshtein
    similarity to an unmatched token of B, counted only when that similarity
    exceeds 0.75. The result is ``soft / min(|tokens(A)|, |tokens(B)|)``.

    Because every unmatched token of A may contribute, the result can exceed
    1.0 when A has more tokens than B.

    Args:
        A, B: the two values to compare.

    Returns:
        A float; 0.0 when either value is not a non-empty string (None, NaN,
        numbers) or when either side yields no tokens. Such inputs used to
        raise inside ``except Exception: pass`` and come back as None.
    """
    if not isinstance(A, str) or not isinstance(B, str) or not A or not B:
        return 0.0
    tokens_a = set(tokenize(A))
    tokens_b = set(tokenize(B))
    smaller = min(len(tokens_a), len(tokens_b))
    if smaller == 0:
        return 0.0
    shared = tokens_a & tokens_b
    soft_intersection = float(len(shared))
    # Only tokens without an exact counterpart take part in fuzzy matching.
    unmatched_b = tokens_b - shared
    for word_a in tokens_a - shared:
        best = 0.0
        for word_b in unmatched_b:
            similarity = levenshtein_distance_metric(word_a, word_b)
            if similarity > best and similarity > 0.75:
                best = similarity
        soft_intersection += best
    return soft_intersection / smaller


def soundex_metric(A, B):
    """Overlap coefficient between the Soundex codes of two strings' tokens.

    Each string is tokenized, every token is mapped to its ``soundex`` code,
    and the result is ``|codes(A) & codes(B)| / min(|codes(A)|, |codes(B)|)``.

    Args:
        A, B: the two values to compare.

    Returns:
        A float in [0, 1]; 0.0 when either value is not a non-empty string
        (None, NaN, numbers) or when either side yields no codes. Such inputs
        used to raise inside a bare ``except`` and come back as None.
    """
    # The emptiness test replaces the former `A is ""` identity comparison,
    # which is unreliable and redundant with the length check.
    if not isinstance(A, str) or not isinstance(B, str) or not A or not B:
        return 0.0

    codes_a = {soundex(word) for word in tokenize(A)}
    codes_b = {soundex(word) for word in tokenize(B)}

    smaller = min(len(codes_a), len(codes_b))
    if smaller == 0:
        return 0.0

    return float(len(codes_a & codes_b)) / smaller


def swapped_attribute(fir, sec, pair_of_header, func, header_index):
    """Score two attributes of two records, allowing the attributes to be swapped.

    ``pair_of_header`` names two attributes and ``header_index`` maps each
    name to its position in the records ``fir`` and ``sec``. ``func`` is
    evaluated on all four pairings. The function returns the two scores of
    the aligned pairing (first/first, second/second) when their sum is
    strictly greater than the sum of the crossed pairing (first/second,
    second/first); otherwise it returns the two crossed scores.
    """

    def score(fir_pos, sec_pos):
        return func(fir[header_index[pair_of_header[fir_pos]]],
                    sec[header_index[pair_of_header[sec_pos]]])

    # Evaluation order matches the original: (0,0), (0,1), (1,0), (1,1).
    aligned_first = score(0, 0)
    crossed_first = score(0, 1)
    crossed_second = score(1, 0)
    aligned_second = score(1, 1)

    if aligned_first + aligned_second > crossed_first + crossed_second:
        return aligned_first, aligned_second
    return crossed_first, crossed_second


def applyAlgorithms(dataframe, algorithms, column_list):
    """Add one similarity-feature column per (algorithm, attribute) pair.

    For every algorithm key in ``algorithms`` that has a known metric, and
    for every attribute stem listed under it (processed in sorted order), the
    first two entries of ``column_list`` whose name contains the stem are
    compared row by row. The score is stored in a new column named
    ``"<stem>_<len(column_list) + n>"``, where ``n`` counts the columns
    created so far.

    Args:
        dataframe: the pandas DataFrame to read from and add columns to.
            It is modified in place and also returned.
        algorithms: mapping of algorithm identifier to a list of attribute
            stems. Keys without a known metric are skipped silently. The
            lists are not modified.
        column_list: column names used to locate the two columns of each
            stem; its length is the starting number of the feature suffixes.

    Returns:
        ``(dataframe, counter)`` where ``counter`` is the number of feature
        columns that were added.

    Raises:
        IndexError: if fewer than two names in ``column_list`` contain a
            requested stem.
    """
    metric_by_algorithm = {
        LEVENSHTEIN: levenshtein_distance_metric,
        SOUNDEX: soundex_metric,
        OVERLAP: overlap,
        OVERLAPLEVENSHTEIN: overlap_levenshtein,
        JACCARD: jaccard,
        JACCARDLEVENSHTEIN: jaccard_levenshtein,
    }

    counter = 0
    column_length = len(column_list)
    for key, values_list in algorithms.items():
        metric = metric_by_algorithm.get(key)
        if metric is None:
            # No metric is wired up for this key; skip it as before.
            continue
        # sorted() keeps the column numbering deterministic without
        # reordering the caller's list.
        for value in sorted(values_list):
            matching = [
                column_name for column_name in column_list
                if value in column_name
            ]
            if len(matching) < 2:
                raise IndexError(
                    "expected two columns containing {!r} in column_list, "
                    "found {}".format(value, matching))
            left_col, right_col = matching[0], matching[1]
            col_name = value + "_" + str(column_length + counter)
            # Read from and write to the DataFrame that was passed in, not a
            # notebook-global `df`.
            dataframe[col_name] = dataframe.apply(
                lambda row, fn=metric, left=left_col, right=right_col: fn(
                    row[left], row[right]),
                axis=1)
            counter = counter + 1

    return dataframe, counter

# **Processing**

In [9]:
# applyAlgorithms returns (dataframe, number_of_columns_added).
bb = applyAlgorithms(dataframe=df, algorithms=algos, column_list=cols)

df = bb[0]

# Feature columns produced above: the numeric suffix is len(cols) (14) plus
# the running column counter, in the order the algorithms in `algos` are
# processed (five overlapLevenshtein columns, then two soundex columns).
train_df = df[[
    'system_address_14', 'system_alternatename_15', 'system_businessname_16',
    'system_phonenumber_17', 'system_universalbusinessnumber_18',
    'system_alternatename_19', 'system_businessname_20', 'LABEL'
]]

# Split off the label; train_df keeps only the seven feature columns.
target = train_df.pop('LABEL')

# One dataset element per row: (feature vector, label).
dataset = tf.data.Dataset.from_tensor_slices((train_df.values, target.values))

# **Model Building**

In [13]:
from keras.callbacks import TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

Using TensorFlow backend.


In [14]:
# Shuffle with a buffer as large as the table, then feed ONE row per batch,
# so an epoch runs one training step per row.
# NOTE(review): batch(1) makes training very slow; consider a larger batch.
train_dataset = dataset.shuffle(len(df)).batch(1)
# Make float64 the default float type for Keras layers.
tf.keras.backend.set_floatx('float64')

In [20]:
class NeuralNet():
    """Small fully-connected classifier over the seven similarity features."""

    def __init__(self):
        pass

    def get_trained_model(self):
        """Build, compile and fit the network on the global ``train_dataset``.

        The network has five hidden ``Dense(10, relu)`` layers and a single
        output unit. Training runs for one epoch and logs to TensorBoard.

        Returns:
            ``(history, model)``: the Keras ``History`` object returned by
            ``fit`` and the trained model.
        """
        model = Sequential()
        model.add(Dense(10, activation='relu', input_shape=(7, )))
        for _ in range(4):
            model.add(Dense(10, activation='relu'))
        # A softmax over a single unit always outputs 1.0, and categorical
        # cross-entropy on that output is identically zero, so nothing could
        # be learned. A single-unit classifier needs sigmoid with binary
        # cross-entropy.
        # NOTE(review): assumes LABEL is encoded as 0/1 - confirm.
        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer=RMSprop(learning_rate=0.01),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        # NOTE(review): there is no path separator before the timestamp, so
        # runs land in sibling directories named "logs<timestamp>".
        logdir = "/home/sachin/Desktop/logs" + datetime.datetime.now(
        ).strftime("%Y%m%d-%H%M%S")
        tensorboard_callback = TensorBoard(log_dir=logdir)

        history = model.fit(train_dataset,
                            epochs=1,
                            verbose=1,
                            callbacks=[tensorboard_callback])

        return history, model


net = NeuralNet()
history, model = net.get_trained_model()

Train for 155901 steps


In [None]:
from tensorboard import notebook

In [None]:
%tensorboard --logdir logs

In [65]:
# NOTE(review): absolute, machine-specific output directory.
MODEL_STORAGE = "/home/sachin/Desktop/Model/"
# Save the model three ways: the full model, the weights only, and the
# architecture as JSON.
model.save(MODEL_STORAGE+"model_2.h5")
model.save_weights(MODEL_STORAGE+"model_2_weights.h5")
model_json= model.to_json()
with open(MODEL_STORAGE+"model_2.json", "w") as json_file:
    json_file.write(model_json)

In [0]:
import matplotlib.pyplot as plt

# Depending on the Keras version the accuracy metric is recorded under
# 'accuracy' or 'acc'. The validation series only exists when fit() was
# given validation data.
acc = history.history.get('accuracy', history.history.get('acc'))
val_acc = history.history.get('val_accuracy', history.history.get('val_acc'))
if acc is None:
    raise KeyError(
        "training history has no 'accuracy'/'acc' entry; "
        "compile the model with metrics=['accuracy']")

epochs = range(len(acc))
plt.figure(figsize=(17, 10))
plt.plot(epochs, acc, 'r', label='Training accuracy')
if val_acc is not None:
    plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc=0)
plt.show()

# **TensorFlow Dataset API for GPU and TPU processing**

<br> 
**Load Data frame and Make Tensor**

In [0]:
# Build a dataset whose elements are dicts keyed by DataFrame column name.
data_slices = tf.data.Dataset.from_tensor_slices(dict(df))
# Print the first element, one "column: value" line per column.
for feature_batch in data_slices.take(1):
    for key, value in feature_batch.items():
        print("  {!r:20s}: {}".format(key, value))

**Load CSV data and create a tensor**

In [0]:
def get_dataset(file_path, **kwargs):
    """Create a batched CSV dataset from ``file_path``.

    Calls ``tf.data.experimental.make_csv_dataset`` with a batch size of 5
    (artificially small to make examples easier to show), ``"LABEL"`` as the
    label column, ``"?"`` as the missing-value marker, a single epoch, and
    parse errors ignored. Extra keyword arguments are forwarded unchanged.
    """
    fixed_options = dict(
        batch_size=5,
        label_name="LABEL",
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
    )
    return tf.data.experimental.make_csv_dataset(file_path, **fixed_options,
                                                 **kwargs)


# NOTE(review): this path differs from the CSV path read into `df` earlier in
# the notebook; confirm which location is current.
raw_train_data = get_dataset("/content/BigData.csv")

In [0]:
def show_batch(dataset):
    """Print every feature of the first batch of ``dataset``.

    Each line shows the feature name, left-aligned in 20 characters, followed
    by the batch values converted with ``.numpy()``. Labels are not printed.
    """
    first_batches = dataset.take(1)
    for features, _labels in first_batches:
        for column, tensor in features.items():
            line = "{:20s}: {}".format(column, tensor.numpy())
            print(line)


# Preview the first batch of five rows read from the CSV.
show_batch(raw_train_data)

<br>

In [0]:
# Recompute the similarity features. Column names depend only on len(cols)
# and the per-call counter, so this writes the same column names as the
# earlier call rather than adding new ones.
processed_df, counter = applyAlgorithms(dataframe=df,algorithms=algos,column_list=cols)

In [66]:
class PlotLearning(tf.keras.callbacks.Callback):
    """Keras callback that redraws loss and accuracy curves after each epoch.

    The base class is taken from ``tf.keras`` because the notebook never
    binds the name ``keras`` and the model is built with ``tensorflow.keras``.

    Attributes set during training:
        x: epoch indices recorded so far.
        losses / val_losses: training and validation loss per epoch.
        acc / val_acc: training and validation accuracy per epoch.
        logs: the raw ``logs`` dict of every epoch.
        fig: the most recently drawn figure (None before the first epoch).
    """

    def on_train_begin(self, logs=None):
        """Reset all recorded history at the start of a training run."""
        self.i = 0
        self.x = []
        self.losses = []
        self.val_losses = []
        self.acc = []
        self.val_acc = []
        self.fig = None

        self.logs = []

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's metrics and redraw both panels."""
        # Imported here because no import cell in the notebook provides it.
        from IPython.display import clear_output

        # Avoid a shared mutable default argument.
        logs = logs or {}

        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        # Depending on the Keras version the accuracy metric is reported as
        # 'accuracy' or 'acc' (likewise 'val_accuracy' / 'val_acc').
        self.acc.append(logs.get('accuracy', logs.get('acc')))
        self.val_acc.append(logs.get('val_accuracy', logs.get('val_acc')))
        self.i += 1

        clear_output(wait=True)
        # Close the previous figure so one is not leaked per epoch.
        if self.fig is not None:
            plt.close(self.fig)
        self.fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)

        ax1.set_yscale('log')
        self._plot_series(ax1, self.x, self.losses, "loss")
        self._plot_series(ax1, self.x, self.val_losses, "val_loss")
        ax1.legend()

        self._plot_series(ax2, self.x, self.acc, "accuracy")
        self._plot_series(ax2, self.x, self.val_acc, "validation accuracy")
        ax2.legend()

        plt.show();

    @staticmethod
    def _plot_series(ax, x, values, label):
        """Plot ``values`` on ``ax`` unless the metric was never reported."""
        if any(value is not None for value in values):
            ax.plot(x, values, label=label)


plot = PlotLearning()