# Imports

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
pd.set_option('display.max_columns', None)
import timeit
pd.set_option('display.max_rows',None)
import re
import datetime
import time
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from functools import wraps
import math
import random

# Execution time

In [3]:
#takes a single argument for the function to be decorated
def timeit(func):
    """
    Decorator that logs how long each call to the wrapped function takes.

    Every call is timed with `time.perf_counter()` and one line of the form
    ``<function name> took <seconds> seconds to complete`` is appended to
    ``output.txt`` in the current working directory. The wrapped function's
    return value is passed through unchanged. If the wrapped function raises,
    the exception propagates and nothing is logged for that call.

    Args:
        func (callable): The function to be wrapped and timed.

    Returns:
        callable: The wrapped function.

    Example:
        @timeit
        def my_function():
            # Code to be timed
            ...

        my_function()  # appends the execution time of my_function to output.txt
    """
    # functools.wraps preserves the original function's metadata (__name__, __doc__, ...)
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        # The context manager closes the log file even if the write itself fails.
        with open("output.txt", "a") as log_file:
            print(f'{func.__name__} took {elapsed:.6f} seconds to complete', file=log_file)
        return result
    return wrapper

In [4]:

'''Overall, converting the data to tensors and performing preprocessing operations
 can help reduce memory occupancy and enable more efficient memory management when working with large datasets.'''

'Overall, converting the data to tensors and performing preprocessing operations\n can help reduce memory occupancy and enable more efficient memory management when working with large datasets.'

# Data Load

In [6]:
def load_data(csv_path="Synthetic_Data_Address_clustering_sample (1).csv"):
    """
    Load the address CSV and expose it as a TensorFlow Dataset.

    The CSV is read into a Pandas DataFrame, the DataFrame is converted into a
    dictionary (column name -> column values) and sliced with
    `tf.data.Dataset.from_tensor_slices`, so each dataset element is a
    dictionary holding the values of one row.

    Args:
        csv_path (str): Path of the CSV file to load. Defaults to the sample
            file used throughout this notebook.

    Returns:
        tf.data.Dataset: Row-wise dataset built from the CSV contents.

    Example:
        dataset = load_data()
        for row in dataset.take(1):
            print(row)
    """
    df = pd.read_csv(csv_path)

    # tensorflow datasets are good for large scale data, shuffling, batching and pre-fetching
    return tf.data.Dataset.from_tensor_slices(dict(df))


In [7]:
# Build the row-wise TensorFlow dataset from the sample CSV (see load_data)
df_tf=load_data()

# Applying Standardization (lowercase, extra spaces, null values, duplicates)

In [9]:
def ascii_sum(dataset):
    """
    Add an 'ascii_sum' entry with the summed code points of 'Delivery_Desc'.

    Parameters:
        dataset: Mapping that provides `.copy()` and a 'Delivery_Desc' entry
            (presumably a single dataset row, i.e. a dict of tensors --
            TODO confirm against the caller).

    Return:
        A copy of the input with an extra 'ascii_sum' entry; the input itself
        is not modified.
    """
    row = dataset.copy()

    # Decode the UTF-8 string into code points and add them all up
    row['ascii_sum'] = tf.reduce_sum(
        tf.strings.unicode_decode(row['Delivery_Desc'], 'UTF-8')
    )

    return row

In [10]:
@timeit
def remove_duplicates(s):
    """
    Drop repeated entries from a string tensor, keeping first occurrences.

    The entries of `s` are joined with ', ', split again on ', ',
    de-duplicated with `tf.unique` and re-joined with ', '.

    Args:
        s (tf.Tensor): String tensor whose entries should be de-duplicated.

    Returns:
        tf.Tensor: The unique entries joined into one string with ', '.
    """
    words = tf.strings.reduce_join(s, separator=', ')
    words = tf.strings.split(words, sep=', ')
    # tf.unique returns (unique_values, index_of_each_input_in_unique_values).
    # The first output already is the de-duplicated list. Gathering `words`
    # with the second output indexes the wrong tensor and yields a result of
    # the original length with wrong entries, so it must not be used here.
    unique_words, _ = tf.unique(words)
    return tf.strings.reduce_join(unique_words, separator=', ')


In [11]:
#execution time for this function will be computed


#to enable graph mode(eager tensor is by default there)-----reduce_retracing, experimental_relax_shapes----experimental purposes
# @tf.function(reduce_retracing=True,experimental_relax_shapes=False)

#this function takes inputs as each element(row) in a tensor and returns the processed result
@timeit
def standardization(element):
    """
      Standardize the 'Delivery_Desc' column of the input element.

      The text is lower-cased, every character outside a-z/0-9 is replaced by
      a space, the standalone word 'null' is removed, runs of whitespace are
      collapsed, the string is stripped, and repeated words are dropped
      (first occurrence kept). The remaining words are re-joined with single
      spaces, so no punctuation is reintroduced.

      Args:
          element (dict): The input element containing the 'Delivery_Desc' column.

      Returns:
          dict: A copy of the element with the 'Delivery_Desc' column processed.

      Example:
          import tensorflow as tf

          # Define an input element
          element = {'Delivery_Desc': 'This is a Sample Description.'}

          # Standardize the input element
          standardized_element = standardization(element)
    """
    # NOTE(review): when this function is passed to Dataset.map it is likely
    # traced once, so @timeit would record tracing time rather than per-row
    # time -- confirm before relying on the logged numbers.
    element_copy = element.copy()

    #converting to lower case for each element
    desc = tf.strings.lower(element_copy['Delivery_Desc'])

    #removing all special characters(not from a-z or 0-9)
    desc = tf.strings.regex_replace(desc, r"[^a-zA-Z0-9]", " ")

    #removing the word 'null' (no 'none' word was found in the dataset)---\b represents boundary
    desc = tf.strings.regex_replace(desc, r"\bnull\b", " ")

    #collapsing runs of white space into a single space
    desc = tf.strings.regex_replace(desc, r"\s+", " ")

    #stripping white spaces before and after the string
    desc = tf.strings.strip(desc)

    #removing duplicates: split into words, keep the first occurrence of each.
    #De-duplicating here and joining with ' ' avoids a ', ' round trip, which
    #would put commas back into the cleaned text.
    words = tf.strings.split(desc, sep=' ')
    unique_words, _ = tf.unique(words)
    element_copy['Delivery_Desc'] = tf.strings.reduce_join(unique_words, axis=-1, separator=' ')

    return element_copy

In [18]:
#keep only rows whose whole 'Delivery_Desc' string is at least 20 long
@timeit
def eliminate_short_words(element):
    """
    Predicate telling whether an element's 'Delivery_Desc' is long enough to keep.

    The length of the whole 'Delivery_Desc' string (as reported by
    `tf.strings.length`) is compared against 20; individual words are not
    inspected.

    Args:
        element (dict): The input element containing the 'Delivery_Desc' column.

    Returns:
        tf.Tensor: Boolean tensor, True where the description length is
        greater than or equal to 20.

    Example:
        import tensorflow as tf

        element = {'Delivery_Desc': 'short text'}
        keep = eliminate_short_words(element)
        print(keep)
        # Output: a False boolean tensor (the string is shorter than 20)
    """
    description_length = tf.strings.length(element['Delivery_Desc'])
    return description_length >= 20

In [12]:
# Load the bag-of-words table; its 'word<N>' columns are sliced below to build the regexes
bow=pd.read_csv("Bag_of_words.csv")



# Standardization with bag of words


In [13]:
# Build one regex per group of similar words.
# Each pattern has the shape \b(?:variant1|variant2|...)\b : a non-capturing
# alternation wrapped in word boundaries.
starting = r"\b(?:"
middle = r"|"
ending = r")\b"


def _word_group_pattern(column, stop):
    """Return the alternation regex built from rows 1..stop-1 of a bag-of-words column."""
    variants = np.array(bow[column][1:stop])
    return starting + middle.join(variants) + ending


#one regex string per word group; the slice end differs for every column of the bag-of-words table
house_number = _word_group_pattern('word2', 40)
ward_number = _word_group_pattern('word4', 25)
flat_no = _word_group_pattern('word6', 8)
plot_no = _word_group_pattern('word8', 7)
room_no = _word_group_pattern('word10', 19)
door_no = _word_group_pattern('word12', 12)
quarter_no = _word_group_pattern('word14', 18)
apartment_no = _word_group_pattern('word16', 20)
duplex_no = _word_group_pattern('word18', 14)
coop_list = _word_group_pattern('word20', 18)
society_list = _word_group_pattern('word22', 24)
road = _word_group_pattern('word24', 31)
street_no = _word_group_pattern('word26', 21)
nagar = _word_group_pattern('word28', 15)
behind = _word_group_pattern('word30', 26)
before = _word_group_pattern('word32', 17)
near = _word_group_pattern('word34', 36)
next_to = _word_group_pattern('word36', 26)
opposite = _word_group_pattern('word38', 31)
front = _word_group_pattern('word40', 21)
after = _word_group_pattern('word42', 18)


In [14]:
#Calculating the time for this function

#this function replaces all the similar words with bag of words
@timeit
def replace_strings(tensor_dataset):
    """
    Replace every known spelling variant in 'Delivery_Desc' with its canonical word.

    The module-level regex strings (house_number, ward_number, ...) are applied
    one after another, in the order listed below, each match being replaced by
    the canonical keyword surrounded by spaces.

    Args:
        tensor_dataset (dict): Element containing the 'Delivery_Desc' column.

    Returns:
        dict: A copy of the element with 'Delivery_Desc' rewritten.
    """
    row = tensor_dataset.copy()

    # (pattern, replacement) pairs; the order matters because every
    # substitution works on the output of the previous one.
    substitutions = (
        (house_number, ' house '),
        (ward_number, ' ward '),
        (flat_no, ' flat '),
        (plot_no, ' plot '),
        (room_no, ' room '),
        (door_no, ' door '),
        (quarter_no, ' quarter '),
        (apartment_no, ' apartment '),
        (duplex_no, ' duplex '),
        (coop_list, ' coop '),
        (society_list, ' society '),
        (road, ' road '),
        (street_no, ' street '),
        (nagar, ' nagar '),
        (behind, ' behind '),
        (before, ' before '),
        (near, ' near '),
        (next_to, ' next '),
        (opposite, ' opposite '),
        (front, ' front '),
        (after, ' after '),
    )

    description = row['Delivery_Desc']
    for pattern, canonical in substitutions:
        description = tf.strings.regex_replace(description, pattern, canonical)
    row['Delivery_Desc'] = description

    return row

In [15]:
# function to remove words like - house, flat, room, etc
@timeit
def remove_stop_words(element):
    """
    Delete the canonical address keywords (house, flat, room, ...) from 'Delivery_Desc'.

    Args:
        element (dict): Element containing the 'Delivery_Desc' column.

    Returns:
        dict: A copy of the element in which every whole-word match of the
        keyword list has been replaced by the empty string.
    """
    stop_word_pattern = r"\b(?:house|ward|flat|plot|room|door|quarter|apartment|duplex|coop|society|road|street|nagar|behind|before|near|next|opposite|front|after)\b"
    row = element.copy()
    cleaned = tf.strings.regex_replace(row['Delivery_Desc'], stop_word_pattern, "")
    row['Delivery_Desc'] = cleaned
    return row

# Main Function to call all Functions

In [19]:
def main():
    """
    Run the full preprocessing pipeline and return the resulting dataset.

    Steps, in order: load the CSV as a dataset, standardize 'Delivery_Desc',
    drop rows whose description consists of digits only, drop rows rejected
    by eliminate_short_words, replace spelling variants with canonical
    keywords, and finally remove those keywords.

    Returns:
        The transformed dataset produced by chaining map/filter calls on the
        result of load_data().
    """
    def _is_not_digits_only(x):
        # True unless the whole description matches ^[0-9]+$
        return tf.math.logical_not(tf.strings.regex_full_match(x['Delivery_Desc'], r'^[0-9]+$'))

    return (
        load_data()                       # creating tensorflow dataset from csv
        .map(standardization)
        .filter(_is_not_digits_only)      # removing rows with only numbers
        .filter(eliminate_short_words)    # keeping only the retained rows
        .map(replace_strings)             # bag of words standardization
        .map(remove_stop_words)
    )

# Calling Main Function

In [20]:
# Run the preprocessing pipeline; df_tf now refers to the processed dataset
df_tf = main()

In [None]:
# Print every element of the processed dataset (iterating runs the whole pipeline)
for i in df_tf:
    print(i)

## Converting it back to a pandas dataframe due to drawbacks(please navigate to bge.ipynb for further results)

In [41]:
#converting tf dataset to pandas dataframe
import pandas as pd
# as_numpy_iterator() yields the dataset elements converted to NumPy values;
# the resulting frame is written to output.csv without the index column
df = pd.DataFrame(df_tf.as_numpy_iterator())
df.to_csv('output.csv', index=False)

# Hypothetically if the sorting was possible in tensorflow dataset:

In [28]:
def cosine_similarity(x, y):
    """
    Return the cosine similarity between two equally long vectors.

    Args:
        x: First vector (NumPy array, list or tuple of numbers).
        y: Second vector (NumPy array, list or tuple of numbers).

    Returns:
        The cosine of the angle between `x` and `y`, or None when the two
        inputs differ in length. If either vector has zero magnitude the
        division yields NaN.
    """
    # Ensure length of x and y are the same
    if len(x) != len(y):
        return None

    # Plain sequences do not support `** 2`; convert so lists and tuples work
    # just like arrays (arrays are passed through without copying).
    x = np.asarray(x)
    y = np.asarray(y)

    # Compute the dot product between x and y
    dot_product = np.dot(x, y)

    # Compute the L2 norms (magnitudes) of x and y
    magnitude_x = np.sqrt(np.sum(x ** 2))
    magnitude_y = np.sqrt(np.sum(y ** 2))

    return dot_product / (magnitude_x * magnitude_y)

In [29]:
def euclidean_distance(vector1, vector2):
    """
    Return the Euclidean (L2) distance between two equally long vectors.

    Args:
        vector1: First sequence of numbers.
        vector2: Second sequence of numbers.

    Returns:
        float: Square root of the summed squared element-wise differences.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    if len(vector1) == len(vector2):
        squared_diffs = [(a - b) ** 2 for a, b in zip(vector1, vector2)]
        return math.sqrt(sum(squared_diffs))

    raise ValueError("Vectors must have the same dimension")

### Vectorization(tf-idf)

#### Extracting all the words from vocab.txt

In [30]:
import re
import nltk
from nltk.corpus import stopwords

# Download the set of stopwords if not already present
nltk.download('stopwords')

def filter_unique_english_words(input_file, output_file):
    """
    Extract the unique, lower-cased alphanumeric tokens of a text file.

    Every run of ASCII letters/digits in `input_file` is lower-cased; tokens
    that are NLTK English stop words are dropped. The remaining unique tokens
    are written to `output_file`, one per line, in sorted order so that the
    produced file is identical from run to run.

    Args:
        input_file (str): Path of the UTF-8 text file to read.
        output_file (str): Path of the UTF-8 text file to write.
    """
    # Regular expression pattern to match English words and numbers
    pattern = re.compile(r'[a-zA-Z0-9]+')

    # Get the set of English stop words
    stop_words = set(stopwords.words('english'))

    unique_words = set()
    with open(input_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            unique_words.update(word.lower() for word in pattern.findall(line))
    unique_words -= stop_words

    # Set iteration order is not stable across interpreter runs; sorting keeps
    # the vocabulary file (and anything indexed by line number) reproducible.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(word + '\n' for word in sorted(unique_words))

# Build refactored.txt (unique non-stop-word tokens) from vocab.txt when run as a script/notebook cell
if __name__ == "__main__":
    input_file = "vocab.txt"
    output_file = "refactored.txt"
    
    filter_unique_english_words(input_file, output_file)
    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\freed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Function for tf-idf

In [37]:
def Vectorization_tfidf(df_tf, max_features=1000, batch_size=64):
    """
    Vectorize the 'Delivery_Desc' column with a TF-IDF TextVectorization layer.

    A `tf.keras.layers.TextVectorization` layer with output_mode='tf_idf' is
    adapted on the 'Delivery_Desc' values of the dataset, wrapped in a
    Sequential model, and the model's predictions for every description are
    returned.

    Parameters:
        df_tf (tensorflow dataset): Dataset whose elements contain a
            'Delivery_Desc' entry.
        max_features (int): Passed as `max_tokens`, i.e. the upper bound on
            the vocabulary size learned by the layer. Defaults to 1000.
        batch_size (int): Batch size used while adapting the layer.
            Defaults to 64.

    Returns:
        The value returned by `model.predict` (presumably a 2D numpy array
        with one TF-IDF row per description -- TODO confirm).
    """
    # Create the layer.
    vectorize_layer = tf.keras.layers.TextVectorization(
        max_tokens=max_features,
        output_mode='tf_idf',
    )
    #extracting the delivery desc column
    delivery_desc_dataset = df_tf.map(lambda x: x['Delivery_Desc'])
    #reshaping the tensors so that it fits in the input layer of the sequential model
    reshaped_dataset = delivery_desc_dataset.map(lambda x: tf.expand_dims(x, 0))
    #learn the vocabulary and idf statistics from the descriptions, in batches
    vectorize_layer.adapt(delivery_desc_dataset.batch(batch_size))
    #creating a model 'Sequential' which sequentially executes steps in layers
    model = tf.keras.models.Sequential()
    #creating an input layer to input each text
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    #creating a layer that assigns numbers to each token based on tf-idf method
    model.add(vectorize_layer)
    #this is used to make the vectors
    return model.predict(reshaped_dataset)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
def calculate_idf_weights(texts, max_features):
    """
    Fit a TfidfVectorizer on `texts` and return its padded IDF weights.

    Args:
        texts: Iterable of strings used to fit the vectorizer.
        max_features (int): Passed to TfidfVectorizer as `max_features`.

    Returns:
        numpy.ndarray: The fitted `idf_` vector extended by 14 entries on each
        side, the new entries being filled with the mean of the vector.
    """
    # Create and fit TfidfVectorizer to calculate IDF weights
    fitted = TfidfVectorizer(max_features=max_features, use_idf=True).fit(texts)

    # NOTE(review): the fixed padding of 14 per side looks tuned to make the
    # length match one particular vocabulary; confirm it still lines up with
    # the vocabulary the weights are paired with.
    return np.pad(fitted.idf_, (14,), 'mean')

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
def Vectorization_custom(df_tf, custom_bag_words, max_features=30438):
    """
    Vectorize 'Delivery_Desc' with TF-IDF using a caller-supplied vocabulary.

    The TextVectorization layer is not adapted on the data: its vocabulary is
    `custom_bag_words` and its IDF weights come from
    `calculate_idf_weights(custom_bag_words, max_features)`. The layer is
    wrapped in a Sequential model and the model's predictions for every
    description are returned.

    Parameters:
        df_tf (tensorflow dataset): Dataset whose elements contain a
            'Delivery_Desc' entry.
        custom_bag_words (list of str): Vocabulary handed to the layer.
        max_features (int): Passed as `max_tokens` to the layer and as
            `max_features` to the IDF computation. Defaults to 30438.

    Returns:
        The value returned by `model.predict` (presumably a 2D numpy array
        with one TF-IDF row per description -- TODO confirm).
    """
    # IDF weights for the custom vocabulary (the vectorizer is fitted inside
    # calculate_idf_weights, so no separate fit is needed here).
    idf_weights = calculate_idf_weights(custom_bag_words, max_features)
    # Create the layer.
    vectorize_layer = tf.keras.layers.TextVectorization(
        max_tokens=max_features,
        output_mode='tf_idf',
        vocabulary=custom_bag_words,
        idf_weights=idf_weights,
    )
    #extracting the delivery desc column
    delivery_desc_dataset = df_tf.map(lambda x: x['Delivery_Desc'])
    #reshaping the tensors so that it fits in the input layer of the sequential model
    reshaped_dataset = delivery_desc_dataset.map(lambda x: tf.expand_dims(x, 0))
    #creating a model 'Sequential' which sequentially executes steps in layers
    model = tf.keras.models.Sequential()
    #creating an input layer to input each text
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    #creating a layer that assigns numbers to each token based on tf-idf method
    model.add(vectorize_layer)
    #this is used to make the vectors
    return model.predict(reshaped_dataset)

In [34]:
def read_file(file_path):
    """Return the lines of a UTF-8 text file as a list, without line endings."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.splitlines()

# Custom vocabulary: one token per line, read from the refactored.txt file that the earlier cell writes
texts=read_file('refactored.txt')

In [38]:
# Vectorize the text data using TF-IDF representation.
# The module-level name `vectorized_layer` is read by clustering_vectors below.
vectorized_layer=Vectorization_tfidf(df_tf)



In [None]:
#Vectorize using the custom bag of words loaded into `texts`
vectorized_layer_custom=Vectorization_custom(df_tf,texts)

### Clustering with vectors

In [39]:
#creating an empty pandas dataframe with 4 columns (cluster_id, customer_id, address, pin code);
#it is passed to clustering_vectors as its `df` argument
df = pd.DataFrame(columns=['cluster_id', 'CUSTOMER_ID', 'Delivery_Desc', 'PIN_CODE'])

In [40]:
# clustering_vectors groups consecutive rows whose vectors are close to each other
# and returns the clusters that contain at least two rows
def clustering_vectors(df_tf, threshold, df, vectors=None):
    """
    Group consecutive dataset rows into clusters by vector distance.

    Rows are visited in dataset order. A row joins the current cluster when
    the Euclidean distance between its vector and the previous row's vector
    is at most `threshold`; otherwise it opens a new cluster.

    Args:
        df_tf: Dataset whose elements provide 'CUSTOMER_ID', 'Delivery_Desc'
            and 'PIN_CODE' tensors.
        threshold (float): Largest distance at which two consecutive rows are
            still put into the same cluster.
        df (pandas.DataFrame): Row 0 of this frame is set to the first record
            (cluster id, customer id, address, pin code).
        vectors: Indexable collection with one vector per dataset row, in
            dataset order. Defaults to the module-level `vectorized_layer`.

    Returns:
        list: One entry per cluster, in creation order. Each cluster is a list
        of records ``[cluster_id, customer_id, address, pin_code]`` with the
        last three fields converted with `str()`. Clusters with fewer than
        two records are replaced by an empty list, so positions still
        correspond to cluster ids. An empty dataset yields an empty list.
    """
    if vectors is None:
        # Fall back to the TF-IDF matrix computed at module level
        vectors = vectorized_layer

    cluster_id = []
    c_id = 0
    # A single pass over the dataset: avoids re-running the pipeline for every
    # row and does not need len(), which is unavailable on filtered datasets.
    for i, row in enumerate(df_tf):
        fields = [
            str(row['CUSTOMER_ID'].numpy()),
            str(row['Delivery_Desc'].numpy()),
            str(row['PIN_CODE'].numpy()),
        ]
        if i == 0:
            c_id = 1
            #appending the first record to pandas dataframe df
            df.loc[0] = [c_id] + fields
            cluster_id.append([[c_id] + fields])
        # A small distance means the rows are similar, so the row stays in
        # the current cluster.
        # NOTE(review): 0.95 at the call site looks like a cosine-similarity
        # threshold -- confirm it is the intended distance cut-off.
        elif euclidean_distance(vectors[i], vectors[i - 1]) <= threshold:
            cluster_id[-1].append([c_id] + fields)
        #if the row is not similar to the previous row it starts a new cluster
        else:
            c_id += 1
            cluster_id.append([[c_id] + fields])

    #blanking all clusters that contain fewer than 2 records
    return [cluster if len(cluster) >= 2 else [] for cluster in cluster_id]

    

In [None]:
# Cluster the processed rows with a threshold of 0.95; `df` receives the first record
cluster=clustering_vectors(df_tf,0.95,df)

In [None]:
# Bare expression: shows the clusters as the notebook cell output
cluster