# Seedtag codetest: NLP Researcher

## Part 3. Message-matcher baseline model
This communication contains a message matcher baseline model. Given a query text message and a corpus of historical messages, this matcher model retrieves all historical messages that are similar to the queried one. Your goal is to improve this model.

In [1]:
import os
from hashlib import md5
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#New imports
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


Necessary Installations:

! pip install nltk sentence-transformers

nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('wordnet') 

### 0. Auxiliary Functions

In [2]:
def create_df(path, tag):
    '''
    Creates a data frame for a given class
    --------------------------------------
    Input:
        path (str): path where all classes folders are stored.
        tag (str): name of the folder containing class "tag".
    Output:
        df (pd.DataFrame): dataframe with file as index and columns=[text, tag]
    '''
    list_of_text = []
    tag_dir = os.path.join(path, tag)
    for file in os.listdir(tag_dir):

        with open(os.path.join(tag_dir, file), encoding="utf-8", errors="ignore") as f:
            text = f.read()
            list_of_text.append((text, file))
            df = pd.DataFrame(list_of_text, columns = ['Text', 'file'])
            df = df.set_index('file')
    df['tag'] = tag
    return df


def get_all_dfs(path, tags):
    '''
    Loops over all classes in path, each in the corresponding folder
    --------------------------------
    Input:
        path (str): path where all classes folders are stored.
        tags (list): list of classes names.
    Output:
        df (pd.DataFrame): pandas dataframe with the dataframes corresponding to all classes concatenated.
    '''
    list_of_dfs = []
    for tag in tags:

        df = create_df(path, tag)
        list_of_dfs.append(df)
    data = pd.concat(list_of_dfs)
    return data


def to_md5(rsc_id: str) -> str:
    """
    Convert rcs_id string into a hexdigest md5.
    :param rcs_id: str.
    :return: hexdigext representation of md5 codification of input string.
    """
    md5_rsc = bytes(rsc_id, 'utf-8')
    result_1 = md5(md5_rsc)
    return result_1.hexdigest()


def get_similarity(resources: pd.DataFrame, space: str = 'tfidf', max_df: float = .75) -> np.array:
    """
    Compute pairwise cosine similarity for resources in a given vector representation (tf or tfidf).
    :param resources: pd.DataFrame with the resources as rows and at least 'Text' as column.
    :param space: vector space representation of resources, either 'tf' or 'tfidf'.
    :param max_df: maximum valur for document frequency just as in sklearn Vectorizers.
    :return: symmetric np.array with cosine similarity score for each resource pair.
    """
    if space == 'tf':
        vec = CountVectorizer(min_df=2, max_df=max_df)
    elif space == 'tfidf':
        vec = TfidfVectorizer(min_df=2, max_df=max_df)
    else:
        print('The "space" input must be either "tf" or "tfidf", using the default "tfidf" option...')
        vec = TfidfVectorizer(min_df=2, max_df=max_df)
    vec_res = vec.fit_transform(resources['Text'].fillna(''))
    sims = cosine_similarity(vec_res, vec_res)
    return sims


def find_similar_rsc(similarity_scores: np.array, threshold: float) -> pd.DataFrame:
    """
    Get a dictionary relating resources to a list of [resource, score] pairs per resource.
    :param similarity_scores: matrix of similarity score per pair of resources of shape
    (number of resoures, number of resources).
    :param threshold: the similarity score threshold for retrieving as similar resource.
    :return: a pd.DataFrame with 'resource_idx', 'similar_res_idx' and 'similarity_score' as columns relating resources
    to a given resource.
    """
    similar_rsc_idx = np.where((similarity_scores >= threshold) & (similarity_scores < 0.999))
    similar_scores = np.round(similarity_scores[similar_rsc_idx], 3)
    sim_res = pd.DataFrame({'resource_idx': similar_rsc_idx[0],
                            'similar_res_idx': similar_rsc_idx[1],
                            'similarity_score': similar_scores})
    return sim_res


def get_similar_rsc(resources: pd.DataFrame, threshold: float = 0.75, space: str = 'tfidf') -> dict:
    """
    Get similar resources per resource.
    :param resources: pd.DataFrame with the resources as rows and at least 'Text' as column.
    :param threshold: the similarity score threshold for retrieving as similar resource.
    :param space: vector space representation of resources, either 'tf' or 'tfidf'.
    :return: a dictionary with resources as keys and similar resources as values.
    """
    sims = get_similarity(resources, space)
    find_sims = find_similar_rsc(sims, threshold)
    sim_df = find_sims.copy()
    sim_df.reset_index(inplace=True)
    sim_df['resource_id'] = resources['resource_id'].iloc[find_sims.resource_idx].values
    sim_df['similar_res'] = resources['resource_id'].iloc[find_sims.similar_res_idx].values
    sim_df['sim_resources'] = sim_df.apply(lambda x: [[x.similar_res, x.similarity_score]], axis=1)
    grouped_sim_res = sim_df[['resource_id', 'sim_resources']].groupby('resource_id').agg(lambda x: np.sum(x))
    similar_res_dict = grouped_sim_res.T.to_dict('records')[0]
    sim_res = {k: sorted(v, key=lambda x: x[1], reverse=True) for k, v in similar_res_dict.items()}
    return sim_res


def get_similar(input_text: str, corpus: pd.DataFrame, threshold: float=0.75, space: str = 'tfidf') -> list:
    """
    Retrieves a set of messages from a given corpus that are similar enough to an input message.
    :param input_text: query text.
    :param corpus: pd.DataFrame with historical messages as column 'Text'.
    :param threshold: the similarity score threshold for retrieving as similar resource.
    :param space: vector space representation of resources, either 'tf' or 'tfidf'.
    :return: a list with all the similar messages content and corresponding score to the queried one.
    """
    input_id = to_md5(input_text)
    input_df = pd.DataFrame({'Text': [input_text], 'resource_id': [input_id]})
    data = pd.concat([input_df, corpus])
    sim_dict = get_similar_rsc(data, threshold, space)
    result = list()
    if sim_dict.get(input_id):
        for sim_id, sim_score in sim_dict.get(input_id):
            result.append([corpus['Text'][corpus['resource_id'] == sim_id].values[0], sim_score])
    else:
        result = [None, 0]
    return result

### Extra functions

In [3]:
def clean_text(text):
    """
    Clean and preprocess text by normalizing, removing noise, and lemmatizing.
    Metadata, email addresses, URLs, punctuation, and non-ASCII characters are eliminated.

    Parameters:
    ----------
    text : str
        Input text string to be cleaned.

    Returns:
    --------
    cleaned_text_basic : str
        Text cleaned with basic preprocessing (no lemmatization or stopword removal).
    cleaned_text_advanced : str
        Text cleaned with additional lemmatization and stopword removal.
    """
    text = text.lower()  
    text = re.sub(r'article-i\.d\.: [^\s]+', '', text)  # article ID references
    text = re.sub(r'\S+@\S+', '', text)   # remove email adresses
    text = re.sub(r'http[s]?://\S+', '', text)   # remove urls
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove punctuation characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # non-ASCII characters (e.g., emojis, special characters, foreign language symbols)
    text = re.sub(r'\s+', ' ', text).strip()  # extra whitespaces
    
    cleaned_text_basic = text
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    cleaned_text_advanced = ' '.join(tokens)
    
    return cleaned_text_basic, cleaned_text_advanced

In [4]:
def precompute_embeddings(dataframe, column_name, model_name):
    """
    Precompute embeddings for a DataFrame column using a pre-trained embedding model.
    
    Parameters:
    ----------
    dataframe : pd.DataFrame
        The input DataFrame containing the text data.
    column_name : str
        The column name in the DataFrame containing text to embed.
    model_name : str
        The pre-trained SentenceTransformer model to use for generating embeddings.
        
    Returns:
    --------
    list
        A list of embeddings, where each embedding corresponds to the text in the specified column.
    
    Explanation:
    ------------
    """
    model = SentenceTransformer(model_name)
    
    embeddings = model.encode(dataframe[column_name])
    embeddings=list(embeddings)
    
    return embeddings


In [5]:
def find_similarity_between_datasets(corpus_df, test_df, resource_id_col, embeddings_col, percentile=90):   
    """
    Compare similarity between test data embeddings and corpus embeddings, and retrieve top results.

    Parameters:
    ----------
    corpus_df : pd.DataFrame
        DataFrame containing the corpus data.
    test_df : pd.DataFrame
        DataFrame containing the test data.
    resource_id_col : str, optional
        Name of the column in both DataFrames that contains the resource IDs 
    embeddings_col : str, optional
        Name of the column in both DataFrames that contains the embeddings 
    percentile : int, optional
        Percentile to use for dynamic thresholding (default: 90).

    Returns:
    --------
    results : dict
        A dictionary where each test resource ID maps to:
        - The 5 most similar resource IDs from the corpus.
        - Mean similarity score.
        - Dynamic threshold (percentile value).
    """
    results = {}
                                       
    corpus_embeddings = np.array(corpus_df[embeddings_col].tolist())  # Convert embeddings to a list and also the IDs
    test_embeddings = np.array(test_df[embeddings_col].tolist())
    corpus_ids = corpus_df[resource_id_col].tolist()
    test_ids = test_df[resource_id_col].tolist()

    similarities = cosine_similarity(test_embeddings, corpus_embeddings)  # Rows=test samples. Columns+ corpus samples.Cosine similarity[i,j]

    for i, test_id in enumerate(test_ids):
        similarity_scores = similarities[i]   
        mean_similarity = float(np.mean(similarity_scores)) 
        dynamic_threshold = float(np.percentile(similarity_scores, percentile)) 

        top_indices = np.argsort(similarity_scores)[::-1][:5]  
        top_similarities = similarity_scores[top_indices]
        top_ids = [corpus_ids[idx] for idx in top_indices]

        top_similarities = [round(float(score), 2) for score in top_similarities]

        results[test_id] = {'Top 5 Similar IDs': list(zip(top_ids, top_similarities)), 'Mean Similarity': round(mean_similarity, 2), 'Dynamic Threshold': round(dynamic_threshold, 2),}

    return results

### 1. Preparing data

From a given set of messages, a historical corpus and a query message are defined. Thus, the query message is fed into the message matcher so that all messages from the corpus similar to the query one are retrieved.

In [6]:
path = '../part1/dataset'
tags = os.listdir(path)
data_full = get_all_dfs(path, tags)[['Text']]
data_full['resource_id'] = data_full['Text'].apply(to_md5)

In [7]:
corpus = data_full.sample(int(data_full.shape[0] * 0.9))
test_data = data_full[~data_full.resource_id.isin(corpus.resource_id)]
print(corpus.shape)
corpus.tail()

(3467, 2)


Unnamed: 0_level_0,Text,resource_id
file,Unnamed: 1_level_1,Unnamed: 2_level_1
38757,From: r0506048@cml3 (Chun-Hung Lin)\nSubject: ...,945a8d455f889c6c65a54cb3ee10db71
176903,Article-I.D.: optilink.15236\n\nIn article <19...,070744ece06d7ca7a8f060d9003464d2
61099,"\n\nActually, the ""ether"" stuff sounded a fair...",b3367244de18f801d0bc479342104586
178320,"\n\n\n/* Written 8:33 pm Apr 14, 1993 by nln...",b2010ed0063e95567f3c03e98c84c944
103101,\nFrom article <1993Apr16.162950.25849@newsgat...,e2c2557c00ecf04566b7ba0a11b7a1fa


#### 1.1) Deeper analysis in the data

Some cleaning functions are introduced to eliminate noisy text.

In [8]:
corpus[['cleaned_text_basic', 'cleaned_text_lematization']] = corpus['Text'].apply(clean_text).apply(pd.Series)
test_data[['cleaned_text_basic', 'cleaned_text_lematization']]= test_data['Text'].apply(clean_text).apply(pd.Series)

In [9]:
# Check duplicates
corpus.resource_id.value_counts()

resource_id
9010a13c19c01163fb3da7c79e427b72    2
6b9241116df9987081fdf8fff14bc39a    2
44601ff91f80d458271573d0e76b6c21    2
314b7d4a1d82003aca2a1731ae04d80f    2
dc4f79ef1c685be926d1085495891f73    2
                                   ..
cc327617fa4826e4adb6f358cb82e79e    1
f0cd64e867985061f949dda71995c5cb    1
a4fe5a6eb67e8fe0ec1a879d76858281    1
a1d5fb3d27bdf0faa7f3275f836487cf    1
541b1a64fc80b70164f2cb7aec76fc07    1
Name: count, Length: 3453, dtype: int64

In [10]:
test_data.resource_id.value_counts()

resource_id
7bc871f4497f1516bab1eea593d76921    1
b16033e1c9ad7aeed5ef47241a004318    1
879e28912138a41aa9c4ee035af7e2f1    1
c6d389b83f7d0d2f7a12be62e3d1e974    1
9159a201997683cf3992ad8b14676c47    1
                                   ..
033acb7f6b2679deb871ce59d1df99b4    1
fbeef297172aab5165e928f706f7c587    1
23425951fe0728d71087b7ddb415a6b7    1
456a4ceb1987cb6cfa94fba21867816b    1
82e67f273aaa1097175e851cc356d870    1
Name: count, Length: 382, dtype: int64

There are duplicates in our corpus and test data, and for this use case we just want to compare with unique messages to find the most similar ones

In [11]:
corpus = corpus.drop_duplicates(subset='resource_id', keep='first')
test_data = test_data.drop_duplicates(subset='resource_id', keep='first')

### 2. Getting similar messages

In [12]:
query_text = test_data.iloc[42]['Text']
print(query_text)


In <1993Apr15.204210.26022@mksol.dseg.ti.com> pyron@skndiv.dseg.ti.com (Dillon Pyron) writes:


>There are actually only two of us.  I do Henry, Fred, Tommy and Mary.  Oh yeah,
>this isn't my real name, I'm a bald headed space baby.

Yes, and I do everyone else.  Why, you may wonder, don't I do 'Fred'?
Well, that would just be too *obvious*, wouldn't it?  Oh yeah, this
isn't my real name, either.  I'm actually Elvis.  Or maybe a lemur; I
sometimes have difficulty telling which is which.

-- 
"Insisting on perfect safety is for people who don't have the balls to live
 in the real world."   -- Mary Shafer, NASA Ames Dryden
------------------------------------------------------------------------------
Fred.McCall@dseg.ti.com - I don't speak for others and they don't speak for me.



In [13]:
similar_results = get_similar(query_text, corpus, 0.2)
if similar_results[0]:
    print("Similar Messages:")
    for result in similar_results:
        print("-"*75)
        print(result[0])
        print(f"Similarity score: {result[1]}")
        print("-"*75)

Similar Messages:
---------------------------------------------------------------------------
Article-I.D.: mojo.1qkmkiINNep3

In article <1993Apr15.204210.26022@mksol.dseg.ti.com>, pyron@skndiv.dseg.ti.com (Dillon Pyron) writes:
>
>There are actually only two of us.  I do Henry, Fred, Tommy and Mary.  Oh yeah,
>this isn't my real name, I'm a bald headed space baby.

Damn!  So it was YOU who was drinking beer with ROBERT McELWANE in the PARKING
LOT of the K-MART!

				UNLIMITED INSEMINATION OF THIS MESSAGE
					RIGIDLY REFUSED



    Software engineering? That's like military intelligence, isn't it?
  -- >                  SYSMGR@CADLAB.ENG.UMD.EDU                        < --

Similarity score: 0.61
---------------------------------------------------------------------------
---------------------------------------------------------------------------


There are actually only two of us.  I do Henry, Fred, Tommy and Mary.  Oh yeah,
this isn't my real name, I'm a bald headed space baby.
--

### 3. Using Embeddings for vectorization

-Vectorizing the text using embeddings provides a semantic understanding of the texts. 

-Using TF-IDF we will have high similarity scoring just based on exact matches in the words. With embeddings similar words will be identified and will also retrieve high similarity scores.

-With TF-IDF longer texts will dominate the similarity calculation as they have more frequent terms, using embeddings the effectiveness will be the same for long and short texts.

In [14]:
corpus.tail()

Unnamed: 0_level_0,Text,resource_id,cleaned_text_basic,cleaned_text_lematization
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38757,From: r0506048@cml3 (Chun-Hung Lin)\nSubject: ...,945a8d455f889c6c65a54cb3ee10db71,from chunhung lin subject re jpeg file format ...,chunhung lin subject jpeg file format nntppost...
176903,Article-I.D.: optilink.15236\n\nIn article <19...,070744ece06d7ca7a8f060d9003464d2,in article craig depken writes in article stev...,article craig depken writes article steve hend...
61099,"\n\nActually, the ""ether"" stuff sounded a fair...",b3367244de18f801d0bc479342104586,actually the ether stuff sounded a fair bit li...,actually ether stuff sounded fair bit like biz...
178320,"\n\n\n/* Written 8:33 pm Apr 14, 1993 by nln...",b2010ed0063e95567f3c03e98c84c944,written 833 pm apr 14 1993 by in igcnlnsnews n...,written 833 pm apr 14 1993 igcnlnsnews nlns pa...
103101,\nFrom article <1993Apr16.162950.25849@newsgat...,e2c2557c00ecf04566b7ba0a11b7a1fa,from article by mark monninger this kind of be...,article mark monninger kind behavior shocked e...


Calculate embeddings for both datasets. all-MiniLM-L6-v2 is a lightweight language model that delivers fast and high-quality results for semantic similarity.

When using embeddings it is better not lemmatize and eliminate stopwords because these words can influence the meaning of a sentence, and embedding models are desgined to capture it. Using TF-IDF it would be more helpful.

In [15]:
model_name='all-MiniLM-L6-v2'
corpus['embeddings']=precompute_embeddings(corpus, 'cleaned_text_basic', model_name)
test_data['embeddings']=precompute_embeddings(test_data, 'cleaned_text_basic', model_name)

In [16]:
corpus.columns

Index(['Text', 'resource_id', 'cleaned_text_basic',
       'cleaned_text_lematization', 'embeddings'],
      dtype='object')

Using embeddings, we calculate the similarity between each entry in the test data and all documents in the corpus.

Our function then retrieves the resource IDs of the 5 most similar documents from the corpus, along with the mean similarity score and a dynamic threshold based on the 90th percentile. Depending on the stakeholders' requirements, the output can be customized to prioritize specific metrics.

In [17]:
results = find_similarity_between_datasets(corpus_df=corpus, test_df=test_data, resource_id_col='resource_id', embeddings_col='embeddings', percentile=90)
for test_id, details in results.items():
    print(f"Test ID: {test_id}")
    print(f"  Top 5 Similar IDs: {details['Top 5 Similar IDs']}")
    print(f"  Mean Similarity: {details['Mean Similarity']:.2f}")
    print(f"  Dynamic Threshold: {details['Dynamic Threshold']:.2f}\n")

Test ID: b16033e1c9ad7aeed5ef47241a004318
  Top 5 Similar IDs: [('6806875264a333474300ccf601bc7bfc', 0.53), ('3db7becc2440aa5bf59403d4e11a77c8', 0.44), ('1059e6bb76b01555d9ef668f5364e927', 0.39), ('4a87d03435544062f763211a7dd49e7f', 0.39), ('38f46797cbb60652a29186a34e621741', 0.39)]
  Mean Similarity: 0.12
  Dynamic Threshold: 0.22

Test ID: 879e28912138a41aa9c4ee035af7e2f1
  Top 5 Similar IDs: [('2ec743cbe8791a468303798c82d5590c', 0.52), ('f41f14dddfb27397246b569b61ce0c29', 0.48), ('3104ccca2874fd22f75a5590901e8915', 0.45), ('88b337967bc975462481060388f12a49', 0.42), ('87b419dabe83965b86a256353c3bf5d7', 0.41)]
  Mean Similarity: 0.12
  Dynamic Threshold: 0.24

Test ID: c6d389b83f7d0d2f7a12be62e3d1e974
  Top 5 Similar IDs: [('4a87d03435544062f763211a7dd49e7f', 0.83), ('62df5d58bacd62a52cbcbe9b4d4b346f', 0.54), ('62e0585b165ee1f618ab41bad7fa3e11', 0.51), ('feee66000a41eb54a66465862dbd9b27', 0.46), ('00fb9b97af3c156573c2214b6d2d6461', 0.46)]
  Mean Similarity: 0.11
  Dynamic Threshold: 0

In [21]:
corpus.loc[corpus['resource_id'] == '6806875264a333474300ccf601bc7bfc', 'cleaned_text_basic'].values[0]

'can somebody elaborate on area ruling i gather its something to do with aerodynamics of transsonic planes and can be summarised as coke bottle good coke can bad anyone provide more details derivation etc gregory bond burdett buckeridge young ltd melbourne australia knoxs 386 is slick fox in sox on knoxs box knoxs box is very quick plays lots of lsl hes sick apologies to john iron bar mackin'

In [20]:
test_data.loc[test_data['resource_id'] == 'b16033e1c9ad7aeed5ef47241a004318', 'cleaned_text_basic'].values[0]

'in article henry spencer writes in article pat writes i thought the area rule was pioneered by boeing nasa guys developed the rule but noone knew if it worked until boeing built the hardware 727 and maybe the fb111 nope the decisive triumph of the area rule was when convairs yf102 contractually commmitted to being a mach 15 fighter and actually found to be incapable of going supersonic in level flight was turned into the arearuled yf102a which met the specs this was well before either the 727 or the fb111 the 102 flew in late 1953 and convair spent most of the first half of 1954 figuring out what went wrong and most of the second half building the first 102a all work is one mans work henry spencer u of toronto zoology kipling utzoohenry good thing i stuck in a couple of question marks up there i seem to recall somebody built or at least proposed a wasp waisetd passenger civil transport i thought it was a 727 but maybe it was a dc 89 sure it had a funny passenger compartment but on the