In [1]:
import pandas as pd
import os
from pathlib import Path
import spacy
from sentence_transformers import SentenceTransformer
from spacy.lang.en.stop_words import STOP_WORDS

spacy.cli.download("en_core_web_sm")

# Smaller BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.add_pipe("sentencizer")

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting spacy<3.8.0,>=3.7.2 (from en-core-web-sm==3.7.1)
  Using cached spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Using cached spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Installing collected packages: spacy, en-core-web-sm
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.1
    Uninstalling spacy-3.7.1:
      Successfully uninstalled spacy-3.7.1
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.7.0
    Uninstalling en-

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
nudger 0.0.8 requires pandas<2.0.0,>=1.5.3, but you have pandas 2.1.3 which is incompatible.[0m[31m
[0m

Successfully installed en-core-web-sm-3.7.1 spacy-3.7.2



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')




<spacy.pipeline.sentencizer.Sentencizer at 0x7eff3b73c780>

In [2]:
DATA_DIR = Path.cwd() / "data"

DF_DIR = Path.cwd() / "saved_dfs"
DF_DIR.mkdir(exist_ok=True)

df_path = DF_DIR / "dataframe.csv"

# Conditional that checks whether we saved the dfs as csv files in prior run.
# If yes, then reinitialise these csvs as dfs.
# If not, then create the dfs and save them in csv format for next run.
if df_path.exists():
    print("Loading dataset from CSV...")
    df = pd.read_csv(df_path)

else:
    data = []

    for filename in os.listdir(DATA_DIR):
        if filename.endswith(".txt"):
            print(filename)

            file_path = DATA_DIR / filename
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()

            # Create a mapping of character positions to line numbers
            line_starts = {0: 1}
            for i, char in enumerate(content):
                if char == '\n':
                    line_starts[i + 1] = line_starts[i] + 1
                else:
                    line_starts[i + 1] = line_starts[i]

            # Process the entire content with spaCy
            doc = nlp(content)
            for sent in doc.sents:
                start_char = sent.start_char
                line_number = line_starts[start_char]
                sentence = sent.text.strip()
                data.append({
                    "filename": filename,
                    "sentence": sentence,
                    "line_number": line_number
                })


    # Convert the list of sentence/filename dictionaries into a dataframe 
    df = pd.DataFrame(data)
    df.to_csv(df_path, index=False)

print(df.head())

Loading dataset from CSV...
                    filename  \
0  Gum_And_Tooth_Disease.txt   
1  Gum_And_Tooth_Disease.txt   
2  Gum_And_Tooth_Disease.txt   
3  Gum_And_Tooth_Disease.txt   
4  Gum_And_Tooth_Disease.txt   

                                            sentence  line_number  
0  I am thinking of going back on your [Primal] d...            1  
1  However could you let me know what are the\nef...            2  
2  The reason I ask is\nbecause I have gum diseas...            3  
3  The strapline to his video was "a raw meat die...            6  
4  He cited\nthat all of the people he had met on...            6  


In [3]:
import joblib

EMBEDDING_DIR = Path.cwd() / "embeddings"
EMBEDDING_DIR.mkdir(exist_ok=True)

embeddings_path = EMBEDDING_DIR / f'sentence_embeddings.joblib'

# Conditional to check whether our embeddings joblib already exists from prior runs.
if not embeddings_path.exists():
    print("Generating embeddings for the dataset...")
    embeddings = model.encode(df['sentence'].tolist(), show_progress_bar=True)
    joblib.dump(embeddings, embeddings_path)
else:
    print("Loading embeddings from file...")
    embeddings = joblib.load(embeddings_path)

Loading embeddings from file...


In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(query, embeddings, df):
    # Encode the query string using the model to get its embedding.
    query_embedding = model.encode([query]) 

    # Calculate the cosine similarity between the query embedding and all embeddings in the dataset.
    # cosine_similarity returns a matrix where each row is the similarity of the query to each document.
    # We take the first row [0] because there's only one query, resulting in a one-dimensional array of similarities.
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    top_indices = np.argsort(similarities)[-20:]

    # Use DataFrame.iloc to select the rows at the given indices (top_indices).
    # This gives us the rows from the dataframe that correspond to the top 20 similarities.
    top_docs = df.iloc[top_indices]

    # Select the corresponding top similarity scores using the indices.
    # This gives us the actual similarity scores of the top 20 matches.
    top_scores = similarities[top_indices]

    # Return the top matching documents and their similarity scores.
    return top_docs[['sentence', 'filename']], top_scores

In [5]:
test_set_columns=["Query", "Result", "Cosine", "Filename"]

def create_test_set(query, embeddings, df, test_set):
    top_docs, top_scores = search(query, embeddings, df)
    
    if top_docs.empty:
        print("No documents found for this query.")
        return test_set

    new_rows = []
    # Use zip to positionally combine and iterate over the df and scores in parallel. 
    # iterrows() is used to return a tuple of (index, Series) from the df.
    for (index, row), score in zip(top_docs.iterrows(), top_scores):
        new_row = {
            "Query": query,
            "Result": row['sentence'],
            "Cosine": score,
            "Filename": row['filename'],
        }
        new_rows.append(new_row)
    
    new_rows_df = pd.DataFrame(new_rows)
    test_set = pd.concat([test_set, new_rows_df], ignore_index=True)
    
    return test_set

In [6]:
def generate_test_set_from_queries(query_file_path, embeddings, df):
    test_set = pd.DataFrame(columns=test_set_columns)
    with open(query_file_path, 'r') as file:
        queries = file.read().splitlines()
    
    for query in queries:
        test_set = create_test_set(query, embeddings, df, test_set)
    
    return test_set

TEST_SET_DIR = Path.cwd() / "test_sets"
QUERIES_DIR = Path.cwd() / "queries"
TEST_SET_DIR.mkdir(exist_ok=True)
QUERIES_DIR.mkdir(exist_ok=True)
test_set_path = TEST_SET_DIR / f"test_set.csv"
query_file_path = QUERIES_DIR / "queries.txt"

test_set = generate_test_set_from_queries(query_file_path, embeddings, df)

test_set.to_csv(test_set_path, index=False)

  test_set = pd.concat([test_set, new_rows_df], ignore_index=True)


In [8]:
test_set, TEST_SET_DIR



(                 Query                                             Result  \
 0    Is salt unhealthy  “The craving for salt is symptomatic of a mine...   
 1    Is salt unhealthy       When people drink salt water, they get sick.   
 2    Is salt unhealthy  So lots of salt, which is cheap and flavorful ...   
 3    Is salt unhealthy  I may not have to write this but, all the abov...   
 4    Is salt unhealthy  But still you're destroying a lot of cells eve...   
 ..                 ...                                                ...   
 175  Why eat high meat  I've got people who only do high meat, only do...   
 176  Why eat high meat  Eating high raw meat supplies the body with na...   
 177  Why eat high meat  can be consistently alleviated by eating HIGH ...   
 178  Why eat high meat  The bacteria-infested meat, called “high meat”...   
 179  Why eat high meat  To facilitate the removal of\ndegenerative tis...   
 
        Cosine                                    Filename  
 

In [7]:
def normalize_sentence(sentence):
    # Need to remove any extra spaces/linebreaks
    # The original df sentences sometimes keep their raw formatting (weird line breaks)
    # This leads to a failure to match the embedding search results to my manually curated results
    sentence = sentence.replace("\n", " ")
    return sentence.strip()

def compute_evaluations(test_set, relevant_results):
    evaluation_data = []

    # My manually selected results (from keyword search)
    # are grouped by a set of possible analogous queries.
    # This loop iterates over each row that contains the same query group.
    # After each row in the matching column has been iterated over, it iterates over the rows for the next query group.
    for query_group in relevant_results['Query'].unique():
        grouped_queries = query_group.split(',')
        # Create a list of the normalised results from the relevant result dataframe (filtered on the query group)
        relevant_set = [normalize_sentence(sentence) for sentence in relevant_results[relevant_results['Query'] == query_group]['Result']]
        
        total_hits = 0
        matching_sentences = []

        # Iterate through the queries in the query group, e.g. ('What is arthritis', 'What is arthritis caused by')
        for query in grouped_queries:
            query = query.strip()
            # Find all the results from the test_set that pertain to the individual query (results from embedding search)
            query_results = test_set[test_set['Query'] == query]['Result'].apply(normalize_sentence)

            # Count the sentences that appear in both my manually collated relevant_results and the embedding search results
            for sentence in query_results:
                if sentence in relevant_set:
                    total_hits += 1
                    matching_sentences.append(sentence)

        evaluation_data.append({
            'Query Group': query_group,
            'Total Hits': total_hits,
            'Matching Sentences': ', '.join(matching_sentences)
        })
    
    eval_df = pd.DataFrame(evaluation_data)
    
    return eval_df

EVAL_DIR = Path.cwd() / "evaluations"
EVAL_DIR.mkdir(exist_ok=True)

relevant_results_path = QUERIES_DIR / "relevant_query_results.csv"
relevant_results = pd.read_csv(relevant_results_path)

test_set_path = TEST_SET_DIR / f"test_set.csv"
test_set = pd.read_csv(test_set_path)

eval_df = compute_evaluations(test_set, relevant_results)
eval_path = EVAL_DIR / f"evaluation.csv"
eval_df.to_csv(eval_path, index=False)

print(eval_df.head())

                                         Query Group  Total Hits  \
0              Is salt unhealthy, Salt damages cells           0   
1                     What are signs of intelligence           1   
2  What are signs of intelligence, What are signs...           0   
3                        How to gain weight quickly            0   
4     What is arthritis, What is arthritis caused by           2   

                                  Matching Sentences  
0                                                     
1  All hyperactive children have potential genius...  
2                                                     
3                                                     
4  The cooked saturated fat are a problem, but th...  
