<a href="https://colab.research.google.com/github/pyaguega/LDA-document-ranking/blob/main/BERTGamma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install transformers



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [5]:
def read_files_from_directory(directory_path):
    files_content = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                files_content.append(content)
    return files_content

In [6]:
def calculate_embeddings(texts, model, tokenizer):
    embeddings = []
    for text in texts:
        input_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)
        input_ids = torch.tensor([input_ids])

        with torch.no_grad():
            outputs = model(input_ids)

        last_hidden_states = outputs.last_hidden_state
        sentence_embedding = torch.mean(last_hidden_states, dim=1).squeeze()
        embeddings.append(sentence_embedding.numpy())

    return embeddings

In [7]:
def save_scores_to_csv(scores, filename='similarity_scores.csv'):
    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Query', 'Document Index', 'Similarity Score', 'Above Thershold'])
        for doc_type, index, score in scores:
          score = float(score)
          above_threshold = 'Yes' if score >= threshold_value else 'No'
          csvwriter.writerow([doc_type, index, score, above_threshold])

In [8]:
def save_scores_to_csv(scores, threshold_value, base_filename='similarity_scores'):
    filename = f"{base_filename}_threshold_{threshold_value:.4f}.csv"
    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Document Type', 'Document Index', 'Similarity Score', 'Above Thershold'])
        for score_info in scores:
            csvwriter.writerow(score_info)

In [9]:
def main():

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")


    chromatographic_data_dir = "/content/drive/My Drive/Chrom"
    non_chromatographic_data_dir = "/content/drive/My Drive/NChrom"


    chromatographic_docs = read_files_from_directory(chromatographic_data_dir)
    non_chromatographic_docs = read_files_from_directory(non_chromatographic_data_dir)


    all_docs = chromatographic_docs + non_chromatographic_docs
    doc_embeddings = calculate_embeddings(all_docs, model, tokenizer)


    query = "Upanshu"
    query_embedding = calculate_embeddings([query], model, tokenizer)[0]
    similarity_scores = cosine_similarity([query_embedding], doc_embeddings)[0]

    mean_score = np.mean(similarity_scores)
    std_dev_score = np.std(similarity_scores)

    threshold_value = mean_score + std_dev_score

    threshold_value = float(threshold_value)

    filtered_scores = []
    for i, score in enumerate(similarity_scores):
        doc_type = 'Chromatographic' if i < len(chromatographic_docs) else 'Non-Chromatographic'
        score = float(score)
        filtered_scores.append((doc_type, i+1, score))

    #chromatographic_scores = similarity_scores[:len(chromatographic_docs)]
    #non_chromatographic_scores = similarity_scores[len(chromatographic_docs):]

    #sorted_chromatographic_scores = sorted(
     #   [(score, i) for i, score in enumerate(chromatographic_scores, start=1)],
      #  key=lambda x: x[0],
      #  reverse=True
    #)

    #sorted_non_chromatographic_scores = sorted(
     #   [(score, i) for i, score in enumerate(non_chromatographic_scores, start=1)],
      #  key=lambda x: x[0],
      #  reverse=True
    #)

    #print("Similarity scores for Chromatographic Docs:")
    #for score, i in sorted_chromatographic_scores:
        #print(f"Document {i} Similarity: {score}")

    #print("\nSimilarity scores for Non-Chromatographic Docs:")
    #or score, i in sorted_non_chromatographic_scores:
        #print(f"Document {i} Similarity: {score}")

    #with open('cosine_similarity_scores.csv', 'w', newline='') as file:
        #writer = csv.writer(file)
        #writer.writerow(['Score', 'Type'])

        #for score in sorted_chromatographic_scores:
            #writer.writerow([score, 'Chromatographic'])

        #for score in sorted_non_chromatographic_scores:
            #writer.writerow([score, 'Non-Chromatographic'])

    #data = pd.read_csv('cosine_similarity_scores.csv')

    #plt.figure(figsize=(10, 6))
    #sns.histplot(data, x='Score', hue='Type', kde=True, element="step")

    #plt.title('Distribution of Cosine Similarity Scores')
    #plt.xlabel('Cosine Similarity Score')
    #plt.ylabel('Frequency')
    #plt.legend(title='Data Type')
    #plt.grid(True)

    #plt.show()
    print(f"Threshold value for '{query}': {threshold_value}")
    save_scores_to_csv(filtered_scores, threshold_value)
    #save_scores_to_csv(chromatographic_scores, non_chromatographic_scores)

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

FileNotFoundError: ignored

In [None]:
#def save_scores_to_csv(chromatographic_scores, non_chromatographic_scores):
#    with open('similarity_scores.csv', 'w', newline='') as csvfile:
#        csvwriter = csv.writer(csvfile)

#       csvwriter.writerow(['Document Type', 'Document Index', 'Similarity Score'])

#        for i, score in enumerate(chromatographic_scores, start=1):
#           csvwriter.writerow(['Chromatographic', i, score])

#        for i, score in enumerate(non_chromatographic_scores, start=1):
#           csvwriter.writerow(['Non-Chromatographic', i, score])

In [None]:
def plot_similarity_scores_from_csv(csv_filename):
    df = pd.read_csv(csv_filename)

    #chromatographic_data = df[df['Document Type'] == 'Chromatographic']
    #non_chromatographic_data = df[df['Document Type'] == 'Non-Chromatographic']

    plt.figure(figsize=(10, 6))
    #plt.plot(chromatographic_data['Document Index'], chromatographic_data['Similarity Score'], label='Chromatographic', marker='o')
    #plt.plot(non_chromatographic_data['Document Index'], non_chromatographic_data['Similarity Score'], label='Non-Chromatographic', marker='x')

    for query in df['Query'].unique():
        query_data = df[df['Query'] == query]
        threshold = query_data['Similarity Score'][query_data['Above Threshold']].min()
        plt.scatter(query_data['Document Index'], query_data['Similarity Score'], label=f'{query} Scores', marker='o')
        plt.axhline(y=threshold, color='red', linestyle='--', label=f'{query} Threshold')

    plt.xlabel('Query ~ Merck Document')
    plt.ylabel('Similarity Score')
    plt.title('Chromatographic vs. Non-Chromatographic Documents')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

plot_similarity_scores_from_csv('similarity_scores.csv')