In [None]:
# Mount Google Drive inside the Colab runtime. The data paths configured later
# in this notebook all live under /content/drive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time

In [None]:
# if you do not have 'nltk', the following command should work "python -m pip install nltk"
# Fetch the NLTK stop-word corpus; build_terms reads it through
# stopwords.words("english").
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Folder (on the mounted Drive) that holds every input file of the notebook.
# Point this single value at another location to run on a different copy of
# the data; the three paths below are derived from it.
DATA_DIR = '/content/drive/MyDrive/RIAW/data'

# One JSON object per line, one tweet per object.
tweets_path = f'{DATA_DIR}/farmers-protest-tweets.json'
# CSV read for its 'id' and 'docId' columns (tweet id -> document id).
tweet_doc_ids_map_path = f'{DATA_DIR}/tweet_document_ids_map.csv'
# Evaluation file; not read by the cells shown in this part of the notebook.
evaluation_path = f'{DATA_DIR}/evaluation.csv'

In [None]:
import json
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


# Load the JSON dataset line by line
tweets_data = []
with open(tweets_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Blank lines (for instance a trailing newline at the end of the file)
        # are not JSON documents: json.loads would raise on them, so skip them.
        if not line.strip():
            continue
        # Each remaining line is a separate JSON object
        tweets_data.append(json.loads(line))


# Load the tweet-to-document ID map
tweet_doc_ids_map = pd.read_csv(tweet_doc_ids_map_path)

# Convert the mapping into a dictionary for fast lookups (tweet_id -> document_id)
tweet_to_doc_map = dict(zip(tweet_doc_ids_map['id'], tweet_doc_ids_map['docId']))

# Function to preprocess tweet content
# Objects shared by every build_terms call. They are filled in lazily on the
# first call, so defining the function does not need the NLTK data to exist yet.
_BUILD_TERMS_RESOURCES = {}


def _build_terms_resources():
    """Create once, then reuse, the stemmer, stop-word set and punctuation tables."""
    if not _BUILD_TERMS_RESOURCES:
        # '#' is kept out of the punctuation set so hashtags stay recognisable.
        punctuation = string.punctuation.replace('#', '')
        _BUILD_TERMS_RESOURCES['stemmer'] = PorterStemmer()
        _BUILD_TERMS_RESOURCES['stop_words'] = frozenset(stopwords.words("english"))
        _BUILD_TERMS_RESOURCES['punctuation'] = punctuation
        _BUILD_TERMS_RESOURCES['strip_table'] = str.maketrans('', '', punctuation)
    return _BUILD_TERMS_RESOURCES


def build_terms(line):
    """
    Preprocess the tweet content by removing stop words, punctuation, and applying stemming.

    The text is lower-cased and split on whitespace. Every token is then
    stripped of all punctuation except '#', dropped if it is an English stop
    word, and finally stemmed with the Porter stemmer.

    Argument:
    line -- string (text to preprocess)

    Returns:
    A list of tokens corresponding to the preprocessed text, in their original order.
    """
    resources = _build_terms_resources()
    stemmer = resources['stemmer']
    stop_words = resources['stop_words']
    punctuation = resources['punctuation']
    strip_table = resources['strip_table']

    terms = []
    for token in line.lower().split():
        # Stop words such as "don't" contain an apostrophe. Once punctuation is
        # removed they become "dont", which is no longer in the stop-word set,
        # so they are tested here first (ignoring only surrounding punctuation
        # like a trailing comma).
        if token.strip(punctuation) in stop_words:
            continue

        # Remove punctuation (except for #)
        token = token.translate(strip_table)

        # Tokens made only of punctuation are now empty; plain stop words are
        # caught by the second membership test.
        if not token or token in stop_words:
            continue

        terms.append(stemmer.stem(token))

    return terms

# Function to extract hashtags from the tweet
def extract_hashtags(text):
    """
    Collect every hashtag that appears in a tweet.

    Argument:
    text -- string (tweet content)

    Returns:
    A list with each occurrence of '#' followed by one or more word
    characters, in order of appearance. The leading '#' and the original
    casing are kept; repeated hashtags are repeated in the list.
    """
    hashtag_pattern = re.compile(r'#\w+')
    return [match.group(0) for match in hashtag_pattern.finditer(text)]

# Function to process a tweet and return the required structure
def process_tweet(tweet):
    """
    Turn one raw tweet into the (document ID, summary) pair stored by the notebook.

    Argument:
    tweet -- dictionary containing tweet data; the keys 'id', 'content',
             'date', 'likeCount', 'retweetCount' and 'url' are read

    Returns:
    A tuple (document_id, processed_tweet) where
    document_id -- value mapped to the tweet ID in tweet_to_doc_map, or None
                   when the tweet ID has no mapping
    processed_tweet -- dictionary with the keys 'Tweet' (preprocessed terms
                       joined by single spaces), 'Date', 'Hashtags', 'Likes',
                       'Retweets' and 'Url'
    """
    # Resolve the document ID first; None signals "no mapping for this tweet".
    mapped_doc_id = tweet_to_doc_map.get(tweet['id'])

    raw_text = tweet['content']

    # Hashtags are taken from the raw text, so they keep their original casing,
    # whereas 'Tweet' holds the lower-cased, stemmed terms.
    terms = build_terms(raw_text)
    found_hashtags = extract_hashtags(raw_text)

    summary = {
        'Tweet': ' '.join(terms),
        'Date': tweet['date'],
        'Hashtags': found_hashtags,
        'Likes': tweet['likeCount'],
        'Retweets': tweet['retweetCount'],
        'Url': tweet['url']
    }

    return mapped_doc_id, summary

# Dictionary to store the results: document ID -> processed tweet dictionary
doc_tweet_map = {}

# Process all tweets and store them in a dictionary of dictionaries
for tweet in tweets_data:
    document_id, processed_tweet = process_tweet(tweet)

    if document_id:  # Skip tweets without a mapping (None) or with an empty/falsy ID
        # Store the processed tweet under the document ID. If two tweets map
        # to the same document ID, the later one replaces the earlier one.
        doc_tweet_map[document_id] = processed_tweet

# Example: Print the dictionary for the first five document IDs (insertion order)
for doc_id, tweet_info in list(doc_tweet_map.items())[:5]:
    print(f"Doc ID: {doc_id}\nTweet Info: {tweet_info}\n")


Doc ID: doc_0
Tweet Info: {'Tweet': 'world progress indian polic govt still tri take india back horrif past tyranni narendramodi delhipolic shame #modidontsellfarm #farmersprotest #freenodeepkaur httpstcoes3kn0iqaf', 'Date': '2021-02-24T09:23:35+00:00', 'Hashtags': ['#ModiDontSellFarmers', '#FarmersProtest', '#FreeNodeepKaur'], 'Likes': 0, 'Retweets': 0, 'Url': 'https://twitter.com/ArjunSinghPanam/status/1364506249291784198'}

Doc ID: doc_1
Tweet Info: {'Tweet': '#farmersprotest #modiignoringfarmersdeath #modidontsellfarm kisanektamorcha farmer constantli distroy crop throughout india realli heart breakingw care crop like children govt agricultur minist laugh us🚜🌾w win💪 httpstcoklspngg9x', 'Date': '2021-02-24T09:23:32+00:00', 'Hashtags': ['#FarmersProtest', '#ModiIgnoringFarmersDeaths', '#ModiDontSellFarmers'], 'Likes': 0, 'Retweets': 0, 'Url': 'https://twitter.com/PrdeepNain/status/1364506237451313155'}

Doc ID: doc_2
Tweet Info: {'Tweet': 'reallyswara rohinisgh watch full video https

In [None]:
def create_inverted_index(tweets_data, tweet_to_doc_map):
    """
    Build a positional inverted index over the tweets that have a document ID.

    Arguments:
    tweets_data -- list of tweet dictionaries; the 'id' and 'content' fields are read
    tweet_to_doc_map -- dictionary mapping tweet IDs to document IDs

    Returns:
    inverted_index -- defaultdict(list) mapping each term produced by
                      build_terms to its posting list. A posting is a tuple
                      (document ID, array('I') of the 0-based positions of
                      the term in that tweet's term list). Postings follow
                      the order of tweets_data.
    """
    index = defaultdict(list)

    for tweet in tweets_data:
        doc_id = tweet_to_doc_map.get(tweet['id'])

        # Only tweets that resolve to a (truthy) document ID are indexed.
        if doc_id:
            # term -> positions inside this tweet, keyed in order of first appearance
            positions_by_term = {}
            for pos, term in enumerate(build_terms(tweet['content'])):
                if term not in positions_by_term:
                    positions_by_term[term] = array('I')
                positions_by_term[term].append(pos)

            # One posting per distinct term of the tweet.
            for term, positions in positions_by_term.items():
                index[term].append((doc_id, positions))

    return index

# Creating the inverted index
inverted_index = create_inverted_index(tweets_data, tweet_to_doc_map)

# Check a few entries in the inverted index. A frequent term can have a posting
# list with thousands of entries, so only the first postings of each term are
# printed (followed by a count of the rest) to keep the cell output readable.
max_postings_shown = 5
for term, postings in list(inverted_index.items())[:5]:
    print(f"Term: {term}")
    for doc_id, positions in postings[:max_postings_shown]:
        print(f" - Document ID: {doc_id}, Positions: {list(positions)}")
    if len(postings) > max_postings_shown:
        print(f" - ... and {len(postings) - max_postings_shown} more postings")

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
 - Document ID: doc_20561, Positions: [14]
 - Document ID: doc_20623, Positions: [1]
 - Document ID: doc_20674, Positions: [8]
 - Document ID: doc_20708, Positions: [4]
 - Document ID: doc_20747, Positions: [0]
 - Document ID: doc_20778, Positions: [2]
 - Document ID: doc_20789, Positions: [5]
 - Document ID: doc_20818, Positions: [0]
 - Document ID: doc_20832, Positions: [6]
 - Document ID: doc_20851, Positions: [17]
 - Document ID: doc_20855, Positions: [3]
 - Document ID: doc_20864, Positions: [15]
 - Document ID: doc_20992, Positions: [1]
 - Document ID: doc_21006, Positions: [15, 19]
 - Document ID: doc_21022, Positions: [7]
 - Document ID: doc_21063, Positions: [9]
 - Document ID: doc_21075, Positions: [3]
 - Document ID: doc_21101, Positions: [4]
 - Document ID: doc_21127, Positions: [9]
 - Document ID: doc_21153, Positions: [9]
 - Document ID: doc_21165, Positions: [0]
 - Document ID: doc_21228, Position

In [None]:
from collections import defaultdict

# Function to compute the TF-IDF score for a term in a specific document
def compute_tf_idf(inverted_index, total_docs, term, doc_id):
    """
    Compute the TF-IDF score for a term in a specific document.

    Arguments:
    inverted_index -- the inverted index with terms as keys and posting lists as values;
                      each posting is indexable as (document ID, positions)
    total_docs -- total number of documents in the dataset
    term -- the term for which TF-IDF is being computed
    doc_id -- the ID of the document for which TF-IDF is being computed

    Returns:
    tf_idf -- tf * log(total_docs / (df + 1)), where tf is the number of
              positions of the term in the document and df is the length of
              the term's posting list. 0 is returned when the term is not in
              the index or has no posting for doc_id.
    """
    # Look the term up with .get(): indexing with [] would insert an empty
    # posting list into a defaultdict index for every unseen term (making later
    # "term in inverted_index" checks true) and would raise on a plain dict.
    postings = inverted_index.get(term)
    if not postings:
        return 0

    # Calculate Term Frequency (TF): positions of the first posting for doc_id.
    # The generator stops at the first match instead of scanning the whole list.
    positions = next((posting[1] for posting in postings if posting[0] == doc_id), None)
    if positions is None:
        return 0
    tf = len(positions)  # Number of times the term appears in the document

    # Calculate Document Frequency (DF)
    df = len(postings)  # Number of documents containing the term

    # Calculate TF-IDF using the formula TF * IDF (the +1 smooths the ratio)
    idf = math.log(total_docs / (df + 1))
    return tf * idf

# Function to build TF-IDF vectors for documents based on query terms
def build_document_vectors(inverted_index, query_terms, total_docs):
    """
    Build TF-IDF vectors for each document containing the query terms.

    Component i of a document's vector is tf * log(total_docs / (df + 1)) for
    query_terms[i], where tf is the number of positions of the term in the
    document and df is the length of the term's posting list. Components for
    terms the document does not contain stay at 0.

    Arguments:
    inverted_index -- the inverted index with terms as keys and posting lists as values;
                      each posting is indexable as (document ID, positions)
    query_terms -- list of terms in the query
    total_docs -- total number of documents in the dataset

    Returns:
    doc_vectors -- defaultdict with document IDs as keys and TF-IDF vectors
                   (numpy arrays of length len(query_terms)) as values; only
                   documents containing at least one query term are present
    """
    doc_vectors = defaultdict(lambda: np.zeros(len(query_terms)))
    for i, term in enumerate(query_terms):
        postings = inverted_index.get(term)
        if not postings:
            continue

        # IDF depends only on the term, so it is computed once per term
        # rather than once per posting.
        idf = math.log(total_docs / (len(postings) + 1))

        # The term frequency is read from the posting being visited. Looking
        # the document up again in the posting list would cost a full scan
        # per posting, i.e. quadratic time for frequent terms.
        seen_docs = set()
        for posting in postings:
            doc_id = posting[0]
            # If a document ever appears twice for a term, its first posting wins.
            if doc_id in seen_docs:
                continue
            seen_docs.add(doc_id)
            doc_vectors[doc_id][i] = len(posting[1]) * idf
    return doc_vectors

# Function to build a TF-IDF vector for the query
def build_query_vector(inverted_index, query_terms, total_docs):
    """
    Build the query-side vector: one IDF weight per query term.

    Arguments:
    inverted_index -- the inverted index with terms as keys and posting lists as values
    query_terms -- list of terms in the query
    total_docs -- total number of documents in the dataset

    Returns:
    query_vector -- float numpy array of length len(query_terms). Component i
                    is log(total_docs / (df + 1)) with df the length of the
                    posting list of query_terms[i], or 0 when the term is not
                    a key of the index.
    """
    def idf_weight(term):
        # Terms absent from the index contribute nothing to the query vector.
        if term not in inverted_index:
            return 0.0
        doc_freq = len(inverted_index[term])
        return math.log(total_docs / (doc_freq + 1))

    return np.array([idf_weight(term) for term in query_terms], dtype=float)

# Function to calculate cosine similarity between the query vector and each document vector
def cosine_similarity(query_vector, doc_vector):
    """
    Cosine of the angle between two 1-D vectors.

    Arguments:
    query_vector -- TF-IDF vector for the query
    doc_vector -- TF-IDF vector for a document

    Returns:
    similarity -- dot(query, doc) / (|query| * |doc|), or 0 when either
                  vector has zero Euclidean norm
    """
    inner = np.dot(query_vector, doc_vector)
    norm_query = np.linalg.norm(query_vector)
    norm_doc = np.linalg.norm(doc_vector)

    # Only divide when both norms are non-zero; a zero vector has no direction.
    if norm_query != 0 and norm_doc != 0:
        return inner / (norm_query * norm_doc)
    return 0

# Function to rank documents based on cosine similarity with the query
def rank_documents_tf_idf_cosine(inverted_index, query_terms, total_docs):
    """
    Score every document that contains a query term and sort by score.

    Arguments:
    inverted_index -- the inverted index with terms as keys and posting lists as values
    query_terms -- list of terms in the query
    total_docs -- total number of documents in the dataset

    Returns:
    ranked_docs -- list of (document ID, cosine similarity) tuples, highest
                   similarity first; documents with equal scores keep the
                   order in which build_document_vectors produced them
    """
    vectors_by_doc = build_document_vectors(inverted_index, query_terms, total_docs)
    query_vector = build_query_vector(inverted_index, query_terms, total_docs)

    # Pair each document with its similarity to the query.
    scored = [
        (doc_id, cosine_similarity(query_vector, vector))
        for doc_id, vector in vectors_by_doc.items()
    ]

    # Stable in-place sort, best score first.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

# Query: "What is the Indian government view on the farmers' protest?"
query = "What is the Indian government view on the farmers' protest?"
# The query goes through the same preprocessing as the tweets so its terms
# match the index keys. query_processed is reused by the BM25 cell below.
query_processed = build_terms(query)
total_docs = len(tweet_to_doc_map)  # Number of tweet-id -> doc-id mappings, used as the collection size
ranked_docs = rank_documents_tf_idf_cosine(inverted_index, query_processed, total_docs)

# Display the top 20 ranked documents
print("Top 20 Ranked Documents by TF-IDF + Cosine Similarity:")
for doc_id, score in ranked_docs[:20]:
    print(f"Document ID: {doc_id}, Cosine Similarity Score: {score}")


Top 20 Ranked Documents by TF-IDF + Cosine Similarity:
Document ID: doc_1273, Cosine Similarity Score: 0.9321757179559246
Document ID: doc_11265, Cosine Similarity Score: 0.9321757179559246
Document ID: doc_16057, Cosine Similarity Score: 0.9321757179559246
Document ID: doc_28464, Cosine Similarity Score: 0.9321757179559246
Document ID: doc_20704, Cosine Similarity Score: 0.9262099832481728
Document ID: doc_17156, Cosine Similarity Score: 0.8883695631933285
Document ID: doc_21006, Cosine Similarity Score: 0.8883695631933285
Document ID: doc_36129, Cosine Similarity Score: 0.8703493953287428
Document ID: doc_18896, Cosine Similarity Score: 0.8653753366684499
Document ID: doc_10548, Cosine Similarity Score: 0.8525353378102423
Document ID: doc_13939, Cosine Similarity Score: 0.8525353378102423
Document ID: doc_35711, Cosine Similarity Score: 0.8525353378102423
Document ID: doc_3210, Cosine Similarity Score: 0.8474567278511329
Document ID: doc_17899, Cosine Similarity Score: 0.847456727851

In [None]:
# Constants for BM25
k1 = 1.5  # Tuning parameter for term frequency scaling
b = 0.75  # Tuning parameter for document length scaling

# Function to compute the average document length in the collection
def compute_avg_doc_length(doc_lengths):
    """
    Average of the document lengths.

    Arguments:
    doc_lengths -- dictionary with document IDs as keys and their lengths as values

    Returns:
    The mean of the values, or 0.0 for an empty dictionary (instead of
    raising ZeroDivisionError, so an empty collection can still be ranked
    to an empty result).
    """
    if not doc_lengths:
        return 0.0
    return sum(doc_lengths.values()) / len(doc_lengths)

# Function to calculate the BM25 score for a term in a specific document
def bm25_term_score(N, df, tf, doc_len, avg_doc_len):
    """
    BM25 contribution of one query term to one document.

    Uses the module-level tuning constants k1 and b.

    Arguments:
    N -- total number of documents
    df -- document frequency of the term
    tf -- term frequency of the term in the document
    doc_len -- length of the document
    avg_doc_len -- average document length in the collection

    Returns:
    score -- idf * saturated term frequency, with
             idf = log((N - df + 0.5) / (df + 0.5) + 1)
    """
    # Rarity of the term across the collection.
    rarity = math.log((N - df + 0.5) / (df + 0.5) + 1)

    # Length normalisation: documents longer than average are penalised,
    # to a degree controlled by b.
    length_norm = (1 - b) + b * (doc_len / avg_doc_len)

    # Term-frequency component; it saturates as tf grows, controlled by k1.
    saturation = ((k1 + 1) * tf) / (k1 * length_norm + tf)

    return rarity * saturation

# Function to rank documents based on BM25 score for a query
def rank_documents_bm25(inverted_index, query_terms, doc_lengths, total_docs):
    """
    Sum the BM25 term scores of the query terms per document and sort.

    Arguments:
    inverted_index -- the inverted index with terms as keys and posting lists as values
    query_terms -- list of terms in the query; a term listed twice is scored twice
    doc_lengths -- dictionary with document IDs as keys and their lengths as values;
                   must contain every document ID found in the visited postings
    total_docs -- total number of documents in the dataset

    Returns:
    ranked_docs -- list of (document ID, cumulative BM25 score) tuples in
                   descending score order
    """
    average_length = compute_avg_doc_length(doc_lengths)
    totals = defaultdict(float)

    for term in query_terms:
        # Terms unknown to the index cannot contribute to any document.
        if term in inverted_index:
            postings = inverted_index[term]
            doc_freq = len(postings)  # number of postings = document frequency

            for doc_id, positions in postings:
                contribution = bm25_term_score(
                    total_docs,
                    doc_freq,
                    len(positions),       # term frequency in this document
                    doc_lengths[doc_id],  # length of this document
                    average_length,
                )
                totals[doc_id] += contribution

    return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)

# Document length = number of whitespace-separated terms in the preprocessed tweet text.
doc_lengths = {doc_id: len(tweet_info['Tweet'].split()) for doc_id, tweet_info in doc_tweet_map.items()}
# Collection size for BM25: number of documents stored in doc_tweet_map.
# This rebinds the total_docs set by the TF-IDF cell.
total_docs = len(doc_tweet_map)

# query_processed comes from the TF-IDF cell above, so that cell must have run first.
ranked_docs = rank_documents_bm25(inverted_index, query_processed, doc_lengths, total_docs)

# Display the top 20 ranked documents
print("Top 20 Ranked Documents by BM25:")
for doc_id, score in ranked_docs[:20]:
    print(f"Document ID: {doc_id}, BM25 Score: {score}")


Top 20 Ranked Documents by BM25:
Document ID: doc_17156, BM25 Score: 13.636151512312594
Document ID: doc_30422, BM25 Score: 13.168913200225282
Document ID: doc_37712, BM25 Score: 10.871392391317688
Document ID: doc_10923, BM25 Score: 10.79965400759432
Document ID: doc_21006, BM25 Score: 10.359179922767837
Document ID: doc_20704, BM25 Score: 10.358118906529452
Document ID: doc_37241, BM25 Score: 10.333603630528476
Document ID: doc_46909, BM25 Score: 10.328318411051908
Document ID: doc_37560, BM25 Score: 10.177080587394158
Document ID: doc_37209, BM25 Score: 10.169551073889046
Document ID: doc_37219, BM25 Score: 10.169551073889046
Document ID: doc_47837, BM25 Score: 10.088309541241681
Document ID: doc_47839, BM25 Score: 10.088309541241681
Document ID: doc_47841, BM25 Score: 10.088309541241681
Document ID: doc_47843, BM25 Score: 10.088309541241681
Document ID: doc_47845, BM25 Score: 10.088309541241681
Document ID: doc_47848, BM25 Score: 10.088309541241681
Document ID: doc_48027, BM25 Scor

In [None]:
# Define the P-Score calculation function
def calculate_p_score(likes, retweets, followers,
                      alpha=0.5, beta=1.0, delta=0.8):
    """
    Popularity score of a tweet from its engagement counts.

    P-Score = alpha * likes + beta * retweets + delta * engagement_ratio,
    where engagement_ratio = (likes + retweets) / followers, taken as 0 when
    followers is not positive.

    Parameters:
    - likes (int): Number of likes the tweet received.
    - retweets (int): Number of retweets the tweet received.
    - followers (int): Number of followers of the tweet's author.
    - alpha (float): Weight for likes. Default is 0.5.
    - beta (float): Weight for retweets. Default is 1.0.
    - delta (float): Weight for engagement ratio. Default is 0.8.

    Returns:
    - float: The computed P-Score for the tweet.
    """
    interactions = likes + retweets

    # Guard against authors with no followers to avoid dividing by zero.
    if followers > 0:
        ratio = interactions / followers
    else:
        ratio = 0

    weighted_likes = alpha * likes
    weighted_retweets = beta * retweets
    weighted_ratio = delta * ratio
    return weighted_likes + weighted_retweets + weighted_ratio

# Calculate and rank documents by P-Score
def rank_documents_by_p_score(doc_tweet_map):
    """
    Compute the P-Score of every document and sort the documents by it.

    Parameters:
    - doc_tweet_map (dict): Dictionary where keys are document IDs and values are
      dictionaries containing tweet info. The keys 'Likes', 'Retweets' and
      'Followers' are read, defaulting to 0, 0 and 1 when absent.

    Returns:
    - ranked_docs (list): List of tuples (document ID, P-Score) sorted by P-Score in descending order
    """
    # NOTE(review): the tweet dictionaries built by process_tweet in this
    # notebook have no 'Followers' key, so the default of 1 is used for them
    # and the engagement ratio reduces to likes + retweets. Confirm whether
    # the author's follower count should be stored.
    scored = [
        (
            doc_id,
            calculate_p_score(
                info.get('Likes', 0),
                info.get('Retweets', 0),
                info.get('Followers', 1),
            ),
        )
        for doc_id, info in doc_tweet_map.items()
    ]

    # Stable in-place sort, highest P-Score first.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

# Rank documents by P-Score. This ranking is query-independent: it uses only
# the engagement counts stored in doc_tweet_map.
ranked_docs = rank_documents_by_p_score(doc_tweet_map)

# Display the top-ranked documents based on P-Score
print("Top-ranked documents by P-Score:")
for doc_id, score in ranked_docs[:20]:  # Display top 20 results
    print(f"Document ID: {doc_id}, P-Score: {score}")


Top-ranked documents by P-Score:
Document ID: doc_3203, P-Score: 47349.600000000006
Document ID: doc_46206, P-Score: 28520.100000000002
Document ID: doc_45142, P-Score: 22561.300000000003
Document ID: doc_38012, P-Score: 20259.5
Document ID: doc_38410, P-Score: 20077.6
Document ID: doc_38262, P-Score: 19929.300000000003
Document ID: doc_35993, P-Score: 17205.0
Document ID: doc_27071, P-Score: 15231.2
Document ID: doc_9846, P-Score: 13910.6
Document ID: doc_38379, P-Score: 12391.0
Document ID: doc_41472, P-Score: 11900.8
Document ID: doc_28154, P-Score: 11785.5
Document ID: doc_18306, P-Score: 11322.7
Document ID: doc_23286, P-Score: 10439.8
Document ID: doc_27974, P-Score: 9859.5
Document ID: doc_31312, P-Score: 9262.6
Document ID: doc_46278, P-Score: 9015.8
Document ID: doc_13132, P-Score: 8902.400000000001
Document ID: doc_28056, P-Score: 8575.8
Document ID: doc_16447, P-Score: 8402.5


In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors, Word2Vec
import numpy as np
# NOTE(review): this import rebinds the name cosine_similarity, which the
# TF-IDF cell earlier in the notebook defines as a two-vector helper. After
# this cell runs, rank_documents_tf_idf_cosine would call scikit-learn's
# function instead (which, per its documentation, works on 2-D inputs) unless
# the TF-IDF cell is re-executed first. Consider an alias for one of the two.
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Load the word2vec model (pretrained "word2vec-google-news-300" vectors,
# obtained through gensim's downloader API)
model = api.load("word2vec-google-news-300")

# Function to compute the average Word2Vec vector for a document
def average_word2vec_vector(text, model, vector_size=300):
    """
    Mean of the embeddings of the whitespace-separated words of `text`.

    Arguments:
    text -- string whose words are looked up in the model
    model -- object supporting `word in model` and `model[word]`
    vector_size -- length of the zero vector returned when no word is known

    Returns:
    The element-wise mean of the vectors of the words found in the model
    (words missing from the model are ignored), or a zero vector of length
    vector_size when none of the words is found.
    """
    known_vectors = []
    for token in text.split():
        if token in model:
            known_vectors.append(model[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(vector_size)

# Create document vectors for all tweets in the dataset
# The text averaged here is the preprocessed 'Tweet' field, i.e. the stemmed
# terms produced by build_terms.
# NOTE(review): stemmed forms may be missing from the pretrained vocabulary,
# in which case average_word2vec_vector silently ignores them - confirm that
# embedding the stemmed text (rather than the raw text) is intended.
doc_vectors = {}
for doc_id, tweet_info in doc_tweet_map.items():
    tweet_text = tweet_info['Tweet']
    doc_vectors[doc_id] = average_word2vec_vector(tweet_text, model)

# Define multi-word queries
multi_word_queries = [
    "farmers protest",
    "indian government",
    "support farmers",
    "police action",
    "human rights"
]

# Preprocess and compute the query vectors
# Keys are 1-based positions in multi_word_queries. The loop variable `query`
# rebinds the notebook-level `query` string set in the TF-IDF cell.
query_vectors = {}
for i, query in enumerate(multi_word_queries, 1):
    processed_query = ' '.join(build_terms(query))
    query_vectors[i] = average_word2vec_vector(processed_query, model)

# Rank documents for each query based on cosine similarity
def rank_documents_word2vec_cosine(query_vectors, doc_vectors, top_k=20):
    """
    Rank documents for each query using cosine similarity with Word2Vec vectors.

    The similarity is computed directly with numpy, so the function does not
    depend on which `cosine_similarity` (the notebook helper or scikit-learn's)
    happens to be bound when it is called.

    Arguments:
    query_vectors -- dictionary mapping query IDs to 1-D numpy vectors
    doc_vectors -- dictionary mapping document IDs to 1-D numpy vectors of the
                   same length as the query vectors
    top_k -- number of documents kept per query (default 20); None keeps all

    Returns:
    ranked_results -- dictionary mapping each query ID to a list of
                      (document ID, similarity) tuples in descending order of
                      similarity, truncated to top_k. Documents whose vector
                      has zero norm are left out; a query with a zero-norm
                      vector gets an empty list.
    """
    # Document norms do not depend on the query: compute them once up front.
    doc_norms = {doc_id: np.linalg.norm(vector) for doc_id, vector in doc_vectors.items()}

    ranked_results = {}
    for query_id, query_vector in query_vectors.items():
        doc_scores = {}
        query_norm = np.linalg.norm(query_vector)

        # A zero query vector (no query word known to the model) has no
        # direction, so no document can be scored against it.
        if query_norm != 0:
            for doc_id, doc_vector in doc_vectors.items():
                doc_norm = doc_norms[doc_id]
                if doc_norm == 0:  # Avoid zero vectors
                    continue
                similarity = np.dot(query_vector, doc_vector) / (query_norm * doc_norm)
                doc_scores[doc_id] = float(similarity)

        # Sort documents by similarity score in descending order and take the top_k
        ranked_results[query_id] = sorted(
            doc_scores.items(), key=lambda x: x[1], reverse=True
        )[:top_k]

    return ranked_results

# Run the ranking for each query
# Unlike the earlier cells, ranked_docs is now a dictionary:
# query ID -> list of (document ID, score) tuples.
ranked_docs = rank_documents_word2vec_cosine(query_vectors, doc_vectors)

# Display the top 20 ranked documents for each query
# Query IDs are 1-based, hence the "- 1" when looking up the query text.
for query_id, results in ranked_docs.items():
    print(f"\nTop 20 Ranked Documents for Query '{multi_word_queries[query_id - 1]}':")
    for doc_id, score in results:
        print(f"Document ID: {doc_id}, Cosine Similarity Score: {score}")


Top 20 Ranked Documents for Query 'farmers protest':
Document ID: doc_3521, Cosine Similarity Score: 1.0
Document ID: doc_4681, Cosine Similarity Score: 1.0
Document ID: doc_5727, Cosine Similarity Score: 1.0
Document ID: doc_5973, Cosine Similarity Score: 1.0
Document ID: doc_7295, Cosine Similarity Score: 1.0
Document ID: doc_7433, Cosine Similarity Score: 1.0
Document ID: doc_7438, Cosine Similarity Score: 1.0
Document ID: doc_11824, Cosine Similarity Score: 1.0
Document ID: doc_12388, Cosine Similarity Score: 1.0
Document ID: doc_14759, Cosine Similarity Score: 1.0
Document ID: doc_16557, Cosine Similarity Score: 1.0
Document ID: doc_20024, Cosine Similarity Score: 1.0
Document ID: doc_20877, Cosine Similarity Score: 1.0
Document ID: doc_21162, Cosine Similarity Score: 1.0
Document ID: doc_21259, Cosine Similarity Score: 1.0
Document ID: doc_21323, Cosine Similarity Score: 1.0
Document ID: doc_21436, Cosine Similarity Score: 1.0
Document ID: doc_21450, Cosine Similarity Score: 1.0