In [None]:
pip install elasticsearch
pip install gensim
pip install sklearn

In [6]:
import gensim
from elasticsearch import Elasticsearch
from gensim.models import FastText
from sklearn.datasets import fetch_20newsgroups

# Load the dataset
newsgroups = fetch_20newsgroups(subset='all')

# Preprocess the documents
preprocessed_docs = []
for doc in newsgroups.data:
    # Tokenize the document
    tokens = gensim.utils.simple_preprocess(doc.lower())
    # Remove stop words and stem the tokens
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    # Join the stemmed tokens back into a string
    preprocessed_doc = ' '.join(stemmed_tokens)
    preprocessed_docs.append(preprocessed_doc)

# Train the FastText model
model = FastText(preprocessed_docs, vector_size=300, window=5, min_count=5, workers=4)

# Save the model to a binary file
model.save('model.bin')

# Initialize Elasticsearch client with URL
es = Elasticsearch(['http://localhost:9200'])

# Delete the index if it already exists
index_name = 'my_index'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Create index with appropriate mappings
index_mappings = {
    'mappings': {
        'properties': {
            'text': {
                'type': 'text'
            },
            'vector': {
                'type': 'dense_vector',
                'dims': 300
            }
        }
    }
}
es.indices.create(index=index_name, body=index_mappings)

# Iterate over preprocessed documents and generate vectors
for i, doc in enumerate(preprocessed_docs):
    # Split the preprocessed document into tokens
    tokens = doc.split()
    # Generate the vector for the document by averaging the vectors of its tokens
    vector_sum = 0
    count = 0
    for token in tokens:
        if token in model.wv:
            vector_sum += model.wv[token]
            count += 1
    if count > 0:
        vector = vector_sum / count
        # Store the document and its vector in the Elasticsearch index
        es.index(index=index_name, id=i, body={'text': doc, 'vector': vector.tolist()})

In [2]:
import gensim
from elasticsearch import Elasticsearch

# Initialize Elasticsearch client with URL
es = Elasticsearch(['http://localhost:9200'])

# Load the pre-trained FastText model
model = gensim.models.FastText.load('model.bin')

# Get user query
user_query = input('Enter your query: ')

# Preprocess the user query
tokens = gensim.utils.simple_preprocess(user_query.lower())
stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
preprocessed_query = ' '.join(stemmed_tokens)

# Generate the vector for the user query by averaging the vectors of its tokens
vector_sum = 0
count = 0
for token in preprocessed_query.split():
    if token in model.wv:
        vector_sum += model.wv[token]
        count += 1
if count > 0:
    query_vector = vector_sum / count

    # Search for similar documents using Elasticsearch
    search_body = {
        'query': {
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                    'params': {
                        'query_vector': query_vector.tolist()
                    }
                }
            }
        },
        '_source': {
            'includes': ['text']
        }
    }
    search_results = es.search(index='my_index', body=search_body)['hits']['hits']

    # Print the top 10 most similar documents
    print('Top 10 most similar documents:')
    for i, hit in enumerate(search_results[:10]):
        print(f'{i+1}. {hit["_source"]["text"]}')
else:
    print('No valid tokens in query')

Query vector: [ 1.54352179e-04  8.35887040e-05  3.29365546e-04  7.56893132e-05
 -2.44254097e-05  1.18353482e-05  2.91574070e-05 -1.24124155e-04
  3.81721213e-04  3.33571254e-04  2.02411407e-04  3.98195334e-06
 -1.70753192e-04 -1.60401014e-05  5.46945266e-05  5.21716604e-04
  2.36337393e-04 -1.52272303e-04 -2.55212566e-04 -1.44240810e-04
 -1.96665042e-05  3.61155311e-04 -2.38097418e-05 -1.40672913e-04
  1.79930066e-04  1.85321624e-05 -2.27287819e-04 -5.39382600e-05
  1.11497655e-04  4.09221713e-04 -2.11956183e-04 -1.35426526e-04
 -9.27258952e-05 -3.18381390e-05  8.01294373e-05 -2.11412858e-04
 -1.09799308e-04  4.61659598e-04  3.45820270e-04 -5.60979424e-05
 -4.39650466e-05  2.40595240e-04  2.59514200e-04  2.14081709e-04
 -2.40453319e-05  1.07260828e-04 -4.91869105e-05  1.49703003e-04
  9.36621000e-05  4.11564761e-05 -3.22865642e-04 -7.14768394e-05
 -1.37804644e-04 -1.30039130e-04  3.40868683e-05 -6.06160502e-05
 -1.33106121e-04  2.94821744e-04  4.93076441e-05 -1.12611044e-04
 -1.3591301

In [9]:
import gensim
from elasticsearch import Elasticsearch

# Initialize Elasticsearch client with URL
es = Elasticsearch(['http://localhost:9200'])

# Load the pre-trained FastText model
model = gensim.models.FastText.load('model.bin')

# Get user query
user_query = input('Enter your query: ')

index_name = 'my_index'

# Preprocess the user query
tokens = gensim.utils.simple_preprocess(user_query.lower())
stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
preprocessed_query = ' '.join(stemmed_tokens)

# Construct the Elasticsearch query for the current query using BM25 similarity
search_body = {
    'query': {
        'match': {
            'text': {
                'query': preprocessed_query,
                'analyzer': 'standard'
            }
        }
    },
    '_source': {
        'includes': ['text']
    }
}

# Execute the search and retrieve the top k documents
search_results = es.search(index=index_name, body=search_body)['hits']['hits']

for i, hit in enumerate(search_results[:12]):
    print(f'{i+1}. {hit["_id"]}')

1. 7975
2. 9295
3. 650
4. 14782
5. 3108
6. 541
7. 547
8. 18752
9. 8914
10. 795
11. 14352
12. 11028


In [6]:
len(search_results)

10

In [10]:
benchmark_queries = ["har write keyword tvtwm icon manag need help resourc bind tvtwm like icon manag iconifi window show icon list pan section virtual desktop try deiconifi window icon sp earlier like window deiconifi current region resourc us",
                     "es com blain gardner write articl bong kfp slac mac slac stanford edu bong slac stanford edu eric bong write articl gp cbnew",
                     "tmc spartan ac brocku ca tim ciceran subject turn photograph imag thermal print neg organ brock univers st catharin ontario newsread tin version pl line jennif",
                     "miss point think ll admit atheist lot sleev suspect nah encourag peopl learn atheism littl atheist sleev suspect actual meager want",
                     "nynexst com robert silver write send rush linbaugh clinton take awai right privaci govt standard take peopl lot monei drug dealer abl justifi de stuff slam clinton air rob recal rush sai compuserv account want mail need account number mail gatewai",
                     "enjoi lunch saturdai foodi milford nh assort nedod folk dean cookson ye left countri mention wire diagram",
                     "superior carleton ca mike richardson write lot good point mormon found father hardli great religi freedom histori read form opinion left practic religi freedom practic",
                     "acceler manufactur lasermast eden prairi mn willi vill walveranta tel fax linda av apt finland oakland ca fax automat recogn usa email",
                     "carolina state univers project eo line articl apr",
                     "univers"]

relevant_docs = [
    {"99","357","3248","5079","9773","12617","9794","1239","5701","3643"},
    {"139","7390","13104","8580","5532","8560","4259","18593","544","2231"},
    {"164","111","9973","3135","18786","1258","10921","2003","7819","7947"},
    {"209","11087","7258","3000","16918","8940","12373","11166","12000","5652"},
    {"812","5910","13658","8966","780","11642","18667","3269","7040","8780"},
    {"650","7975","9295","14780","3108","541","547","18752","8915","795"},
    {"652","18394","3180","14949","18614","18672","14840","2832","8668","18455"},
    {"456","411","6428","12170","2850","17151","13703","16548","4096","6969"},
    {"496","5580","7619","9393","8763","18500","3382","5552","5457","12792"},
    {"726","730","7945","700","17115","15673","7900","10929","14553","5311"}
]


In [None]:
relevant_docs = [
    {"99","357","3248","5079","9773","12617","9794","1239","5701","3643"},
    {"139","7390","13104","8580","5532","8560","4259","18593","544","2231"},
    {"164","111","9973","3135","18786","1258","10921","2003","7819","7947"},
    {"209","11087","7258","3000","16918","8940","12373","11166","12000","5652"},
    {"812","5910","13658","8966","780","11642","18667","3269","7040","8780"},
    {"650","7975","9295","14780","3108","541","547","18752","8915","795"},
    {"652","18394","3180","14949","18614","18672","14840","2832","8668","18455"},
    {"456","411","6428","12170","2850","17151","13703","16548","4096","6969"},
    {"496","5580","7619","9393","8763","18500","3382","5552","5457","12792"},
    {"726","730","7945","700","17115","15673","7900","10929","14553","5311"}
]

In [12]:
import math
from sklearn.metrics import average_precision_score
import numpy as np

def calculate_nDCG(ranked_relevance):
    ideal_ranking = sorted(ranked_relevance, reverse=True)

    # Calculate the discounted cumulative gain (DCG)
    dcg = 0
    for i in range(len(ranked_relevance)):
        dcg += (2 ** ranked_relevance[i] - 1) / (math.log2(i + 2))

    # Calculate the ideal discounted cumulative gain (IDCG)
    idcg = 0
    for i in range(len(ideal_ranking)):
        idcg += (2 ** ideal_ranking[i] - 1) / (math.log2(i + 2))

    # Calculate the nDCG
    if idcg == 0:
        return 0
    else:
        return dcg / idcg

# k = 10  # The number of top documents to consider

precision_scores = []
recall_scores = []
average_precision_scores = []
ndcg_scores = []

es = Elasticsearch(['http://localhost:9200'])

model = gensim.models.FastText.load('model.bin')


for i, query in enumerate(benchmark_queries):
    relevant_doc = relevant_docs[i]  # Retrieve relevant documents for the query

    # Preprocess the user query
    tokens = gensim.utils.simple_preprocess(query.lower())
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    preprocessed_query = ' '.join(stemmed_tokens)

    # Generate the vector for the user query by averaging the vectors of its tokens
    vector_sum = 0
    count = 0
    for token in preprocessed_query.split():
        if token in model.wv:
            vector_sum += model.wv[token]
            count += 1
    if count > 0:
        query_vector = vector_sum / count
    
    # Construct the Elasticsearch query for the current query
    search_body = {
        'query': {
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                    'params': {
                        'query_vector': query_vector.tolist()
                    }
                }
            }
        },
        '_source': {
            'includes': ['text']
        }
    }

    # Execute the search and retrieve the top 10 documents
    search_results = es.search(index='my_index', body=search_body)['hits']['hits']

    # Calculate Precision@K and Recall@K
    retrieved_doc_ids = [hit["_id"] for hit in search_results]

    # Initialize variables for TP, TN, FP, FN
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    # Calculate TP, TN, FP, FN
    for doc in retrieved_doc_ids:
        if doc in relevant_doc:
            TP += 1
        else:
            FP += 1
    
    for doc in relevant_doc:
        if doc not in retrieved_doc_ids:
            FN += 1
    
    total_documents = len(retrieved_doc_ids)
    TN = total_documents - (TP + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    
    precision_scores.append(precision)
    recall_scores.append(recall)

    # Calculate Average Precision
    relevance_scores = [1 if doc_id in relevant_doc else 0 for doc_id in retrieved_doc_ids]
    k = len(relevant_doc)
    average_precision = average_precision_score(relevance_scores, range(1, k+1))
    average_precision_scores.append(average_precision)

    # Calculate nDCG
    ranked_relevance = [1 if doc_id in relevant_doc else 0 for doc_id in retrieved_doc_ids]
    ndcg = calculate_nDCG(ranked_relevance)
    ndcg_scores.append(ndcg)

    # Calculate the ideal ranking
    ideal_ranking = sorted(ranked_relevance, reverse=True)

    # Calculate the discounted cumulative gain (DCG)
    dcg = 0
    for i in range(len(ranked_relevance)):
        dcg += (2 ** ranked_relevance[i] - 1) / (math.log2(i + 2))

    # Calculate the ideal discounted cumulative gain (IDCG)
    idcg = 0
    for i in range(len(ideal_ranking)):
        idcg += (2 ** ideal_ranking[i] - 1) / (math.log2(i + 2))

    # Calculate the nDCG
    if idcg == 0:
        ndcg = 0
    else:
        ndcg = dcg / idcg
    ndcg_scores.append(ndcg)

# Calculate the mean of each metric
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_average_precision = np.mean(average_precision_scores)
mean_ndcg = np.mean(ndcg_scores)

# Print or return the results
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean Average Precision: {mean_average_precision}")
print(f"Mean nDCG: {mean_ndcg}")


Mean Precision: 0.3
Mean Recall: 0.3
Mean Average Precision: 0.31157407407407406
Mean nDCG: 0.7795429892889458




In [13]:
import math
from sklearn.metrics import average_precision_score
import numpy as np

def calculate_nDCG(ranked_relevance):
    ideal_ranking = sorted(ranked_relevance, reverse=True)

    # Calculate the discounted cumulative gain (DCG)
    dcg = 0
    for i in range(len(ranked_relevance)):
        dcg += (2 ** ranked_relevance[i] - 1) / (math.log2(i + 2))

    # Calculate the ideal discounted cumulative gain (IDCG)
    idcg = 0
    for i in range(len(ideal_ranking)):
        idcg += (2 ** ideal_ranking[i] - 1) / (math.log2(i + 2))

    # Calculate the nDCG
    if idcg == 0:
        return 0
    else:
        return dcg / idcg

# k = 10  # The number of top documents to consider

precision_scores = []
recall_scores = []
average_precision_scores = []
ndcg_scores = []

es = Elasticsearch(['http://localhost:9200'])

model = gensim.models.FastText.load('model.bin')


for i, query in enumerate(benchmark_queries):
    relevant_doc = relevant_docs[i]  # Retrieve relevant documents for the query

    # Preprocess the user query
    tokens = gensim.utils.simple_preprocess(query.lower())
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    preprocessed_query = ' '.join(stemmed_tokens)
    
    # Construct the Elasticsearch query for the current query
    search_body = {
        'query': {
            'match': {
                'text': {
                    'query': preprocessed_query,
                    'analyzer': 'standard'
                }
            }
        },
        '_source': {
            'includes': ['text']
        }
    }

    # Execute the search and retrieve the top 10 documents
    search_results = es.search(index='my_index', body=search_body)['hits']['hits']

    # Calculate Precision@K and Recall@K
    retrieved_doc_ids = [hit["_id"] for hit in search_results]

    # Initialize variables for TP, TN, FP, FN
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    # Calculate TP, TN, FP, FN
    for doc in retrieved_doc_ids:
        if doc in relevant_doc:
            TP += 1
        else:
            FP += 1
    
    for doc in relevant_doc:
        if doc not in retrieved_doc_ids:
            FN += 1
    
    total_documents = len(retrieved_doc_ids)
    TN = total_documents - (TP + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    
    precision_scores.append(precision)
    recall_scores.append(recall)

    # Calculate Average Precision
    relevance_scores = [1 if doc_id in relevant_doc else 0 for doc_id in retrieved_doc_ids]
    k = len(relevant_doc)
    average_precision = average_precision_score(relevance_scores, range(1, k+1))
    average_precision_scores.append(average_precision)

    # Calculate nDCG
    ranked_relevance = [1 if doc_id in relevant_doc else 0 for doc_id in retrieved_doc_ids]
    ndcg = calculate_nDCG(ranked_relevance)
    ndcg_scores.append(ndcg)

    # Calculate the ideal ranking
    ideal_ranking = sorted(ranked_relevance, reverse=True)

    # Calculate the discounted cumulative gain (DCG)
    dcg = 0
    for i in range(len(ranked_relevance)):
        dcg += (2 ** ranked_relevance[i] - 1) / (math.log2(i + 2))

    # Calculate the ideal discounted cumulative gain (IDCG)
    idcg = 0
    for i in range(len(ideal_ranking)):
        idcg += (2 ** ideal_ranking[i] - 1) / (math.log2(i + 2))

    # Calculate the nDCG
    if idcg == 0:
        ndcg = 0
    else:
        ndcg = dcg / idcg
    ndcg_scores.append(ndcg)

# Calculate the mean of each metric
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_average_precision = np.mean(average_precision_scores)
mean_ndcg = np.mean(ndcg_scores)

# Print or return the results
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean Average Precision: {mean_average_precision}")
print(f"Mean nDCG: {mean_ndcg}")


Mean Precision: 0.55
Mean Recall: 0.55
Mean Average Precision: 0.5195287226001513
Mean nDCG: 0.8183706974277765




In [14]:
import csv

# Create a list to store the data for the CSV file
csv_data = []

precision_scores = []
recall_scores = []
average_precision_scores = []
ndcg_scores = []

es = Elasticsearch(['http://localhost:9200'])

model = gensim.models.FastText.load('model.bin')

for i, query in enumerate(benchmark_queries):
    relevant_doc = relevant_docs[i]  # Retrieve relevant documents for the query

    # Preprocess the user query
    tokens = gensim.utils.simple_preprocess(query.lower())
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    preprocessed_query = ' '.join(stemmed_tokens)

    # Generate the vector for the user query by averaging the vectors of its tokens
    vector_sum = 0
    count = 0
    for token in preprocessed_query.split():
        if token in model.wv:
            vector_sum += model.wv[token]
            count += 1
    if count > 0:
        query_vector = vector_sum / count
    
    # Construct the Elasticsearch query for the current query
    search_body = {
        'query': {
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                    'params': {
                        'query_vector': query_vector.tolist()
                    }
                }
            }
        },
        '_source': {
            'includes': ['text']
        }
    }

    # Execute the search and retrieve the top 10 documents
    search_results = es.search(index='my_index', body=search_body)['hits']['hits']

    # Create a list of lists with the data for the current query
    query_data = []
    for hit in search_results:
        doc_id = hit["_id"]
        cosine_score = hit["_score"]
        is_relevant = 1 if doc_id in relevant_doc else 0
        query_data.append([query, doc_id, cosine_score - 1, is_relevant])

    # Append the query data to the CSV data
    csv_data.extend(query_data)

# Define the CSV file path
csv_file_path = "retrieval_results_wordvec.csv"

# Write the data to a CSV file
with open(csv_file_path, "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(["Query", "Doc_ID", "Cosine_Score", "Relevance_Judgment"])
    # Write the query data
    csv_writer.writerows(csv_data)

# Print or return the results
print(f"CSV file saved: {csv_file_path}")


CSV file saved: retrieval_results_wordvec.csv


In [15]:
import csv

# Create a list to store the data for the CSV file
csv_data = []

precision_scores = []
recall_scores = []
average_precision_scores = []
ndcg_scores = []

es = Elasticsearch(['http://localhost:9200'])

model = gensim.models.FastText.load('model.bin')

for i, query in enumerate(benchmark_queries):
    relevant_doc = relevant_docs[i]  # Retrieve relevant documents for the query

    # Preprocess the user query
    tokens = gensim.utils.simple_preprocess(query.lower())
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    preprocessed_query = ' '.join(stemmed_tokens)
    
    # Construct the Elasticsearch query for the current query
    search_body = {
        'query': {
            'match': {
                'text': {
                    'query': preprocessed_query,
                    'analyzer': 'standard'
                }
            }
        },
        '_source': {
            'includes': ['text']
        }
    }

    # Execute the search and retrieve the top 10 documents
    search_results = es.search(index='my_index', body=search_body)['hits']['hits']

    # Create a list of lists with the data for the current query
    query_data = []
    for hit in search_results:
        doc_id = hit["_id"]
        cosine_score = hit["_score"]
        is_relevant = 1 if doc_id in relevant_doc else 0
        query_data.append([query, doc_id, cosine_score - 1, is_relevant])

    # Append the query data to the CSV data
    csv_data.extend(query_data)

# Define the CSV file path
csv_file_path = "retrieval_results_bm25.csv"

# Write the data to a CSV file
with open(csv_file_path, "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(["Query", "Doc_ID", "Cosine_Score", "Relevance_Judgment"])
    # Write the query data
    csv_writer.writerows(csv_data)

# Print or return the results
print(f"CSV file saved: {csv_file_path}")


CSV file saved: retrieval_results_bm25.csv


In [17]:
import csv

# Create a list to store the precision and recall data
precision_recall_data = []

k = 10  # The number of top documents to consider

es = Elasticsearch(['http://localhost:9200'])

model = gensim.models.FastText.load('model.bin')

for i, query in enumerate(benchmark_queries):
    relevant_doc = relevant_docs[i]  # Retrieve relevant documents for the query

    # Preprocess the user query
    tokens = gensim.utils.simple_preprocess(query.lower())
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    preprocessed_query = ' '.join(stemmed_tokens)

    # Generate the vector for the user query by averaging the vectors of its tokens
    vector_sum = 0
    count = 0
    for token in preprocessed_query.split():
        if token in model.wv:
            vector_sum += model.wv[token]
            count += 1
    if count > 0:
        query_vector = vector_sum / count
    
    # Construct the Elasticsearch query for the current query
    search_body = {
        'query': {
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                    'params': {
                        'query_vector': query_vector.tolist()
                    }
                }
            }
        },
        '_source': {
            'includes': ['text']
        }
    }

    # Execute the search and retrieve the top 10 documents
    search_results = es.search(index='my_index', body=search_body)['hits']['hits']

    # Calculate Precision@K and Recall@K
    retrieved_doc_ids = [hit["_id"] for hit in search_results]
    true_positives = len(set(retrieved_doc_ids).intersection(relevant_doc))
    false_positive = len(set(retrieved_doc_ids)) - true_positives
    false_negative = len(relevant_doc) - true_positives
    precision = true_positives / (true_positives + false_positive)
    recall = true_positives / (true_positives + false_negative)
    precision_scores.append(precision)
    recall_scores.append(recall)

    # Append precision and recall values for the current query
    precision_recall_data.append([query, precision, recall])

# Define the CSV file path
csv_file_path = "precision_recall_data_wordvec.csv"

# Write the precision and recall data to a CSV file
with open(csv_file_path, "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(["Query", "Precision", "Recall"])
    # Write the precision and recall data
    csv_writer.writerows(precision_recall_data)

# Print or return the results
print(f"CSV file saved: {csv_file_path}")


CSV file saved: precision_recall_data_wordvec.csv


In [18]:
import csv

# Create a list to store the precision and recall data
precision_recall_data = []

k = 10  # The number of top documents to consider

es = Elasticsearch(['http://localhost:9200'])

model = gensim.models.FastText.load('model.bin')

for i, query in enumerate(benchmark_queries):
    relevant_doc = relevant_docs[i]  # Retrieve relevant documents for the query

    # Preprocess the user query
    tokens = gensim.utils.simple_preprocess(query.lower())
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    preprocessed_query = ' '.join(stemmed_tokens)

    # Generate the vector for the user query by averaging the vectors of its tokens
    vector_sum = 0
    count = 0
    for token in preprocessed_query.split():
        if token in model.wv:
            vector_sum += model.wv[token]
            count += 1
    if count > 0:
        query_vector = vector_sum / count
    
    # Construct the Elasticsearch query for the current query
    search_body = {
        'query': {
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                    'params': {
                        'query_vector': query_vector.tolist()
                    }
                }
            }
        },
        '_source': {
            'includes': ['text']
        }
    }

    # Execute the search and retrieve the top 10 documents
    search_results = es.search(index='my_index', body=search_body)['hits']['hits']

    # Calculate Precision@K and Recall@K
    retrieved_doc_ids = [hit["_id"] for hit in search_results]
    true_positives = len(set(retrieved_doc_ids).intersection(relevant_doc))
    false_positive = len(set(retrieved_doc_ids)) - true_positives
    false_negative = len(relevant_doc) - true_positives
    precision = true_positives / (true_positives + false_positive)
    recall = true_positives / (true_positives + false_negative)
    precision_scores.append(precision)
    recall_scores.append(recall)

    # Append precision and recall values for the current query
    precision_recall_data.append([query, precision, recall])

# Define the CSV file path
csv_file_path = "precision_recall_data_bm25.csv"

# Write the precision and recall data to a CSV file
with open(csv_file_path, "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(["Query", "Precision", "Recall"])
    # Write the precision and recall data
    csv_writer.writerows(precision_recall_data)

# Print or return the results
print(f"CSV file saved: {csv_file_path}")


CSV file saved: precision_recall_data_bm25.csv
