In [37]:
import csv
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
import numpy as np
import pickle

In [38]:
# Locations of the sample question and answer CSV files read by this notebook.
_PROJECT_DIR = r"R:\Study\5th Sem\Information Retrieval\IR PROJECT"
questions_file = _PROJECT_DIR + r"\Questions_sample.csv"
answers_file = _PROJECT_DIR + r"\Answers_sample.csv"

In [39]:
# Words ignored when counting term frequencies for titles and queries.
# Kept as one whitespace-separated string so entries are easy to scan and edit.
stop_words = set("""
    a about above after again against all am an and
    any are aren't as at be because been before being
    below between both but by can could couldn't did
    didn't do does doesn't doing don't down during each
    few for from further had hadn't has hasn't have
    haven't having he he's her here here's hers herself
    him himself his how i i'm if in into is isn't
    it it's its itself just ll m ma me might mightn't
    more most must mustn't my myself needn't no nor
    not now o of off on once only or other our
    ours ourselves out over own re s same shan't she
    she's should should've so some such t than that
    that's the their theirs them themselves then there
    there's these they they're this those through to too
    under until up ve very was wasn't we we're were
    weren't what what's when where which while who who's
    whom why will with won't would wouldn't you you're
    your yours yourself yourselves
""".split())

In [40]:
import tensorflow_hub as hub

# TF Hub handle for the Universal Sentence Encoder; the loaded object is
# called with a list of strings to obtain their embeddings.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
use_model = hub.load(USE_MODEL_URL)

In [41]:
# Shared index state: filled by precompute_tfidf() or replaced by the
# pickle-loading cell, and read by process_query_combined().
idf = dict()              # word -> inverse document frequency
tfidf_documents = dict()  # question id -> {word: TF-IDF weight}
question_titles = dict()  # question id -> lower-cased title
use_embeddings = dict()   # question id -> USE embedding (numpy array)

In [42]:
# Restore the cached index from disk so the corpus does not have to be
# re-processed on every run.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
_SAVED_DIR = r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\HYBRID'


def _load_saved(stem):
    """Unpickle and return the object stored in ``<stem>.pkl`` under ``_SAVED_DIR``."""
    with open(_SAVED_DIR + '\\' + stem + '.pkl', 'rb') as handle:
        return pickle.load(handle)


idf = _load_saved('idf')
tfidf_documents = _load_saved('tfidf_documents')
question_titles = _load_saved('question_titles')
use_embeddings = _load_saved('use_embeddings')

In [43]:
def normalize_scores(similarity_dict):
    """Min-max rescale the values of ``similarity_dict`` onto the range [0, 1].

    Args:
        similarity_dict: mapping of key -> numeric score.

    Returns:
        A new dict with the same keys in the same order. An empty (or falsy)
        input yields ``{}``. When every score is identical there is no spread
        to scale by, so every key is mapped to 1.0.
    """
    if not similarity_dict:
        return {}

    scores = similarity_dict.values()
    highest = max(scores)
    lowest = min(scores)

    # Degenerate case: all scores equal, avoid dividing by zero.
    if highest == lowest:
        return dict.fromkeys(similarity_dict, 1.0)

    spread = highest - lowest
    rescaled = {}
    for key, score in similarity_dict.items():
        rescaled[key] = (score - lowest) / spread
    return rescaled

In [44]:
def cosine_similarity_use(query_embed, candidate_embed):
    """Cosine similarity between two embedding vectors.

    Args:
        query_embed: embedding of the query.
        candidate_embed: embedding of the candidate document.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or 0 when either vector has zero L2 norm
        (the cosine is undefined in that case).
    """
    inner = np.dot(query_embed, candidate_embed)
    query_length = np.linalg.norm(query_embed)
    candidate_length = np.linalg.norm(candidate_embed)

    if query_length != 0 and candidate_length != 0:
        return inner / (query_length * candidate_length)
    return 0

In [45]:
def cosine_similarity(tf_query, tf_candidate, use_query_embed, use_candidate_embed):
    """Equal-weight blend of TF-IDF cosine similarity and USE cosine similarity.

    NOTE: this definition shadows the ``cosine_similarity`` imported from
    ``sklearn.metrics.pairwise`` at the top of the notebook; after this cell
    runs, the name refers to this function.

    Args:
        tf_query: sparse TF-IDF vector of the query as ``{word: weight}``.
        tf_candidate: sparse TF-IDF vector of the candidate as ``{word: weight}``.
        use_query_embed: dense embedding of the query.
        use_candidate_embed: dense embedding of the candidate.

    Returns:
        0 when either TF-IDF vector has zero norm (no usable lexical signal);
        otherwise ``0.5 * tfidf_cosine + 0.5 * use_cosine``. If either
        embedding has zero norm its cosine is undefined, so the USE half
        contributes 0 instead of propagating NaN.
    """
    # Squared L2 norms of the sparse TF-IDF vectors.
    norm_query = sum(value * value for value in tf_query.values())
    norm_candidate = sum(value * value for value in tf_candidate.values())
    if norm_query == 0 or norm_candidate == 0:
        return 0

    # Only words present in both vectors contribute to the dot product.
    dot_product = sum(
        weight * tf_candidate[word]
        for word, weight in tf_query.items()
        if word in tf_candidate
    )
    tfidf_similarity = dot_product / (norm_query**0.5 * norm_candidate**0.5)

    # np.linalg.norm already returns the (square-rooted) L2 norm.
    norm_query_embed = np.linalg.norm(use_query_embed)
    norm_candidate_embed = np.linalg.norm(use_candidate_embed)
    if norm_query_embed == 0 or norm_candidate_embed == 0:
        use_similarity = 0
    else:
        dot_product_embed = np.dot(use_query_embed, use_candidate_embed)
        use_similarity = dot_product_embed / (norm_query_embed * norm_candidate_embed)

    # Equal weighting of the lexical and semantic signals.
    return 0.5 * tfidf_similarity + 0.5 * use_similarity

In [46]:
def precompute_tfidf(questions_file):
    """Build the TF-IDF vectors and USE embeddings for every titled question.

    Reads ``questions_file`` as CSV (latin-1, first row treated as a header),
    taking column 0 as the question id and column 5 as the title. Results are
    stored in the module-level ``idf``, ``tfidf_documents``,
    ``question_titles`` and ``use_embeddings``; nothing is returned.

    Any previous contents of those globals are discarded first, so calling
    this again (or after loading cached pickles) never mixes two corpora.

    Args:
        questions_file: path to the questions CSV.
    """
    global idf, tfidf_documents, question_titles, use_embeddings

    # idf is rebuilt from this file alone, so stale per-document entries from
    # an earlier run would be inconsistent with it.
    tfidf_documents.clear()
    question_titles.clear()
    use_embeddings.clear()

    term_frequencies = defaultdict(lambda: defaultdict(int))  # Term frequency per document
    df = defaultdict(int)  # Document frequency
    N = 0  # Total number of documents

    # newline='' is what the csv module expects, so quoted fields containing
    # line breaks are parsed correctly.
    with open(questions_file, 'r', encoding='latin-1', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; tolerate an empty file
        for row in reader:
            if len(row) < 6:
                continue  # Malformed/short row: there is no title column to index
            question_id = row[0]
            title = row[5]
            if not title:
                continue

            title = title.lower()
            N += 1
            question_titles[question_id] = title

            # Touching the entry registers the document even when every word
            # is a stop word, so it still gets an (empty) TF-IDF vector and
            # stays reachable through its USE embedding.
            doc_term_counts = term_frequencies[question_id]
            unique_words_in_doc = set()
            for word in title.split():
                if word not in stop_words:
                    doc_term_counts[word] += 1
                    unique_words_in_doc.add(word)

            for word in unique_words_in_doc:
                df[word] += 1

    # Smoothed IDF. The +1 damps rare terms; note that a word occurring in
    # N-1 documents gets weight 0 and one occurring in all N gets a slightly
    # negative weight.
    idf = {word: math.log(N / (1 + freq)) for word, freq in df.items()}

    # Precompute TF-IDF vectors for all documents (TF is the count divided by
    # the number of non-stop-word terms in the title).
    for question_id, term_freqs in term_frequencies.items():
        total_terms = sum(term_freqs.values())
        tfidf_documents[question_id] = {
            word: (freq / total_terms) * idf[word] for word, freq in term_freqs.items()
        }

    # One embedding per title; the model is called with a single-item list and
    # the first (only) row of its output is kept.
    for question_id, title in question_titles.items():
        use_embeddings[question_id] = np.array(use_model([title])[0])

In [47]:
def process_query_combined(query_question, answers_file, display_answer = False, alpha=0.7, top_k=10):
    """Rank the indexed questions against a query and print the best matches.

    Each indexed question is scored with a TF-IDF cosine similarity and a USE
    embedding cosine similarity. Both score sets are min-max normalised and
    blended as ``alpha * tfidf + (1 - alpha) * use``. The top results are
    printed; nothing is returned.

    Relies on the module-level ``idf``, ``tfidf_documents``,
    ``use_embeddings``, ``question_titles``, ``stop_words`` and ``use_model``.

    Args:
        query_question: free-text query.
        answers_file: path to the answers CSV (latin-1, header row). Column 3
            is used as the question id and column 5 as the answer text. It is
            only read when ``display_answer`` is true.
        display_answer: when true, also print an answer for each result that
            has one. If several rows share a question id, the last one wins.
        alpha: weight of the TF-IDF score in the blend (USE gets ``1 - alpha``).
        top_k: number of results to print.
    """
    query_term_frequency = defaultdict(int)
    for term in query_question.lower().split():
        if term not in stop_words:
            query_term_frequency[term] += 1

    # Words unknown to the index get an IDF of 0 and therefore no weight.
    total_query_terms = sum(query_term_frequency.values())
    tf_query = {
        term: (freq / total_query_terms) * idf.get(term, 0)
        for term, freq in query_term_frequency.items()
    }
    # The query norm is the same for every candidate, so compute it once.
    norm_query = sum(value**2 for value in tf_query.values())**0.5

    # Get USE embedding for query
    query_embedding = np.array(use_model([query_question])[0])

    # Compute similarities
    tfidf_similarities = {}
    use_similarities = {}
    for question_id, tfidf in tfidf_documents.items():
        # TF-IDF similarity; the epsilon keeps the division defined when
        # either vector is all zeros.
        dot_product = sum(weight * tfidf.get(word, 0) for word, weight in tf_query.items())
        norm_candidate = sum(value**2 for value in tfidf.values())**0.5
        tfidf_similarities[question_id] = dot_product / (norm_query * norm_candidate + 1e-9)

        # USE similarity
        use_similarities[question_id] = cosine_similarity_use(
            query_embedding, use_embeddings[question_id]
        )

    # Normalize scores so both signals share the [0, 1] scale before blending
    tfidf_similarity = normalize_scores(tfidf_similarities)
    use_similarity = normalize_scores(use_similarities)

    # Combine scores
    combined_scores = {
        question_id: alpha * tfidf_similarity[question_id] + (1 - alpha) * use_similarity[question_id]
        for question_id in tfidf_similarity
    }

    top_indices = sorted(combined_scores, key=combined_scores.get, reverse=True)[:top_k]

    print("Top questions and their corresponding answers:")

    # Parsing the answers file is only worthwhile when answers will be shown.
    answer_dict = {}
    if display_answer:
        with open(answers_file, 'r', encoding='latin-1', newline='') as file:
            reader = csv.reader(file)
            next(reader, None)  # Skip the header row; tolerate an empty file
            for row in reader:
                if len(row) >= 6:  # Ignore malformed/short rows
                    answer_dict[row[3]] = row[5]

    for rank, question_id in enumerate(top_indices, start=1):
        print(f"\nRank: {rank}")
        print(f"Question ID: {question_id}")
        print(f"Title: {question_titles[question_id]}")
        print(f"Combined Similarity: {combined_scores[question_id]:.4f}")
        if display_answer and question_id in answer_dict:
            print(f"Answer: {answer_dict[question_id]}")

In [48]:
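# One-time index build: uncomment to recompute the TF-IDF vectors and USE
# embeddings from questions_file instead of using the cached pickles.
# precompute_tfidf(questions_file)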

In [50]:
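# Uncomment after running precompute_tfidf() to save the freshly built index
# to the pickle files that the loading cell reads.
# with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\HYBRID\idf.pkl', 'wb') as f: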
#     pickle.dump(idf, f)
# with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\HYBRID\tfidf_documents.pkl', 'wb') as f:
#     pickle.dump(tfidf_documents, f)
# with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\HYBRID\question_titles.pkl', 'wb') as f:
#     pickle.dump(question_titles, f)
# with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\HYBRID\use_embeddings.pkl', 'wb') as f:
#     pickle.dump(use_embeddings, f)

In [51]:
# Demo: rank the indexed questions against a Python how-to query (answers hidden).
query_question = "how to write for loops in python?"
process_query_combined(query_question=query_question, answers_file=answers_file)

Top questions and their corresponding answers:

Rank: 1
Question ID: 23682170
Title: using for loops
Combined Similarity: 0.9221

Rank: 2
Question ID: 2874270
Title: how to write this snippet in python?
Combined Similarity: 0.9025

Rank: 3
Question ID: 9864520
Title: perl - regex how to write this in python?
Combined Similarity: 0.8315

Rank: 4
Question ID: 10745670
Title: i am not sure how to use for loops to returns number of entries in table that are unique with python?
Combined Similarity: 0.8117

Rank: 5
Question ID: 29885220
Title: using objects in for of loops
Combined Similarity: 0.7869

Rank: 6
Question ID: 36254600
Title: how to loop a code in python?
Combined Similarity: 0.7779

Rank: 7
Question ID: 18069380
Title: write bitarray to file in python?
Combined Similarity: 0.7758

Rank: 8
Question ID: 22591270
Title: how to iterate in python?
Combined Similarity: 0.7616

Rank: 9
Question ID: 12222410
Title: arrays and for loops
Combined Similarity: 0.7573

Rank: 10
Question ID: 

In [52]:
# Demo: rank the indexed questions against an installation query (answers hidden).
query_question = "how to install python"
process_query_combined(query_question=query_question, answers_file=answers_file)

Top questions and their corresponding answers:

Rank: 1
Question ID: 25467300
Title: why can't install python for android?
Combined Similarity: 0.9469

Rank: 2
Question ID: 4329330
Title: how to install a python package to windows?
Combined Similarity: 0.9035

Rank: 3
Question ID: 9146320
Title: install python mysqldb to python3 not python
Combined Similarity: 0.8742

Rank: 4
Question ID: 31050040
Title: install windows drivers using python
Combined Similarity: 0.8055

Rank: 5
Question ID: 27263620
Title: how to install anaconda python for all users?
Combined Similarity: 0.7850

Rank: 6
Question ID: 20622860
Title: cannot install python bottle on linux
Combined Similarity: 0.7835

Rank: 7
Question ID: 23428840
Title: install python packages on shared host
Combined Similarity: 0.7717

Rank: 8
Question ID: 28293270
Title: python api: c extension install error
Combined Similarity: 0.7662

Rank: 9
Question ID: 29996090
Title: getting python to wait for apt-get install
Combined Similarity: 

In [53]:
# Demo: a mixed-case query with no question wording (answers hidden).
query_question = "Building a search engine"
process_query_combined(query_question=query_question, answers_file=answers_file)

Top questions and their corresponding answers:

Rank: 1
Question ID: 20989620
Title: content search engine
Combined Similarity: 0.9460

Rank: 2
Question ID: 6915280
Title: php search engine for mysql
Combined Similarity: 0.8806

Rank: 3
Question ID: 5612580
Title: page source search engine
Combined Similarity: 0.8806

Rank: 4
Question ID: 35133590
Title: building sql query for search string
Combined Similarity: 0.8401

Rank: 5
Question ID: 19970840
Title: creating a custom search engine with asp.net
Combined Similarity: 0.8293

Rank: 6
Question ID: 17892260
Title: simple php/sql search engine
Combined Similarity: 0.7624

Rank: 7
Question ID: 3594390
Title: creating custom search webpage using google engine
Combined Similarity: 0.7528

Rank: 8
Question ID: 17222810
Title: complex google app engine search
Combined Similarity: 0.7509

Rank: 9
Question ID: 33953250
Title: search engine based on google spreadsheet
Combined Similarity: 0.7462

Rank: 10
Question ID: 33994730
Title: create cus

In [54]:
# Demo: a short keyword-style query (answers hidden).
query_question = "Top linux commands"
process_query_combined(query_question=query_question, answers_file=answers_file)

Top questions and their corresponding answers:

Rank: 1
Question ID: 11527720
Title: facl commands in linux
Combined Similarity: 0.9704

Rank: 2
Question ID: 39302440
Title: pipe output of a binary to linux commands
Combined Similarity: 0.9525

Rank: 3
Question ID: 6880520
Title: sending commands from android phone to linux desktop
Combined Similarity: 0.8944

Rank: 4
Question ID: 6311240
Title: understanding "intercepting" of commands in linux
Combined Similarity: 0.8656

Rank: 5
Question ID: 37729690
Title: how to run all the commands before last occurrence of specific command in linux
Combined Similarity: 0.8512

Rank: 6
Question ID: 31134730
Title: commands in java console
Combined Similarity: 0.7938

Rank: 7
Question ID: 11966010
Title: loop over sql commands in a file
Combined Similarity: 0.7746

Rank: 8
Question ID: 20115360
Title: continuous execution of commands in linux using ganymed ssh
Combined Similarity: 0.7507

Rank: 9
Question ID: 29518140
Title: what is the difference 