In [2]:
import csv
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
import numpy as np
import pickle

In [3]:
# Locations of the two input CSV files. answers_file is passed to process_query
# in the example cells; questions_file is the argument of the (currently
# disabled) precompute_tfidf call.
# NOTE(review): these are absolute, machine-specific paths - adjust them before
# running the notebook anywhere else.
questions_file = r"R:\Study\5th Sem\Information Retrieval\IR PROJECT\Questions_sample.csv"
answers_file = r"R:\Study\5th Sem\Information Retrieval\IR PROJECT\Answers_sample.csv"

In [4]:
# Stop words: tokens that are skipped when titles and queries are turned into
# term counts. Kept as a set; the words are listed as plain whitespace-separated
# text and split into individual entries.
stop_words = set("""
a about above after again against all am an and
any are aren't as at be because been before being
below between both but by can could couldn't did
didn't do does doesn't doing don't down during each
few for from further had hadn't has hasn't have
haven't having he he's her here here's hers herself
him himself his how i i'm if in into is isn't
it it's its itself just ll m ma me might mightn't
more most must mustn't my myself needn't no nor
not now o of off on once only or other our
ours ourselves out over own re s same shan't she
she's should should've so some such t than that
that's the their theirs them themselves then there
there's these they they're this those through to too
under until up ve very was wasn't we we're were
weren't what what's when where which while who who's
whom why will with won't would wouldn't you you're
your yours yourself yourselves
""".split())

In [5]:
# Module-level state shared by precompute_tfidf (writer) and process_query (reader).
# Each starts out empty; the following cell rebinds all three to objects loaded
# from pickle files.
idf = {}  # term -> IDF weight (as built by precompute_tfidf)
tfidf_documents = {}  # question id -> {term: TF-IDF weight}
question_titles = {}  # question id -> lower-cased title text

In [6]:
# Restore previously saved state instead of recomputing it: each file is
# unpickled and bound to the matching module-level name.
# WARNING: pickle.load can execute arbitrary code embedded in the file - only
# load pickle files that you created yourself or otherwise fully trust.
# NOTE(review): absolute, machine-specific paths; the cell fails with
# FileNotFoundError if the files are not present at exactly these locations.
with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\TF-IDF\idf.pkl', 'rb') as f:
    idf = pickle.load(f)
with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\TF-IDF\tfidf_documents.pkl', 'rb') as f:
    tfidf_documents = pickle.load(f)
with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\TF-IDF\question_titles.pkl', 'rb') as f:
    question_titles = pickle.load(f)

In [7]:
# Function to collect per-question term statistics and precompute TF-IDF
def precompute_tfidf(questions_file):
    """Rebuild ``idf``, ``tfidf_documents`` and ``question_titles`` from a CSV file.

    The file is read as latin-1 encoded CSV. The first row is always skipped
    (treated as a header). For every other row, column 0 is used as the
    question id and column 5 as the title; rows with fewer than six columns or
    an empty title are ignored. Titles are lower-cased and split on whitespace
    (punctuation stays attached to tokens), and tokens found in ``stop_words``
    are dropped.

    The three module-level dictionaries are replaced wholesale, so entries left
    over from an earlier run or from a pickle load do not survive a rebuild:

    * ``idf``: term -> ``log(N / (1 + document_frequency))``
    * ``tfidf_documents``: question id -> {term: (count / total_count) * idf}
    * ``question_titles``: question id -> lower-cased title

    Questions whose title consists only of stop words get a title entry but no
    TF-IDF vector.

    Args:
        questions_file: Path of the questions CSV file.

    Returns:
        None. Results are published through the module-level dictionaries.

    Raises:
        OSError: If the file cannot be opened.
    """
    global idf, tfidf_documents, question_titles
    term_frequencies = defaultdict(lambda: defaultdict(int))  # Term counts per question
    df = defaultdict(int)  # Number of titles each term occurs in
    titles = {}  # Collected locally so shared state is only touched after a full read
    N = 0  # Number of titles that were indexed

    with open(questions_file, 'r', encoding='latin-1') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the first row; the default keeps an empty file from raising
        for row in reader:
            if len(row) < 6:
                continue  # Malformed/short row: no title column to read
            question_id = row[0]
            title = row[5]
            if not title:
                continue  # Skip empty titles

            title = title.lower()  # Normalize text
            N += 1
            titles[question_id] = title

            # Count non-stop-word tokens and remember which terms this title contains
            unique_words_in_doc = set()
            for word in title.split():
                if word not in stop_words:
                    term_frequencies[question_id][word] += 1
                    unique_words_in_doc.add(word)

            # Each title contributes at most once to a term's document frequency
            for word in unique_words_in_doc:
                df[word] += 1

    # freq is always >= 1 here, so the "+ 1" is smoothing rather than a guard
    # against division by zero. As a consequence a term that occurs in N - 1
    # titles gets weight 0 and a term that occurs in every title gets a
    # slightly negative weight. The formula is kept unchanged so that results
    # stay comparable with previously saved data.
    idf = {word: math.log(N / (1 + freq)) for word, freq in df.items()}

    # Replace (not merge into) the shared dictionaries so they match the new idf
    question_titles.clear()
    question_titles.update(titles)
    tfidf_documents.clear()
    for question_id, term_freqs in term_frequencies.items():
        total_terms = sum(term_freqs.values())
        tfidf_documents[question_id] = {
            word: (freq / total_terms) * idf[word] for word, freq in term_freqs.items()
        }

# Cosine similarity between two sparse term-weight vectors
def cosine_similarity(tf_query, tf_candidate):
    """Return the cosine similarity of two sparse vectors stored as dicts.

    Both arguments map a term to a numeric weight; terms missing from a dict
    count as weight 0. The result is the dot product over the shared terms
    divided by the product of the two Euclidean norms.

    Note: defining this function rebinds the name ``cosine_similarity`` that is
    imported from ``sklearn.metrics.pairwise`` at the top of the file.

    Args:
        tf_query: Mapping of term -> weight for the query.
        tf_candidate: Mapping of term -> weight for the candidate document.

    Returns:
        The similarity value, or the integer 0 when either vector has zero
        length (which also covers empty dicts).
    """
    squared_len_query = sum(weight * weight for weight in tf_query.values())
    squared_len_candidate = sum(weight * weight for weight in tf_candidate.values())

    # A zero-length vector has no direction; report "no similarity" instead of dividing by zero
    if squared_len_query == 0 or squared_len_candidate == 0:
        return 0

    shared_product = sum(
        tf_query[term] * tf_candidate[term]
        for term in tf_query
        if term in tf_candidate
    )
    return shared_product / (squared_len_query**0.5 * squared_len_candidate**0.5)

def process_query(query_question, answers_file, display_tfidf=False, display_similarity=False, display_answer=False):
    """Rank the indexed questions against a free-text query and print the top ten.

    The query is lower-cased, split on whitespace and stripped of ``stop_words``.
    Its TF-IDF vector is built from the module-level ``idf`` (terms that are not
    in ``idf`` get weight 0) and compared with every vector in
    ``tfidf_documents`` using ``cosine_similarity``. The ten highest-scoring
    question ids are printed with their titles from ``question_titles``.

    Args:
        query_question: The query text.
        answers_file: Path of the answers CSV (latin-1, first row skipped).
            Column 3 is matched against the question id and column 5 is printed
            as the answer text. The file is only opened when ``display_answer``
            is true.
        display_tfidf: Also print each hit's TF-IDF weights, highest first.
        display_similarity: Also print each hit's cosine similarity.
        display_answer: Also print an answer for each hit that has one. When
            several rows refer to the same question, the last one in the file
            is the one shown.

    Returns:
        None. All results are written to standard output.

    Raises:
        OSError: If ``display_answer`` is true and ``answers_file`` cannot be opened.
        KeyError: If a ranked question id has no entry in ``question_titles``.
    """
    # Count the query's non-stop-word tokens
    query_term_frequency = defaultdict(int)
    for term in query_question.lower().split():
        if term not in stop_words:
            query_term_frequency[term] += 1

    # Normalize the counts and weight them by IDF (unknown terms weigh 0)
    total_query_terms = sum(query_term_frequency.values())
    tf_query = {
        term: (freq / total_query_terms) * idf.get(term, 0)
        for term, freq in query_term_frequency.items()
    }

    # With no weighted term every similarity is 0 and the "top ten" would just
    # be the first ten stored questions, so report that instead of ranking.
    if not any(tf_query.values()):
        print("Query has no terms with a non-zero TF-IDF weight; nothing to rank.")
        return

    # Cosine similarity of the query against every precomputed document vector
    similarities = {
        question_id: cosine_similarity(tf_query, tfidf)
        for question_id, tfidf in tfidf_documents.items()
    }

    # Get the top 10 question IDs based on similarity
    top_indices = sorted(similarities, key=similarities.get, reverse=True)[:10]

    print("Top questions and their corresponding answers:")

    # Answers are only needed when they will be shown, and only for the ranked
    # questions - avoid reading and holding the whole file otherwise.
    answer_dict = {}
    if display_answer:
        wanted_ids = set(top_indices)
        with open(answers_file, 'r', encoding='latin-1') as file:
            reader = csv.reader(file)
            next(reader, None)  # Skip the first row; the default keeps an empty file from raising
            for row in reader:
                # Short rows have no answer column; later rows overwrite earlier ones
                if len(row) > 5 and row[3] in wanted_ids:
                    answer_dict[row[3]] = row[5]

    # Display the ranked questions with the requested extra details
    for rank, question_id in enumerate(top_indices, start=1):
        print(f"\nRank: {rank}")
        print(f"Question ID: {question_id}")
        print(f"Title: {question_titles[question_id]}")

        if display_tfidf:
            print("TF-IDF Scores for this question:")
            tfidf_scores = tfidf_documents[question_id]
            for term, score in sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True):
                print(f"  {term}: {score:.4f}")

        if display_similarity:
            print(f"Cosine Similarity: {similarities[question_id]:.4f}")

        if display_answer and question_id in answer_dict:
            print(f"Answer: {answer_dict[question_id]}")

In [8]:
# The rebuild call below is wrapped in a string literal, so it is NOT executed;
# the cell merely evaluates to that string. Remove the triple quotes to
# recompute idf / tfidf_documents / question_titles from questions_file.
"""
precompute_tfidf(questions_file)
"""

'\nprecompute_tfidf(questions_file)\n'

In [13]:
# Disabled on purpose: when uncommented, this writes idf, tfidf_documents and
# question_titles to the same three pickle files that the loading cell reads.
# Uncomment it only after running precompute_tfidf, to refresh the saved files.
# with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\TF-IDF\idf.pkl', 'wb') as f:
#     pickle.dump(idf, f)
# with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\TF-IDF\tfidf_documents.pkl', 'wb') as f:
#     pickle.dump(tfidf_documents, f)
# with open(r'R:\Study\5th Sem\Information Retrieval\IR PROJECT\Saved Files\TF-IDF\question_titles.pkl', 'wb') as f:
#     pickle.dump(question_titles, f)

In [25]:
# Sample search: show similarity scores, hide per-term weights and answers.
# (Switch display_tfidf to True to also list each hit's TF-IDF weights.)
query_question = "how to write for loops in python?"
process_query(
    query_question,
    answers_file,
    display_tfidf=False,
    display_similarity=True,
    display_answer=False,
)

Top questions and their corresponding answers:

Rank: 1
Question ID: 23682170
Title: using for loops
Cosine Similarity: 0.6093

Rank: 2
Question ID: 2874270
Title: how to write this snippet in python?
Cosine Similarity: 0.5354

Rank: 3
Question ID: 9864520
Title: perl - regex how to write this in python?
Cosine Similarity: 0.5319

Rank: 4
Question ID: 29885220
Title: using objects in for of loops
Cosine Similarity: 0.5071

Rank: 5
Question ID: 12222410
Title: arrays and for loops
Cosine Similarity: 0.5002

Rank: 6
Question ID: 13022350
Title: for loops and arrays
Cosine Similarity: 0.5002

Rank: 7
Question ID: 10745670
Title: i am not sure how to use for loops to returns number of entries in table that are unique with python?
Cosine Similarity: 0.4755

Rank: 8
Question ID: 7936410
Title: loops and arrays in php
Cosine Similarity: 0.4658

Rank: 9
Question ID: 1349900
Title: multiple php while loops using the same query
Cosine Similarity: 0.4565

Rank: 10
Question ID: 18069380
Title: wri

In [19]:
# Sample search: show similarity scores and answers, hide per-term weights.
# (Switch display_tfidf to True to also list each hit's TF-IDF weights.)
query_question = "java io error"
process_query(
    query_question,
    answers_file,
    display_tfidf=False,
    display_similarity=True,
    display_answer=True,
)

Top questions and their corresponding answers:

Rank: 1
Question ID: 9838900
Title: java file io exceptions
Cosine Similarity: 0.6940

Rank: 2
Question ID: 25099640
Title: non-blocking io vs async io and implementation in java
Cosine Similarity: 0.6904

Rank: 3
Question ID: 5467890
Title: io in where clause
Cosine Similarity: 0.6455

Rank: 4
Question ID: 8100280
Title: java character io between java executions
Cosine Similarity: 0.6105

Rank: 5
Question ID: 39811070
Title: c - file io read and write error
Cosine Similarity: 0.6024

Rank: 6
Question ID: 4950390
Title: asp.net file io account
Cosine Similarity: 0.5640

Rank: 7
Question ID: 17884730
Title: io operation failure
Cosine Similarity: 0.5324
Answer: <p>I am just adding kwatford`s comment as answer in here.  What you need to change is </p>

<pre><code>filename = os.path.join(getMediaPath(),aFile)

newfile = os.path.join(getMediaPath() , 'happyEdited.txt')
</code></pre>


Rank: 8
Question ID: 22165860
Title: blocking io in nodejs

In [20]:
# Sample search: show answers only, hide similarity scores and per-term weights.
# (Switch display_tfidf / display_similarity to True for more detail per hit.)
query_question = "building a search engine"
process_query(
    query_question,
    answers_file,
    display_tfidf=False,
    display_similarity=False,
    display_answer=True,
)

Top questions and their corresponding answers:

Rank: 1
Question ID: 20989620
Title: content search engine

Rank: 2
Question ID: 6915280
Title: php search engine for mysql

Rank: 3
Question ID: 35133590
Title: building sql query for search string
Answer: <p>Setting the SqlCommand.CommandText after you have passed it to SqlDataAdapter doesn't change the text stored in the adapter, moving the setting of the SqlCommand.Commandtext before the creation of the adapter seems to be a good fix....</p>

<pre><code>cmd.CommandText = Sql
Dim adapter = New SqlDataAdapter(cmd.CommandText, con.ConnectionString)
Dim dt As New DataTable()
adapter.Fill(dt)
</code></pre>

<p>but, wait, you still have a problem. When you pass a string (as the CommandText) to the adapter constructor it builds internally another SqlCommand using that string. This internal command has its Parameter collection empty. It doesn't know anything of the parameter collection created externally by your code.<br>
So, the real fix is 