### AIRIW ASSIGNMENT (UE20CS332) - 1 : SEARCH ENGINE
### PES1UG20CS198, CS235, CS320, CS362

<img src="https://w7.pngwing.com/pngs/212/16/png-transparent-search-engine-optimization-search-engine-marketing-pay-per-click-seo-text-service-engineering.png" alt="SE" width="500" height="300">


#### IMPORTING LIBRARIES

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
import numpy as np
import re
import math
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

#### TASK 1 : Preprocessing of raw data
<img src="https://media.springernature.com/lw685/springer-static/image/chp%3A10.1007%2F978-981-16-9995-5_4/MediaObjects/523753_1_En_4_Fig4_HTML.png" alt="SE" width="500" height="300">


In [2]:
# Load the 13 .csv files and concatenate them into a single DataFrame
# Filter the DataFrame to include only the relevant columns

df = pd.DataFrame()
for i in range(1, 14):
    filename = f'Corpus/IRAhandle_tweets_{i}.csv'
    temp_df = pd.read_csv(filename, dtype = {
        'external_author_id': str,
        'alt_external_id': str,
        'tweet_id': str
    })
    df = pd.concat([df, temp_df])

df = df[['external_author_id', 'content', 'language', 'publish_date']]

# Filter for English tweets only
df = df[df['language'] == 'English']

# Storing original contents of the DataFrame to a CSV file
df.to_csv('raw_data.csv', index = False)

# Preprocessing
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

# Remove URLs, mentions, and hashtags from the tweet content
df['content'] = df['content'].apply(lambda x: str(x)) # convert non-string values to string
df['content'] = df['content'].apply(lambda x: re.sub(r"http\S+", "", x) if isinstance(x, str) else x) # remove URLs
df['content'] = df['content'].apply(lambda x: ' '.join([stemmer.stem(word.lower()) for word in x.split() if word.lower() not in stop_words]) if isinstance(x, str) else x) # lowercase, remove stop words, and stem
# Remove all non-alphanumeric characters
df['content'] = df['content'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]+", ' ', x))

# Convert to lowercase and remove stop words
df['content'] = df['content'].apply(lambda x: ' '.join([stemmer.stem(word.lower()) for word in str(x).split() if word.lower() not in stop_words]))

# Reset the index of the DataFrame
df = df.reset_index(drop = True)

# Save the preprocessed DataFrame as a CSV file
df.to_csv('preprocessed_data.csv', index = False)

# Preview the DataFrame
print(df.head())

   external_author_id                                            content  \
0  906000000000000000  sit democrat us senat trial corrupt bare heard...   
1  906000000000000000  marshawn lynch arriv game anti trump shirt jud...   
2  906000000000000000  daughter fallen navi sailor deliv power monolo...   
3  906000000000000000  presid trump dedic presid cup golf tournament ...   
4  906000000000000000     19 000 respect nation anthem standforouranthem   

  language     publish_date  
0  English  10/1/2017 19:58  
1  English  10/1/2017 22:43  
2  English  10/1/2017 22:50  
3  English  10/1/2017 23:52  
4  English   10/1/2017 2:13  


#### TASK 2A : Generate Inverted Index
<img src="https://image3.slideserve.com/5581457/inverted-index-construction-l.jpg" alt="SE" width="400" height="350">



In [3]:
# Create a defaultdict to store the inverted index
inverted_index = defaultdict(list)

# Create a mapping of document ids to content
doc_id_mapping = {}
for i, row in df.iterrows():
    doc_id = i
    content = row['content']
    doc_id_mapping[doc_id] = {'external_author_id': row['external_author_id'], 'publish_date': row['publish_date']}
    
    # Split the preprocessed content into terms and add to the inverted index
    for term in content.split():
        inverted_index[term].append(doc_id)

# Write all inverted index to a text file        
with open('inverted_index.txt', 'w') as f:
    for term, doc_ids in inverted_index.items():
        f.write(f'{term}: {doc_ids}\n')

# Preview the first 10 inverted index
i = 0
for term, doc_ids in inverted_index.items():
    print(f'{term}: {doc_ids}')
    i += 1
    if(i == 10):
        break

sit: [0, 78, 82, 1137, 3159, 3363, 4025, 4462, 5859, 6653, 7216, 7354, 7354, 7845, 8383, 8648, 10373, 10931, 11629, 11787, 14306, 15081, 15189, 15765, 16335, 16404, 16584, 16726, 18542, 19028, 19419, 19814, 20197, 21018, 21833, 22647, 23404, 24169, 24953, 25315, 25447, 26009, 26371, 26371, 26376, 26760, 26897, 27477, 28115, 28313, 29560, 29602, 29813, 29837, 29967, 29972, 30154, 31892, 32204, 32658, 32875, 32915, 33558, 33795, 33806, 34089, 34294, 34963, 35183, 35555, 38052, 38313, 44919, 44988, 45015, 45097, 45315, 45315, 45376, 46198, 46368, 46638, 47108, 47454, 47508, 47529, 47724, 48107, 48238, 48400, 48416, 48445, 49296, 49345, 49565, 49784, 49924, 50076, 50238, 50359, 50908, 51166, 51208, 52041, 52745, 53436, 53689, 53765, 53898, 54349, 54508, 54911, 55171, 55701, 55796, 56432, 56603, 56892, 57237, 59074, 59477, 59679, 60536, 60576, 60585, 60867, 61182, 61307, 61708, 61881, 61977, 62853, 63048, 63829, 64129, 64479, 64517, 64571, 64746, 65188, 65196, 65226, 65552, 65911, 66243, 66

#### TASK 2B : Boolean Model

In [4]:
corpus = df['content'].tolist()
corpus_raw = pd.read_csv('raw_data.csv')['content'].tolist()

# Define a function to tokenize and clean the text
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert text to lowercase
    return text.split()

# Define the Boolean model function
def boolean_model(query):
    # Pre-process the query
    query = clean_text(query)
    
    # Split query into terms
    if not query:
        return []
    terms = query
    
    # Find matching documents for each term
    results = []
    for term in terms:
        if term in inverted_index:
            results.append(inverted_index[term])
        else:
            results.append(set())
    
    # Combine the sets using Boolean operators
    combined_results = set()
    for i, term_result in enumerate(results):
        term_result = set(term_result) # convert list to set
        if i == 0:
            combined_results = term_result
        else:
            if terms[i-1] == 'and':
                combined_results = combined_results.intersection(term_result)
            elif terms[i-1] == 'or':
                combined_results = combined_results.union(term_result)
    
    # Get the documents matching all terms
    # matching_docs = [corpus[i] for i in combined_results]
    matching_docs = [corpus_raw[i] for i in combined_results]

    
    return matching_docs

# Test the Boolean model
query = "trump and russia"
results = boolean_model(query)
print(f"{len(results)} matching documents found:\n")
i = 0

# Printing top 25 docs with contents
for result in results:
    print(result + "\n")
    i += 1
    if(i == 25):
      break

3397 matching documents found:

Leading Conservatives Predicted This Last Week: Obama Is Behind Illegal Leaks on Trump and Russia https://t.co/LDk8u5RHSr

#chaWITCH HUNT : Russia Probe Includes Tenants From Trump Tower https://t.co/tSPCpp4vBT https://t.co/rt0Z1CJEJs

NEW VIDEO: Trump still hasn't answered the Impeachable question. Did he or his minions coordinate with Russia? https://t.co/aC9yDR8Zds

A Democratic Congressman Just Caught Trump In 2 Massive Russia Lies https://t.co/gDVlCgSgsV https://t.co/PUSNUhUVUU

This is the lie Donald Trump told- refuted by his own son- that makes me believe he is criminally tied to Russia. https://t.co/adzdvQmoRP https://t.co/uHzCXfzmx4

If Flynn goes, he’d be the 3rd Trump staffer fired for promising Russia that Trump is their guy. Carter Page and Paul Manafort did that too.

TRUMP JUST LIFTED SANCTIONS ON RUSSIA'S FSB - this is the story being buried by his childish attack on Arnold at National Prayer Breakfast.

https://t.co/evcecv0g8z  1. Save 

#### TASK 3 : Handling wild card and phrase queries
<img src="http://www.gmayor.com/Images_2007/imgBC.jpg" alt="SE" width="500" height="400">


In [5]:
# Define a function to handle wildcard queries
def handle_wildcard_query(query):
    pattern = query.replace('*', '.*')
    regex = re.compile(pattern)
    matching_terms = [term for term in inverted_index.keys() if regex.match(term)]
    doc_ids = set([doc_id for term in matching_terms for doc_id in inverted_index[term]])
    return doc_ids

# Define a function to handle phrase queries
def handle_phrase_query(query):
    query = re.sub(r"http\S+", "", query)  # Remove URLs
    query = re.sub(r'[^\w\s]', '', query)  # Remove punctuation
    query_terms = query.lower().split()
    phrase_docs = []
    for i in range(len(df)):
        doc = df.iloc[i]
        doc_text = doc['content']
        for pos in range(len(doc_text.split())):
            if doc_text.split()[pos] == query_terms[0]:
                match = True
                for j in range(1, len(query_terms)):
                    if pos+j >= len(doc_text.split()):
                        match = False
                        break
                    next_term = doc_text.split()[pos+j]
                    if not next_term == query_terms[j]:
                        match = False
                        break
                if match:
                    phrase_docs.append(i)
                    break
    return phrase_docs

# Define a function to calculate precision and recall
def calc_precision_recall(relevant_docs, retrieved_docs):
    tp = len(set(relevant_docs) & set(retrieved_docs))
    fp = len(retrieved_docs) - tp
    fn = len(relevant_docs) - tp
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    return precision, recall


# Example usage
def query_app(wq, pq):
    wildcard_query = wq
    phrase_query = pq
    wildcard_doc_ids = handle_wildcard_query(wildcard_query)
    phrase_doc_ids = handle_phrase_query(phrase_query)
    print(f'Wild card query: {wildcard_query}, matching doc ids: {wildcard_doc_ids}')
    print(f'Phrase query: {phrase_query}, matching doc ids: {phrase_doc_ids}')

def query_pr_app(wq, pq, relevant_docs):
    wildcard_query = wq
    phrase_query = pq
    wildcard_doc_ids = handle_wildcard_query(wildcard_query)
    phrase_doc_ids = handle_phrase_query(phrase_query)
    print(f'Wild card query: {wildcard_query}, matching doc ids: {wildcard_doc_ids}')
    print(f'Phrase query: {phrase_query}, matching doc ids: {phrase_doc_ids}')
    print('---')
    print('Evaluation:')
    print(f'Number of relevant documents: {len(relevant_docs)}')
    wildcard_precision, wildcard_recall = calc_precision_recall(relevant_docs, wildcard_doc_ids)
    print(f'Wild card query precision: {wildcard_precision}, recall: {wildcard_recall}')
    phrase_precision, phrase_recall = calc_precision_recall(relevant_docs, phrase_doc_ids)
    print(f'Phrase query precision: {phrase_precision}, recall: {phrase_recall}')

In [6]:
# Passing queries
query_app("trump*", "donald trump")
query_app("krishna*", "hare krishna")
query_app("russia*", "russia moscow")

Wild card query: trump*, matching doc ids: {524288, 1, 3, 5, 1572870, 524295, 8, 1048584, 12, 524300, 1572878, 1572879, 16, 524306, 20, 524309, 22, 2097176, 26, 524315, 1572891, 29, 524318, 2097185, 1048610, 35, 39, 1048615, 1048616, 1048617, 43, 1048620, 1572914, 52, 1572917, 1572919, 1572922, 61, 62, 63, 1048640, 1572928, 1572926, 67, 524359, 1572937, 1048651, 1572940, 1048653, 83, 1572947, 1572949, 1572952, 90, 1572957, 96, 1572960, 99, 100, 101, 524388, 103, 524391, 1048677, 524394, 1572967, 524396, 1572972, 1048686, 1572971, 2097261, 113, 524402, 524403, 1048691, 119, 120, 2097272, 1048698, 1048700, 1572988, 524414, 1572989, 1572990, 132, 133, 1572996, 1572997, 1572998, 1572999, 1573000, 2097285, 140, 1573009, 524434, 1048725, 150, 2097302, 153, 524441, 524443, 1573020, 524447, 160, 161, 524448, 1573025, 1573027, 165, 524453, 2097318, 524456, 2097314, 1048746, 173, 2097326, 2097327, 1573041, 1048754, 524467, 1048755, 2097330, 1573046, 2097331, 1573048, 1573042, 1048763, 524477, 15

In [7]:
# Sample queries for precision and recall :-
wildcard_query = 'trump*'
phrase_query = 'donald trump'

# Sample relevant documents
relevant_docs = [5, 198, 242, 262, 555, 1433, 1870, 2100, 2270, 2736, 2750, 4010, 5874, 5893, 6115, 6152, 6387, 6511, 6687, 6738, 6781, 6898, 6926, 6930, 7658, 7736, 7756, 7780, 8171, 8591, 8702, 9340, 11136, 11198, 11277, 11305, 11697, 11984, 12140, 12182, 12236, 12243, 12276, 12339, 12438, 12640, 12699, 13230, 13266, 13320, 13359,31, 32, 53, 54, 55, 56, 57, 58, 59, 151, 169, 170, 171, 180, 198, 239, 240, 241, 243, 246, 247, 248, 249, 253, 254, 255, 256, 257, 258, 259, 263, 264, 265, 274, 304, 310, 311, 312, 314, 327, 332, 337, 640, 655, 2173, 2174, 2176, 2178, 2180, 2181, 2224, 2225, 2227, 2228, 2232, 2270, 2271, 2274, 2334, 2343]

# Perform queries and evaluate
query_pr_app(wildcard_query, phrase_query, relevant_docs)

Wild card query: trump*, matching doc ids: {524288, 1, 3, 5, 1572870, 524295, 8, 1048584, 12, 524300, 1572878, 1572879, 16, 524306, 20, 524309, 22, 2097176, 26, 524315, 1572891, 29, 524318, 2097185, 1048610, 35, 39, 1048615, 1048616, 1048617, 43, 1048620, 1572914, 52, 1572917, 1572919, 1572922, 61, 62, 63, 1048640, 1572928, 1572926, 67, 524359, 1572937, 1048651, 1572940, 1048653, 83, 1572947, 1572949, 1572952, 90, 1572957, 96, 1572960, 99, 100, 101, 524388, 103, 524391, 1048677, 524394, 1572967, 524396, 1572972, 1048686, 1572971, 2097261, 113, 524402, 524403, 1048691, 119, 120, 2097272, 1048698, 1048700, 1572988, 524414, 1572989, 1572990, 132, 133, 1572996, 1572997, 1572998, 1572999, 1573000, 2097285, 140, 1573009, 524434, 1048725, 150, 2097302, 153, 524441, 524443, 1573020, 524447, 160, 161, 524448, 1573025, 1573027, 165, 524453, 2097318, 524456, 2097314, 1048746, 173, 2097326, 2097327, 1573041, 1048754, 524467, 1048755, 2097330, 1573046, 2097331, 1573048, 1573042, 1048763, 524477, 15

#### TASK 4 : Retrieve relevant text using similarity index
<img src="https://www.researchgate.net/publication/233758931/figure/fig4/AS:668495956164616@1536393264903/Illustration-of-Cosine-Similarity-scoreq-1-a-1-costh-6.png" alt="SE" width="300" height="300">


In [8]:
# Define the corpus
corpus = df['content'].tolist()
corpus_raw = pd.read_csv('raw_data.csv')['content'].tolist()

# Define a function to tokenize and clean the text
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert text to lowercase
    return text.split()
        
# Define a function to retrieve documents using the inverted index and cosine similarity
def retrieve_using_cosine_similarity(query, num_docs = 5):
    # Tokenize and clean the query
    query_tokens = clean_text(query)

    # Retrieve documents containing at least one query term
    candidate_doc_ids = set()
    for query_token in query_tokens:
        if query_token in inverted_index:
            candidate_doc_ids.update(inverted_index[query_token])

    # Calculate the cosine similarity between the query and candidate documents
    candidate_docs = [corpus[doc_id] for doc_id in candidate_doc_ids]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(candidate_docs)
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Sort the candidate documents by cosine similarity in descending order and get the top documents
    document_indices = cosine_similarities.argsort()[::-1][:num_docs]
    top_documents = [(candidate_docs[index], cosine_similarities[index]) for index in document_indices]

    # Calculate precision and recall
    relevant_docs = set(df[df['content'].str.contains(query)]['content'].index.tolist())
    retrieved_docs = set([corpus.index(doc[0]) for doc in top_documents])
    true_positives = len(relevant_docs.intersection(retrieved_docs))
    precision = true_positives / len(retrieved_docs)
    recall = true_positives / len(relevant_docs)

    # Add reason for the rank of each document
    print(f"Showing top {num_docs} documents that are most similar to the query '{query}':\n")
    for i, (text, cosine_sim) in enumerate(top_documents):
        doc_id = corpus.index(text)
        print(f"Rank {i+1} (Cosine Similarity: {cosine_sim:.4f}):")
        print(f"Document ID: {doc_id}")
        print(corpus_raw[doc_id])
        print("Reason: The document has a high cosine similarity score with the query terms.\n")

    return top_documents, precision, recall

In [9]:
# Example usage
query = 'hare krishna'
top_documents, precision, recall = retrieve_using_cosine_similarity(query, num_docs = 5)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

query = 'vladimir putin'
top_documents, precision, recall = retrieve_using_cosine_similarity(query, num_docs = 5)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Showing top 5 documents that are most similar to the query 'hare krishna':

Rank 1 (Cosine Similarity: 0.2016):
Document ID: 201770
Growing Hare Krishna congregation to open $3.6 million temple in Baltimore County https://t.co/cT0k9snZs6 https://t.co/pHEBRVcYlf
Reason: The document has a high cosine similarity score with the query terms.

Rank 2 (Cosine Similarity: 0.1677):
Document ID: 423479
When I re enter my body upon waking, the first two words out my mouth are mother goddess (Hare) & Krishna.
Reason: The document has a high cosine similarity score with the query terms.

Rank 3 (Cosine Similarity: 0.1567):
Document ID: 446493
Me And My VP Are On A Mission From God, Or Mohammed, Or Krishna ... Or Oprah. #RejectedPrezCampaignSlogans http://t.co/WMl1o32Shd
Reason: The document has a high cosine similarity score with the query terms.

Rank 4 (Cosine Similarity: 0.1516):
Document ID: 845461
.@SimpsonArnold1 SimpsonArnold @JoesSweetpea Krishna @ifanaureylian Sofeefee @bowiescheekbone bo

#### TASK 5 : Retrieve relevant text using liklelihood language model

<img src="https://www.researchgate.net/publication/261994473/figure/fig1/AS:669489288998917@1536630093464/The-probability-of-log-likelihood-ratio-conditional-under-each-class-The-two.png" alt="SE" width="500" height="300">


In [10]:
# Define the corpus
corpus = df['content'].tolist()
corpus_raw = pd.read_csv('raw_data.csv')['content'].tolist()
# Tokenize the query and compute the query likelihood
def log_likelihood(query, num_docs, relevant_doc_ids, q):
    query = re.sub(r"http\S+", "", query)  # Remove URLs
    query = re.sub(r'[^\w\s]', '', query)  # Remove punctuation
    query_tokens = query.lower().split()
    query_likelihood = {}
    for token in query_tokens:
        if token in query_likelihood:
            query_likelihood[token] += 1
        else:
            query_likelihood[token] = 1
    query_length = sum(query_likelihood.values())
    for token in query_likelihood:
        query_likelihood[token] = query_likelihood[token] / query_length

    # Retrieve the documents that contain any of the query tokens
    retrieved_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            retrieved_docs.update(inverted_index[token])

    # Compute the likelihood of each retrieved document
    doc_likelihoods = {}
    for doc_id in retrieved_docs:
        doc_tokens = corpus[doc_id].lower().split()
        doc_length = len(doc_tokens)
        likelihood = 0
        for token in query_likelihood:
            count = doc_tokens.count(token)
            token_likelihood = count / doc_length if count > 0 else 1 / (doc_length + 1)
            likelihood += math.log(token_likelihood) * query_likelihood[token]
        doc_likelihoods[doc_id] = likelihood

    # Rank the retrieved documents by their likelihood
    sorted_docs = sorted(doc_likelihoods.items(), key=lambda x: x[1], reverse=True)

    # Print the top 10 documents and their likelihoods
    
    for doc_id, likelihood in sorted_docs[:num_docs]:
        doc = corpus[doc_id]
        relevant = "relevant" if doc_id in relevant_doc_ids else "not relevant"
        print(f"Document ID: {doc_id}\nLikelihood: {likelihood:.4f}\n{corpus_raw[doc_id]}\n{relevant}\n")

    # Calculate precision and recall
    if(q == 1):
        retrieved_doc_ids = [doc_id for doc_id, _ in sorted_docs[:num_docs]]
        retrieved_docs = [corpus[doc_id] for doc_id in retrieved_doc_ids]
        relevant_retrieved = set(retrieved_doc_ids).intersection(set(relevant_doc_ids))
        precision = len(relevant_retrieved) / num_docs
        recall = len(relevant_retrieved) / len(relevant_doc_ids)
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")

In [11]:
# Define the query
query = "harekrishna"
log_likelihood(query, 5, [740353, 1593358, 814038, 836643, 840009, 840013, 28564, 567158, 934921, 1078196, 1380963, 1498746, 2114497], 1)
print("----------------------------------------------------------------------")
query = "vladimir putin"
log_likelihood(query, 5, [306, 307, 481, 481, 482, 487, 495, 498, 513, 516, 518,711642, 714984, 715043, 715760, 716572, 716913, 716945, 717028, 442682, 442719, 442730, 442826, 443732, 443732, 443997, 444147, 444515, 444517, 16757, 16769, 17596, 25676, 25689, 25896, 25904, 25912, 26006, 26406, 27105, 28081, 28830], 1)
print("----------------------------------------------------------------------")
query = "petrol price"
log_likelihood(query, 5, [], 0)

Document ID: 1593358
Likelihood: -1.9459
Jamaica and Haiti especially, we haven't forgotten you. #Prayers #HareKrishna #108Plus
relevant

Document ID: 740353
Likelihood: -2.0794
When you touch ground and the Bhagavad Gita touches a heart. #HareKrishna #Everywhere https://t.co/S3CHRAYGvZ
relevant

Precision: 0.4000
Recall: 0.1538
----------------------------------------------------------------------
Document ID: 961860
Likelihood: -0.3466
By Putin? https://t.co/Rt82GWlKtJ
not relevant

Document ID: 1300311
Likelihood: -0.3466
Сейчас в Омске  #Putin https://t.co/X7tAHMAuhj
not relevant

Document ID: 245945
Likelihood: -0.6931
Vladimir Putin. https://t.co/lHbXvTEBUX
not relevant

Document ID: 311547
Likelihood: -0.8959
Putin during the debate https://t.co/G7515hBTwv
not relevant

Document ID: 1017681
Likelihood: -0.8959
Trump to Putin:
not relevant

Precision: 0.0000
Recall: 0.0000
----------------------------------------------------------------------
Document ID: 1167943
Likelihood: -0.3

#### TASK 6 : Ranking of retrieved documents, ALREADY IMPLEMENTED IN ```TASK 4 & 5```
<img src="https://www.researchgate.net/publication/261392162/figure/fig3/AS:614164707962894@1523439685489/The-pseudo-relevance-feedback-process.png" alt="SE" width="650" height="300">


#### TASK 7 : Advanced search - Relevance feedback and reranking of results

<img src="https://d3i71xaburhd42.cloudfront.net/4ff1cdace751ec5109ef737273fa17e7c6673ab1/2-Figure1-1.png" alt="SE" width="500" height="300">


In [12]:
# Define the corpus
corpus = df['content'].tolist()
corpus_raw = pd.read_csv('raw_data.csv')['content'].tolist()
# Create a TF-IDF vectorizer and transform the corpus
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Define a function to retrieve documents using cosine similarity with relevance feedback
def retrieve_using_cosine_similarity_with_feedback(query, num_docs = 5, alpha = 1, beta = 0.75, gamma = 0.15):
    # Transform the query using the vectorizer
    query_vector = vectorizer.transform([query])

    # Calculate the cosine similarity between the query and all documents in the corpus
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Sort the documents by cosine similarity in descending order and get the top documents
    document_indices = cosine_similarities.argsort()[::-1][:num_docs]
    top_documents = [(corpus_raw[index], cosine_similarities[index]) for index in document_indices]

    # Print the top documents
    print(f"Showing top {num_docs} documents that are most similar to the query '{query}':\n")
    for i, (text, cosine_sim) in enumerate(top_documents):
        print(f"Rank {i+1} (Cosine Similarity: {cosine_sim:.4f}):")
        print(text)
        print("Reason: The document has a high cosine similarity score with the query.\n")

    # Get feedback from the user on the relevance of the search results
    relevant_doc_indices = []
    non_relevant_doc_indices = []
    for i, (text, cosine_sim) in enumerate(top_documents):
        feedback = input(f"Is document {i+1} relevant? (y/n): ")
        if feedback.lower() == 'y':
            relevant_doc_indices.append(document_indices[i])
            print(f"Document {i+1} was marked as relevant.")
        else:
            non_relevant_doc_indices.append(document_indices[i])
            print(f"Document {i+1} was not marked as relevant.")

    # Calculate the new query vector using the Rocchio algorithm
    relevant_doc_vectors = tfidf_matrix[relevant_doc_indices]
    non_relevant_doc_vectors = tfidf_matrix[non_relevant_doc_indices]
    new_query_vector = alpha * query_vector + beta * relevant_doc_vectors.mean(axis=0) - gamma * non_relevant_doc_vectors.mean(axis=0)

    # Calculate the cosine similarity between the new query vector and all documents in the corpus
    cosine_similarities = cosine_similarity(np.asarray(new_query_vector), tfidf_matrix).flatten()

    # Sort the documents by cosine similarity in descending order and get the top documents
    document_indices = cosine_similarities.argsort()[::-1][:num_docs]
    top_documents = [(corpus_raw[index], cosine_similarities[index]) for index in document_indices]

    # Print the reranked top documents
    print(f"\nShowing top {num_docs} reranked documents that are most similar to the query '{query}':\n")
    for i, (text, cosine_sim) in enumerate(top_documents):
        print(f"Rank {i+1} (Cosine Similarity: {cosine_sim:.4f}):")
        print(text)
        print("Reason: The document has a high cosine similarity score with the reranked query.\n")

retrieve_using_cosine_similarity_with_feedback("donald trump")


Showing top 5 documents that are most similar to the query 'donald trump':

Rank 1 (Cosine Similarity: 1.0000):
DONald Trump
Reason: The document has a high cosine similarity score with the query.

Rank 2 (Cosine Similarity: 1.0000):
or Donald Trump https://t.co/h5R6WGnykE
Reason: The document has a high cosine similarity score with the query.

Rank 3 (Cosine Similarity: 1.0000):
Donald Trump by Donald Trump
Reason: The document has a high cosine similarity score with the query.

Rank 4 (Cosine Similarity: 0.8506):
The Donald
Reason: The document has a high cosine similarity score with the query.

Rank 5 (Cosine Similarity: 0.8380):
Donald Trump Threatens To Sue Donald Trump For Being Donald Trump https://t.co/e0Ml3GbFYv #donaldtrump https://t.co/XADkm5WOp7
Reason: The document has a high cosine similarity score with the query.

Document 1 was not marked as relevant.
Document 2 was marked as relevant.
Document 3 was not marked as relevant.
Document 4 was marked as relevant.
Document 5 

In [13]:

# Define the corpus
corpus = df['content'].tolist()

# Create a TF-IDF vectorizer and transform the corpus
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Define a function to retrieve documents using cosine similarity with relevance feedback
def retrieve_using_cosine_similarity_with_feedback(query, num_docs = 5, alpha = 1, beta = 0.75, gamma = 0.15):
    # Transform the query using the vectorizer
    query_vector = vectorizer.transform([query])

    # Calculate the cosine similarity between the query and all documents in the corpus
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Sort the documents by cosine similarity in descending order and get the top documents
    document_indices = cosine_similarities.argsort()[::-1][:num_docs]
    top_documents = [(corpus[index], cosine_similarities[index]) for index in document_indices]

    # Print the top documents
    print(f"Showing top {num_docs} documents that are most similar to the query '{query}':\n")
    for i, (text, cosine_sim) in enumerate(top_documents):
        print(f"Rank {i+1} (Cosine Similarity: {cosine_sim:.4f}):")
        print(text)
        print("Reason: The document has a high cosine similarity score with the query.\n")

    # Get feedback from the user on the relevance of the search results
    relevant_doc_indices = []
    non_relevant_doc_indices = []
    for i, (text, cosine_sim) in enumerate(top_documents):
        feedback = input(f"Is document {i+1} relevant? (y/n): ")
        if feedback.lower() == 'y':
            relevant_doc_indices.append(document_indices[i])
            print(f"Document {i+1} was marked as relevant.")
        else:
            non_relevant_doc_indices.append(document_indices[i])
            print(f"Document {i+1} was not marked as relevant.")

    # Calculate the new query vector using the Rocchio algorithm
    relevant_doc_vectors = tfidf_matrix[relevant_doc_indices]
    non_relevant_doc_vectors = tfidf_matrix[non_relevant_doc_indices]
    new_query_vector = alpha * query_vector + beta * relevant_doc_vectors.mean(axis=0) - gamma * non_relevant_doc_vectors.mean(axis=0)

    # Convert the new query vector to a numpy array
    new_query_array = np.asarray(new_query_vector)

    # Calculate the cosine similarity between the new query vector and all documents in the corpus
    cosine_similarities = cosine_similarity(new_query_array, tfidf_matrix).flatten()

    # Sort the documents by cosine similarity in descending order and get the top documents
    document_indices = cosine_similarities.argsort()[::-1][:num_docs]
    top_documents = [(corpus[index], cosine_similarities[index]) for index in document_indices]

    # Print the reranked top documents
    print(f"\nShowing top {num_docs} reranked documents that are most similar to the query '{query}':\n")
    for i, (text, cosine_sim) in enumerate(top_documents):
        print(f"Rank {i+1} (Cosine Similarity: {cosine_sim:.4f}):")
        print(text)
        print("Reason: The document has a high cosine similarity score with the reranked query.\n")

retrieve_using_cosine_similarity_with_feedback("donald trump")


Showing top 5 documents that are most similar to the query 'donald trump':

Rank 1 (Cosine Similarity: 1.0000):
donald trump
Reason: The document has a high cosine similarity score with the query.

Rank 2 (Cosine Similarity: 1.0000):
donald trump
Reason: The document has a high cosine similarity score with the query.

Rank 3 (Cosine Similarity: 1.0000):
donald trump donald trump
Reason: The document has a high cosine similarity score with the query.

Rank 4 (Cosine Similarity: 0.8506):
donald
Reason: The document has a high cosine similarity score with the query.

Rank 5 (Cosine Similarity: 0.8380):
donald trump threaten sue donald trump donald trump donaldtrump
Reason: The document has a high cosine similarity score with the query.

Document 1 was not marked as relevant.
Document 2 was marked as relevant.
Document 3 was not marked as relevant.
Document 4 was marked as relevant.
Document 5 was not marked as relevant.

Showing top 5 reranked documents that are most similar to the query 

<img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRTHzqLsn_WKLUfopNRkUTg4SNUpZQ7Q1oLQw&usqp=CAU" alt="SE" width="500" height="300">
