In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import os

In [None]:
# Load the corpus from the CSV file in the working directory.
df = pd.read_csv('data.csv')

# PREPROCESSING
# NLTK's English stop words, kept in a set so that the membership test used
# when filtering tokens is a hash lookup.
stop_words = set()
stop_words.update(stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Turn raw text into a list of lowercase tokens without stop words.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is split with NLTK's
    ``word_tokenize``, and tokens present in the module-level ``stop_words``
    set are dropped.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            having no content.

    Returns:
        list: The surviving tokens in their original order, or an empty list
        when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    # Unwanted characters are deleted, not replaced by a space, so "don't"
    # becomes "dont" and "well-known" becomes "wellknown".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # A single pass applies both the blank-token and the stop-word filter;
    # repeating the blank-token check afterwards would not remove anything.
    return [
        token
        for token in word_tokenize(letters_only)
        if token.strip() and token not in stop_words
    ]


In [None]:
# Tokenize every section. Rows that already hold a token list are kept as
# they are, so re-running this cell does not wipe the column (passing a list
# to preprocess_text returns [] because it is not a string).
df['SECTION_TEXT'] = df['SECTION_TEXT'].apply(
    lambda x: x if isinstance(x, list) else preprocess_text(x)
)

# VOCABULARY
# Maps each unique word to an integer index, assigned in order of first
# appearance while walking the sections from top to bottom.
word_to_index = {}

for tokens_list in df['SECTION_TEXT']:
    for word in tokens_list:
        if isinstance(word, str):
            # setdefault stores the index only for an unseen word;
            # len(word_to_index) is the next free index at that moment.
            word_to_index.setdefault(word, len(word_to_index))

In [None]:
# Count how often each word occurs within each section. The tally is rebuilt
# from scratch for every section and is not stored anywhere in this cell, so
# only the counts of the last section remain in term_frequency afterwards.
for idx, tokens_list in enumerate(df['SECTION_TEXT']):
    term_frequency = {}
    for word in tokens_list:
        if word in term_frequency:
            term_frequency[word] += 1
        else:
            term_frequency[word] = 1

In [None]:
# Make sure the output folder exists before writing into it.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Write the vocabulary as "<index> <word>" lines in ascending index order,
# under a header line and followed by one blank line.
vocab_path = os.path.join(output_dir, 'vocab.txt')
with open(vocab_path, 'w') as f:
    f.write("Vocabulary:\n")
    f.writelines(
        f"{word_to_index[vocab_word]} {vocab_word}\n"
        for vocab_word in sorted(word_to_index, key=word_to_index.get)
    )
    f.write("\n")

In [None]:
# Make sure the output folder exists before writing into it.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

tf_path = os.path.join(output_dir, 'tf.txt')
with open(tf_path, 'w') as f:
    for idx, tokens_list in enumerate(df['SECTION_TEXT']):
        # Raw occurrence count of every word in this section.
        term_frequency = {}
        for word in tokens_list:
            if word in term_frequency:
                term_frequency[word] += 1
            else:
                term_frequency[word] = 1

        # Header line, then one "(vocabulary index, count), " pair per
        # distinct word in first-occurrence order, then a blank line.
        f.write(f"Term frequencies for Section {df['ARTICLE_ID'][idx]}:\n")
        f.writelines(
            f"({word_to_index[token]}, {count}), "
            for token, count in term_frequency.items()
        )
        f.write("\n\n")

In [None]:
#DOCUMENT FREQUENCY OF EACH WORD
#word_document_frequency[word] is the number of section texts that contain
#the word at least once. This is the raw document frequency; no inverse or
#logarithm is applied here.

#Start every vocabulary word at 0 so the dictionary keeps vocabulary order
word_document_frequency = dict.fromkeys(word_to_index, 0)

#Single pass over the corpus: each section adds 1 for every distinct word it
#contains. set() removes repeats within a section, and this avoids scanning
#every section once per vocabulary word.
for tokens_list in df['SECTION_TEXT']:
    for word in set(tokens_list):
        if word in word_document_frequency:
            word_document_frequency[word] += 1

print("Document Frequencies:")
for word, frequency in word_document_frequency.items():
    print(f"({word_to_index[word]}, {frequency})")

In [None]:
#TF/IDF WEIGHTS
#The weight of a word in a section is its term frequency in that section
#divided by its document frequency across all sections.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

tfidf_path = os.path.join(output_dir, 'tfidf.txt')
with open(tfidf_path, 'w') as f:
    f.write("Weights for each word in the document:\n")
    for idx, tokens_list in enumerate(df['SECTION_TEXT']):
        #Raw occurrence count of every word in this section
        term_frequency = {}
        for word in tokens_list:
            if word in term_frequency:
                term_frequency[word] += 1
            else:
                term_frequency[word] = 1

        #Header line, then one "(vocabulary index, weight), " pair per
        #distinct word with the weight printed to 5 decimals, then a blank line
        f.write(f"Weights for Section {df['ARTICLE_ID'][idx]}:\n")
        f.writelines(
            f"({word_to_index[token]}, {count / word_document_frequency[token]:.5f}), "
            for token, count in term_frequency.items()
        )
        f.write("\n\n")

In [None]:
#VECTOR SPACE MODEL

#VECTOR LIST

#Number of leading sections that get a dense weight vector. Each vector has
#one slot per vocabulary word, so this bounds the memory used.
MAX_VECTOR_SECTIONS = 10

#Set the output folder here as well so this cell does not depend on a value
#left behind by an earlier cell
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

#section_weights[i][j] is the weight of vocabulary word j in section i
section_weights = []

for tokens_list in df['SECTION_TEXT'][:MAX_VECTOR_SECTIONS]:
    #Raw occurrence count of every word in this section
    term_frequency = {}
    for word in tokens_list:
        term_frequency[word] = term_frequency.get(word, 0) + 1

    #Words that do not occur in the section keep the initial weight 0
    section_weight = [0] * len(word_to_index)
    for word, frequency in term_frequency.items():
        if word in word_to_index:  #Skip anything missing from the vocabulary
            weight = frequency / word_document_frequency[word]
            section_weight[word_to_index[word]] = round(weight, 5)

    section_weights.append(section_weight)

#Append the non-zero entries of every vector to tfidf.txt. The file is opened
#once for all sections, and only when there is something to write.
if section_weights:
    with open(os.path.join(output_dir, 'tfidf.txt'), 'a') as f:
        for idx, weights in enumerate(section_weights):
            f.write(f"Weights for Section {df['ARTICLE_ID'][idx]}:\n")
            #The stored weights are already rounded to 5 decimals
            f.write(", ".join(f"({i}, {w})" for i, w in enumerate(weights) if w > 0))
            f.write("\n\n")

In [None]:
#QUERY LIST

#Query vector with one slot per vocabulary word, all zero to begin with
query_list = [0] * len(word_to_index)

#Read the sentence to search for and clean it like the section texts
query_sentence = input("Enter a sentence: ")
query_words = preprocess_text(query_sentence)

#One score per query word, in the same order as query_words
word_relevance_scores = []

#Words whose query weight has already been computed. A repeated query word
#reuses that weight instead of accumulating on top of it.
scored_words = set()

for query_word in query_words:
    query_word_index = word_to_index.get(query_word)
    if query_word_index is None:
        #The word is not in the vocabulary, so it cannot match any section;
        #record a zero score to keep the list aligned with query_words
        word_relevance_scores.append(0)
        continue

    if query_word not in scored_words:
        scored_words.add(query_word)
        document_frequency = word_document_frequency[query_word]

        #Sum the word's weight (term frequency / document frequency) over
        #the sections that contain it, counting those sections
        document_counter = 0
        total_weight = 0
        for tokens_list in df['SECTION_TEXT']:
            term_frequency = tokens_list.count(query_word)
            if term_frequency > 0:
                document_counter += 1
                total_weight += term_frequency / document_frequency

        #The query weight is the average over the sections containing the word
        if document_counter > 0:
            query_list[query_word_index] = total_weight / document_counter

    word_relevance_scores.append(query_list[query_word_index])

output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, 'query_list.txt'), 'w') as f:
    f.write("Query List:\n")
    f.write(", ".join(map(str, query_list)))

In [None]:
#Relevance of a section to the query is the dot product of the section's
#weight vector with the query vector. Both are indexed by vocabulary index.
section_relevance_scores = []

#Only vocabulary positions with a non-zero query weight can contribute
query_terms = [
    (vocab_index, query_weight)
    for vocab_index, query_weight in enumerate(query_list)
    if query_weight
]

for section_weight in section_weights:
    section_total_relevance_score = 0
    for vocab_index, query_weight in query_terms:
        #Section weight and query weight of the same vocabulary word
        section_total_relevance_score += section_weight[vocab_index] * query_weight
    section_relevance_scores.append(section_total_relevance_score)

#Append every section with a positive score to relevance.txt. The file is
#opened once, and only when at least one section matched.
matching_sections = [
    (section_idx, section_score)
    for section_idx, section_score in enumerate(section_relevance_scores)
    if section_score > 0
]
if matching_sections:
    with open(os.path.join(output_dir, 'relevance.txt'), 'a') as f:
        for idx, score in matching_sections:
            section_text = ' '.join(df['SECTION_TEXT'][idx])
            f.write(f"\nRelevance score for Section {df['ARTICLE_ID'][idx]}: {score}\n")
            f.write(f"Section text content: {section_text}\n")