In [13]:
#Q1

import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Fetch the NLTK resources the preprocessing step needs.
# 'punkt' is the tokenizer model used by older NLTK releases; recent releases
# make word_tokenize load 'punkt_tab' instead, so both are requested here.
# 'stopwords' provides the English stop-word list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def process_files(original_folder_path, processedFolder):
    """Preprocess every ``.txt`` file in a folder and write the cleaned text out.

    For each file the text is lowercased, tokenized with ``word_tokenize``,
    filtered against the English stop-word list, stripped of every character
    found in ``string.punctuation``, and any token left empty is dropped.
    The surviving tokens are written, space-separated, to a file of the same
    name inside ``processedFolder``. The text of the first five files is
    printed before and after processing as a sample.

    Args:
        original_folder_path: Folder containing the raw ``.txt`` files.
        processedFolder: Destination folder; created if it does not exist.

    Returns:
        A dict mapping each processed filename to its list of tokens, or
        ``None`` (after printing an error) when ``original_folder_path``
        does not exist.
    """
    if not os.path.exists(original_folder_path):
        print(f"Error: The folder '{original_folder_path}' does not exist.")
        return

    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(processedFolder, exist_ok=True)

    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Number of before/after samples printed so far (capped at five).
    printCount = 0

    processedTokens = {}

    # os.listdir returns names in arbitrary order; sorting makes the sampled
    # files and the order of the returned dict reproducible between runs.
    for filename in sorted(os.listdir(original_folder_path)):
        if not filename.endswith(".txt"):
            continue

        source_path = os.path.join(original_folder_path, filename)
        processed_file_path = os.path.join(processedFolder, filename)

        with open(source_path, 'r') as file:
            raw_text = file.read()

        # Keep the raw text separately so the "original" sample below shows
        # the file as read, not the lowercased working copy.
        content = raw_text.lower()

        tokens = word_tokenize(content)

        # Stop words are removed before punctuation is stripped.
        tokens = [token for token in tokens if token not in stop_words]

        tokens = [''.join(char for char in token if char not in punctuation) for token in tokens]

        # Drop tokens that became empty or whitespace-only after stripping.
        tokens = [token for token in tokens if token.strip()]

        processedTokens[filename] = tokens

        if printCount < 5:
            print(f"Original content of {filename} before processing:")
            print(raw_text)
            print()
            print(f"Processed content of {filename} after processing:")
            print(" ".join(tokens))
            print()
            printCount += 1

        with open(processed_file_path, 'w') as file:
            file.write(" ".join(tokens))

    return processedTokens





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SHOBHIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHOBHIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
#Q2

import os
import pickle

def read_processed_files(processedFolder):
    """Load the whitespace-separated tokens of each ``.txt`` file in a folder.

    Args:
        processedFolder: Folder to scan; entries not ending in ``.txt`` are
            skipped.

    Returns:
        A dict mapping each filename to the list produced by calling
        ``str.split()`` on that file's contents.
    """
    tokens_by_file = {}
    for entry in os.listdir(processedFolder):
        if not entry.endswith(".txt"):
            continue
        with open(os.path.join(processedFolder, entry), 'r') as handle:
            tokens_by_file[entry] = handle.read().split()
    return tokens_by_file

def create_iIndex(processedTokens):
    """Build an inverted index from per-file token lists.

    Args:
        processedTokens: Mapping of filename -> iterable of tokens.

    Returns:
        A dict mapping each token to the list of filenames that contain it.
        Each filename appears at most once per token, in the order the files
        are iterated; tokens appear in order of first occurrence.
    """
    iIndex = {}

    for filename, tokens in processedTokens.items():
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # each (token, file) pair is handled once instead of scanning the
        # posting list for every repeated token.
        for token in dict.fromkeys(tokens):
            iIndex.setdefault(token, []).append(filename)
    return iIndex

def save_iIndex(iIndex, file_path):
    """Pickle the inverted index to ``file_path``, replacing any existing file."""
    with open(file_path, 'wb') as handle:
        pickle.Pickler(handle).dump(iIndex)

def load_iIndex(file_path):
    """Read a pickled inverted index from ``file_path`` and return it."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

In [15]:
#Q3

def create_pIndex(processedFolder):
    """Build a positional index from the processed ``.txt`` files in a folder.

    Each file is read as a whitespace-separated token sequence and every
    token's 0-based positions within that sequence are recorded.

    Args:
        processedFolder: Folder to scan; entries not ending in ``.txt`` are
            skipped.

    Returns:
        A dict of the form ``{token: {filename: [position, ...]}}`` with
        positions in ascending order.
    """
    pIndex = {}

    for filename in os.listdir(processedFolder):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(processedFolder, filename)
        with open(file_path, 'r') as file:
            content = file.read()

        # The processed files hold space-joined tokens, so split on whitespace
        # rather than re-tokenizing; positions then refer to the stored token
        # sequence and no tokenizer needs to be in scope.
        tokens = content.split()

        # Single pass: append each position to its token's list for this file.
        for position, token in enumerate(tokens):
            pIndex.setdefault(token, {}).setdefault(filename, []).append(position)
    return pIndex

def save_pIndex(pIndex, file_path):
    """Pickle the positional index to ``file_path``, replacing any existing file."""
    with open(file_path, 'wb') as sink:
        sink.write(pickle.dumps(pIndex))

def load_pIndex(file_path):
    """Read a pickled positional index from ``file_path`` and return it."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(file_path, 'rb') as source:
        restored = pickle.load(source)
        return restored

In [16]:
#RUN ALL 

# Locations of the processed corpus and of the two pickled indexes.
processedFolder = "C:\\Users\\SHOBHIT\\Downloads\\irdatasetsA1Processed"
iIndexFile ="C:\\Users\\SHOBHIT\\Downloads\\iIndex.pickle" 
pIndexFile ="C:\\Users\\SHOBHIT\\Downloads\\pos_index.pickle" 

# Inverted index: build, save, reload, and print the reloaded copy.
processedTokens = read_processed_files(processedFolder)

iIndex = create_iIndex(processedTokens)

save_iIndex(iIndex, iIndexFile)

loaded_iIndex = load_iIndex(iIndexFile)

print("Inverted Index:")
for key, value in loaded_iIndex.items():
    print(f"{key}: {value}")

# Positional index: written to its own file so it does not overwrite the
# inverted-index pickle saved above.
pos_index = create_pIndex(processedFolder)
save_pIndex(pos_index, pIndexFile)
loaded_pos_index = load_pIndex(pIndexFile)
print("Positional Index:")
for key, value in loaded_pos_index.items():
    print(f"{key}: {value}")

Inverted Index:
loving: ['file1.txt', 'file254.txt', 'file391.txt', 'file723.txt']
vintage: ['file1.txt', 'file150.txt', 'file197.txt', 'file278.txt', 'file422.txt', 'file439.txt', 'file494.txt', 'file51.txt', 'file597.txt', 'file638.txt', 'file674.txt', 'file725.txt', 'file737.txt', 'file827.txt', 'file847.txt', 'file895.txt', 'file907.txt', 'file936.txt']
springs: ['file1.txt', 'file272.txt', 'file469.txt', 'file806.txt', 'file937.txt']
strat: ['file1.txt', 'file149.txt', 'file163.txt', 'file197.txt', 'file241.txt', 'file245.txt', 'file25.txt', 'file253.txt', 'file345.txt', 'file353.txt', 'file380.txt', 'file396.txt', 'file400.txt', 'file422.txt', 'file440.txt', 'file455.txt', 'file457.txt', 'file469.txt', 'file519.txt', 'file529.txt', 'file559.txt', 'file565.txt', 'file579.txt', 'file611.txt', 'file626.txt', 'file650.txt', 'file652.txt', 'file691.txt', 'file725.txt', 'file801.txt', 'file838.txt', 'file853.txt', 'file90.txt', 'file940.txt', 'file978.txt', 'file993.txt']
good: ['file1