## 1. Importing Libraries


In [None]:
import os
import re

In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored
# there. The google.colab module is provided by the Colab runtime, so this
# cell is expected to fail when the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# NLTK data used by the preprocessing functions below:
#   punkt / punkt_tab -> sentence/word tokenizer models used by word_tokenize
#                        (recent NLTK releases look up 'punkt_tab'; older
#                        ones look up 'punkt', so both are requested)
#   stopwords         -> English stop-word list
#   wordnet           -> dictionary used by WordNetLemmatizer
# nltk.download() skips packages that are already up to date, so re-running
# this cell is cheap.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## 2. Text Preprocessing

In [None]:
# Function to lowercase the text
def lower_case(text):
    """Return a copy of ``text`` with all cased characters lowercased."""
    lowered = text.lower()
    return lowered

# Function to remove URLs
def remove_urls(text):
    """Return ``text`` with URLs removed.

    A URL is a run of non-whitespace characters that starts with
    ``http://``, ``https://`` or a word-initial ``www.``; matching is
    case-insensitive so the function also works on text that has not been
    lowercased yet.

    Requiring the ``://`` after the scheme keeps ordinary words that merely
    contain "http" (for example "httpd") from being deleted.
    """
    return re.sub(r'(?:https?://|\bwww\.)\S+', '', text, flags=re.IGNORECASE)

# Function to remove non-word characters (punctuation)
def non_word(text):
    """Replace every run of non-word characters in ``text`` with one space.

    "Word characters" are whatever the regex class ``\\w`` matches (letters,
    digits and the underscore), so underscores are kept. Leading and
    trailing runs also become a single space; nothing is stripped.
    """
    separators = re.compile(r'\W+')
    return separators.sub(' ', text)

# Function to remove digits
def remove_digits(text):
    """Return ``text`` with every run of digits deleted (not replaced)."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

# Function to tokenize the text
def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Cache for the NLTK English stop-word set; filled on first use by stop_word()
# so the word list is read and hashed once instead of once per document.
_ENGLISH_STOP_WORDS = None


# Function to remove stop words
def stop_word(tokens, stop_words=None):
    """Return the tokens that are not stop words, preserving their order.

    Args:
        tokens: iterable of string tokens. The comparison is exact, so the
            tokens should already be lowercased to match NLTK's list.
        stop_words: optional container of words to drop. When omitted, the
            NLTK English stop-word list is used (loaded once and cached).

    Returns:
        A new list containing only the tokens not found in ``stop_words``.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return [token for token in tokens if token not in stop_words]

# Function to lemmatize the tokens
def lemmatize_tokens(tokens):
    """Return a list with each token replaced by its WordNet lemma.

    Every token is passed to ``WordNetLemmatizer.lemmatize`` with no extra
    arguments, i.e. with the lemmatizer's default settings.
    """
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

# Main text processing function
def process_text(text):
    """Run the full preprocessing pipeline and return the resulting tokens.

    String-level cleaning is applied in this order: lowercase, strip URLs,
    replace non-word characters with spaces, delete digits. The cleaned
    string is then tokenized, stop words are removed and the remaining
    tokens are lemmatized.
    """
    cleaned = text
    # URLs must be removed before non_word() breaks them into pieces.
    for step in (lower_case, remove_urls, non_word, remove_digits):
        cleaned = step(cleaned)
    return lemmatize_tokens(stop_word(tokenize(cleaned)))

# Sample input: a smoke test for process_text(). The string contains a stray
# symbol, mixed case, stop words, digits glued to a word and a URL, so each
# cleaning step has something to do. Note that `text` is a module-level name
# that the document-loading loop later reassigns.
text = "@ The reduces words to their root form, improving text analysis. The Porter Stemmer algorithm is commonly used 988234good283. https://t.co/GDrqU22YpT"

print(process_text(text))


['reduces', 'word', 'root', 'form', 'improving', 'text', 'analysis', 'porter', 'stemmer', 'algorithm', 'commonly', 'used', 'good']


## 3. Importing the Folder Containing .txt Files

In [None]:
# Folder containing the text files to index. The path lives under
# /content/drive, so Google Drive must already be mounted there.
# NOTE(review): the string must match the folder names on Drive exactly,
# including the "Retrievsl" spelling — confirm against Drive before
# "correcting" it.
folder_path = '/content/drive/MyDrive/PBS BSc IT/Year 3/Information Retrievsl System (TECH 400)/Week 2/Assignment Lab/Text Documents'

## 4. Creating a Dictionary of Tokenized Documents

In [None]:
# Maps each .txt file name in folder_path to its list of processed tokens.
documents = {}

# sorted(): os.listdir() returns entries in arbitrary order; sorting makes the
# document order (and therefore the posting-list order) reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue
    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding.
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    # Process after the file has been closed; only the text is needed.
    documents[filename] = process_text(text)

## 5. Creating an Inverted Index

In [None]:
from collections import defaultdict

# term -> list of document names whose token list contains that term
inverted_index = defaultdict(list)

# Build the inverted index
for doc in documents:
    tokens = documents[doc]
    # set(): list a document once per term, however often the term occurs
    for token in set(tokens):
        inverted_index[token] += [doc]


## 6. Query Processing (AND Queries)

In [None]:
def and_query(query, index, tokenizer=None):
    """Return the set of documents that contain every term of ``query``.

    Args:
        query: free-text query string.
        index: mapping of term -> iterable of document names (the inverted
            index).
        tokenizer: optional callable turning the query string into a list
            of terms. Defaults to ``process_text`` so the query is
            normalised (lowercased, stop words removed, lemmatized) exactly
            like the indexed documents; without that, a query such as
            "films" could never match the indexed lemma "film".

    Returns:
        A set of document names. The set is empty when the query has no
        terms left after normalisation, or when any term is absent from the
        index (a term matching no document makes the whole AND empty).
    """
    normalize = tokenizer if tokenizer is not None else process_text
    query_tokens = normalize(query)
    if not query_tokens:
        return set()

    result_docs = None
    for token in query_tokens:
        # .get() instead of index[token]: indexing a defaultdict would insert
        # an empty posting list for every unknown term that is looked up.
        postings = index.get(token)
        if not postings:
            return set()
        if result_docs is None:
            result_docs = set(postings)
        else:
            result_docs &= set(postings)
        if not result_docs:
            # The intersection is already empty; further terms cannot add to it.
            return set()
    return result_docs


## 7. Testing the Queries

In [None]:
# Interactive prompt: read AND queries until the user types 'exit'.
while True:
    try:
        query = input("Enter AND query (or 'exit' to stop): ")
    except EOFError:
        # Input stream closed (e.g. piped input ran out): stop cleanly.
        break
    # Tolerate surrounding whitespace and capitalisation in the exit command.
    if query.strip().lower() == 'exit':
        break
    results = and_query(query, inverted_index)
    if results:
        # sorted(): the result is a set, whose iteration order is not stable
        # between runs; sorting gives reproducible output.
        print(f"Documents found: {', '.join(sorted(results))}")
    else:
        print("No documents found.")

No documents found.
Documents found: david_fincher.txt
