In [16]:
# Importing necessary modules
import os
import re
import numpy as np
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# Downloading required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on recent
# NLTK releases (3.9+); 'punkt' is kept so older releases keep working.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Shared text-cleaning resources, created once at module level so that
# clean_text() can reuse them for every document.
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))

In [17]:
# Loading documents from folder
def load_documents(folder_path):
    """Load the readable text documents found directly inside ``folder_path``.

    Files are visited in sorted filename order so that the doc id assigned to
    each file is the same on every run and on every machine
    (``os.listdir`` returns entries in arbitrary order).

    Only ``.txt`` files are actually read. ``.doc`` and ``.pdf`` files pass the
    extension filter, but no reader is implemented for them, so their content
    stays empty and they are skipped. Empty ``.txt`` files are skipped too.

    Args:
        folder_path: Directory to scan (sub-directories are not traversed).

    Returns:
        A tuple ``(data, doc_id_to_filename)``: ``data`` maps consecutive
        integer doc ids (starting at 0) to file contents, and
        ``doc_id_to_filename`` maps the same ids to the source filenames.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    data = {}
    doc_id_to_filename = {}
    doc_id = 0

    print(f'Scanning folder: {folder_path}')
    # Sorting makes the doc_id -> filename assignment deterministic.
    for filename in sorted(os.listdir(folder_path)):
        print(f'Found file: {filename}')
        if filename.endswith(('.txt', '.doc', '.pdf')):
            filepath = os.path.join(folder_path, filename)
            content = ''
            if filename.endswith('.txt'):
                with open(filepath, 'r', encoding='utf-8') as file:
                    content = file.read()

            # Empty content means either an empty .txt file or a .doc/.pdf
            # file (not read above); such files get no doc id.
            if content:
                data[doc_id] = content
                doc_id_to_filename[doc_id] = filename
                print(f'Loaded doc_id {doc_id} -> {filename}')
                doc_id += 1

    print(f'Total documents loaded: {len(data)}')
    return data, doc_id_to_filename

In [4]:
# Cleaning text by removing unwanted characters and lemmatizing
def clean_text(text):
    """Normalise raw text into a space-separated string of lemmatized tokens.

    The text is lower-cased, every character other than ASCII letters, digits
    and whitespace is removed, and the remainder is tokenized. Tokens that are
    in STOPWORDS or are a single character long are dropped; each remaining
    token is passed through LEMMATIZER.lemmatize.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    lowered = text.lower()
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

    lemmas = []
    for token in word_tokenize(stripped):
        # Skip stopwords and one-character tokens.
        if token in STOPWORDS or len(token) <= 1:
            continue
        lemmas.append(LEMMATIZER.lemmatize(token))
    return ' '.join(lemmas)

In [5]:
# Building vector space model using TF-IDF
def build_vector_space_model(data):
    """Fit a TF-IDF vector space model over the cleaned documents.

    Args:
        data: Mapping of doc id to raw text. Documents are cleaned with
            clean_text() and fed to the vectorizer in the mapping's
            iteration order.

    Returns:
        A tuple ``(tfidf_matrix, vectorizer)``: the result of
        ``fit_transform`` on the cleaned documents and the fitted
        ``TfidfVectorizer`` instance.
    """
    vectorizer = TfidfVectorizer()
    corpus = list(map(clean_text, data.values()))
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer

In [6]:
# Calculating similarity scores between documents
def calculate_similarity(tfidf_matrix, doc_id_to_filename):
    """Compute pairwise cosine similarity and print one line per document pair.

    Args:
        tfidf_matrix: Document vectors, one row per document.
        doc_id_to_filename: Mapping from row index (doc id) to filename; its
            length determines how many rows are reported.

    Returns:
        The full similarity matrix produced by ``cosine_similarity``.
    """
    similarity_matrix = cosine_similarity(tfidf_matrix)
    doc_count = len(doc_id_to_filename)

    print('Similarity scores between documents: ')
    # Walk the strict upper triangle so each unordered pair is printed once.
    for row in range(doc_count):
        for col in range(row + 1, doc_count):
            pair_score = similarity_matrix[row][col]
            left_name, right_name = doc_id_to_filename[row], doc_id_to_filename[col]
            print(f'Similarity between {left_name} and {right_name}: {pair_score:.4f}')
    return similarity_matrix

In [None]:
def main():
    """Run the document-similarity pipeline end to end.

    Loads documents from ``./documents`` (creating the folder if needed),
    prints a preview of each cleaned document, builds the TF-IDF model,
    prints the pairwise similarity scores and writes them to
    ``similarity_results.txt`` in the current working directory.

    If no documents could be loaded, a message is printed and the function
    returns without building a model or writing the results file.
    """
    folder_path = './documents'
    os.makedirs(folder_path, exist_ok=True)

    data, doc_id_to_filename = load_documents(folder_path)

    # The folder may have just been created above and therefore be empty;
    # fitting TF-IDF on an empty corpus fails, so stop early instead.
    if not data:
        print(f'No documents found in {folder_path}; nothing to compare.')
        return

    # Print cleaned text preview
    for doc_id, content in data.items():
        cleaned = clean_text(content)
        print(f'Doc {doc_id} cleaned text (first 100 chars): {cleaned[:100]}...')

    # Build model (the fitted vectorizer itself is not needed here)
    tfidf_matrix, _ = build_vector_space_model(data)

    # Calculate similarity; the filename mapping is a required argument and
    # is used to label each printed pair.
    similarity_matrix = calculate_similarity(tfidf_matrix, doc_id_to_filename)

    # Convert dictionary to list so index = doc_id
    filenames = [doc_id_to_filename[i] for i in range(len(doc_id_to_filename))]

    # Save results
    with open('similarity_results.txt', 'w', encoding='utf-8') as result_file:
        result_file.write('Similarity scores:\n')

        num_docs = len(filenames)
        for i in range(num_docs):
            for j in range(i + 1, num_docs):
                score = similarity_matrix[i][j]
                result_file.write(
                    f'{filenames[i]} and {filenames[j]}: {score:.4f}\n'
                )

    print('Results saved to similarity_results.txt')
    print('Include snapshots of the results and code in your report.')
    print('Upload the notebook to GitHub and share the link.')

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    main()