# In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import os
from docx import Document
import PyPDF2

# Fetch the NLTK data used below: the tokenizer models behind word_tokenize
# and the stop-word lists. Recent NLTK releases load the tokenizer from the
# 'punkt_tab' package rather than 'punkt', so both are requested to keep the
# script working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Normalise raw text before it is vectorised.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, the remainder is split with NLTK's ``word_tokenize`` and tokens
    found in NLTK's English stop-word list are dropped.

    Args:
        text: The raw document text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalised = text.lower().translate(strip_punctuation)
    words = word_tokenize(normalised)
    english_stop_words = set(stopwords.words('english'))

    kept = []
    for word in words:
        if word in english_stop_words:
            continue
        kept.append(word)
    return ' '.join(kept)

def read_document(file_path):
    """Return the plain text of a .txt, .docx or .pdf file.

    The format is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text. An empty string is returned when the extension
        is not supported or when reading fails for any reason; in both cases
        a message is printed instead of an exception being raised.
    """
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()

    try:
        if ext == '.txt':
            # 'utf-8-sig' decodes ordinary UTF-8 and also drops a leading
            # byte-order mark, which would otherwise stick to the first word.
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                return file.read()

        if ext == '.docx':
            doc = Document(file_path)
            # Skip paragraphs that contain only whitespace.
            return ' '.join(
                para.text for para in doc.paragraphs if para.text.strip()
            )

        if ext == '.pdf':
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Extract while the file is still open.
                page_texts = [page.extract_text() or '' for page in reader.pages]
            # Separate pages with a newline so the last word of one page is
            # not fused with the first word of the next. Pages without text
            # are skipped, so a PDF with no extractable text still yields ''.
            return '\n'.join(page_text for page_text in page_texts if page_text)

        # Raised inside the try on purpose: unsupported types are reported
        # through the same print-and-return-'' path as read failures.
        raise ValueError(f"Unsupported file type: {ext}")

    except Exception as e:
        # Best-effort reader: callers detect failure via the empty string.
        print(f"Error reading {file_path}: {e}")
        return ''

# Folder scanned for input documents; replace with a real directory.
folder_path = 'path/to/your/folder'

# Match extensions case-insensitively, as read_document() does, ignore
# directories, and sort the names: os.listdir() returns entries in arbitrary
# order, which would make the choice of the two compared files unpredictable.
supported_extensions = ('.txt', '.docx', '.pdf')
files = sorted(
    f for f in os.listdir(folder_path)
    if f.lower().endswith(supported_extensions)
    and os.path.isfile(os.path.join(folder_path, f))
)

if len(files) < 2:
    print("Please provide at least two documents (.txt, .docx, or .pdf) in the folder.")
    # SystemExit instead of exit(): exit() is a helper injected by the site
    # module for interactive use. A non-zero status marks the run as failed.
    raise SystemExit(1)

# Compare the first two documents in sorted order.
doc1_path = os.path.join(folder_path, files[0])
doc2_path = os.path.join(folder_path, files[1])

print(f"Reading documents: {files[0]} and {files[1]}")
doc1 = read_document(doc1_path)
doc2 = read_document(doc2_path)

# read_document() returns '' on failure (or for a document with no text).
if not doc1 or not doc2:
    print("One or both documents could not be read. Exiting.")
    raise SystemExit(1)

doc1_clean = preprocess_text(doc1)
doc2_clean = preprocess_text(doc2)

# Show a short preview of the cleaned tokens of each document.
tokens1 = word_tokenize(doc1_clean)
tokens2 = word_tokenize(doc2_clean)
print("Tokens Doc1:", tokens1[:10], "...")
print("Tokens Doc2:", tokens2[:10], "...")

# Fit TF-IDF on both documents together so they share one vocabulary, then
# compare the two resulting row vectors.
vectorizer = TfidfVectorizer()
try:
    tfidf_matrix = vectorizer.fit_transform([doc1_clean, doc2_clean])
except ValueError as e:
    # fit_transform raises ValueError when no usable terms remain, e.g. when
    # the documents consisted only of stop words and punctuation.
    print(f"Could not compute similarity: {e}")
    raise SystemExit(1)

cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
print(f"Cosine Similarity: {cos_sim:.4f}")