In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
import os
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def load_documents_from_files(file_paths):
    documents = []
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                document_text = file.read()
                words = word_tokenize(document_text.lower())
                words = [lemmatizer.lemmatize(word) for word in words]
                words = [word for word in words if word not in stop_words]
                cleaned_text = ' '.join(words)
                documents.append(cleaned_text)
        except FileNotFoundError:
            print(f"{file_path} file not found.")
    return documents


assignments = [os.path.join("assignments", file) for file in os.listdir("assignments")
                    if os.path.isfile(os.path.join("assignments", file)) and file.endswith(".txt")]

documents = load_documents_from_files(assignments)


model = Doc2Vec.load("/content/model.bin")

document_vectors = [model.infer_vector(doc.split()) for doc in documents]
similarity_matrix = cosine_similarity(document_vectors)


for i in range(len(documents)):
    for j in range(i + 1, len(documents)):
        similarity = similarity_matrix[i][j]
        print(f"Similarity between {os.path.basename(assignments[i])} and {os.path.basename(assignments[j])}: {similarity:.4f}")

print("\n")

similarity_threshold = 0.75
for i in range(len(documents)):
    for j in range(i + 1, len(documents)):
        if similarity_matrix[i][j] > similarity_threshold:
            print(f"Document {os.path.basename(assignments[i])} and Document {os.path.basename(assignments[j])} have high similarity! And may be copy!.")





Similarity between as5.txt and as6.txt: 0.9918
Similarity between as5.txt and as2.txt: 0.1670
Similarity between as5.txt and as4.txt: 0.9892
Similarity between as5.txt and as1.txt: 0.3427
Similarity between as5.txt and as3.txt: 0.2707
Similarity between as6.txt and as2.txt: 0.1799
Similarity between as6.txt and as4.txt: 0.9892
Similarity between as6.txt and as1.txt: 0.3610
Similarity between as6.txt and as3.txt: 0.2711
Similarity between as2.txt and as4.txt: 0.1818
Similarity between as2.txt and as1.txt: 0.3910
Similarity between as2.txt and as3.txt: 0.1865
Similarity between as4.txt and as1.txt: 0.3523
Similarity between as4.txt and as3.txt: 0.2931
Similarity between as1.txt and as3.txt: 0.1231


Document as5.txt and Document as6.txt have high similarity! And may be copy!.
Document as5.txt and Document as4.txt have high similarity! And may be copy!.
Document as6.txt and Document as4.txt have high similarity! And may be copy!.
