In [9]:
!pip install nltk
!pip install sklearn

from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data this cell relies on: the stop-word lists and the Punkt
# sentence-tokenizer models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' models, so request it as well to work across versions.
nltk.download('punkt_tab')

@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize *text* into a space-separated string of content words.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) or whose lowercase form
    is an English stop word are dropped, and the remaining tokens are
    lowercased.

    Args:
        text: The raw string to normalize.

    Returns:
        The surviving lowercase tokens joined by single spaces; an empty
        string when nothing survives the filtering.
    """
    # The stop-word set is cached so that repeated calls (one per document
    # when comparing a corpus) do not reload the word list each time.
    stop_words = _english_stop_words()

    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # isalpha() is checked on the original token, not the lowercased one,
        # because lowercasing can change the result for some characters.
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)

    return ' '.join(kept)

def check_plagiarism(input_text, documents, threshold=0.8):
    """Find the documents that are highly similar to *input_text*.

    The input and every document are normalized with ``preprocess_text``,
    vectorized together with TF-IDF, and each document is compared to the
    input by cosine similarity.

    Args:
        input_text: The text to check.
        documents: Iterable of candidate source texts to compare against.
        threshold: A document is reported only when its similarity is
            strictly greater than this value. Defaults to 0.8.

    Returns:
        A list of ``{'document': ..., 'similarity': ...}`` dicts, one per
        document above the threshold, in the order the documents were given.
        An empty list when *documents* is empty.
    """
    # Materialize once so one-shot iterables can be both preprocessed and
    # paired with their scores below.
    documents = list(documents)
    if not documents:
        # Nothing to compare against; the pairwise similarity call below
        # rejects an empty set of document vectors.
        return []

    # Row 0 is the input text; rows 1..n line up with `documents`.
    corpus = [preprocess_text(input_text)]
    corpus.extend(preprocess_text(doc) for doc in documents)

    # NOTE(review): if no text yields any term after preprocessing, the
    # vectorizer is expected to raise ValueError (empty vocabulary) - confirm.
    tfidf_matrix = TfidfVectorizer().fit_transform(corpus)

    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    return [
        {'document': document, 'similarity': similarity}
        for document, similarity in zip(documents, similarities)
        if similarity > threshold
    ]

# Demo: compare one input string against a small in-memory set of documents.
input_text = "This is my original content."
documents = [
    "This is my original content.",
    "This is some plagiarized content.",
    "This is another original document.",
]

plagiarism_results = check_plagiarism(input_text, documents)

# Report the outcome; every hit is printed with its score and matching text.
if not plagiarism_results:
    print("No plagiarism detected.")
else:
    print("Plagiarism detected!")
    for result in plagiarism_results:
        print("Similarity:", result['similarity'])
        print("Plagiarized document:", result['document'])

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Plagiarism detected!
Similarity: 1.0000000000000002
Plagiarized document: This is my original content.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
