In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# Obtain the reference corpus: reuse the pickled copy when one exists,
# otherwise seed a small default corpus and persist it for later runs.
try:
    existing_content = joblib.load("content_dataset.pkl")
except FileNotFoundError:
    existing_content = [
        "Artificial intelligence is transforming industries with automation.",
        "Machine learning helps in predictive analytics for businesses.",
        "Data science combines statistics and AI to extract insights from data.",
    ]
    joblib.dump(existing_content, "content_dataset.pkl")

# Learn the TF-IDF vocabulary and weights from the corpus, then save the
# fitted vectorizer to disk.
vectorizer = TfidfVectorizer().fit(existing_content)
joblib.dump(vectorizer, "vectorizer.pkl")

def check_plagiarism(user_content):
    """Score how closely ``user_content`` matches the stored reference texts.

    The text is vectorized with the TF-IDF vectorizer saved in
    ``vectorizer.pkl`` and compared, by cosine similarity, against every
    entry of the dataset saved in ``content_dataset.pkl``.

    Args:
        user_content: Text to check.

    Returns:
        ``{"plagiarism_score": <float>}`` holding the highest cosine
        similarity against any stored text (``0.0`` when the stored dataset
        is empty), or ``{"error": "No content provided"}`` when the input is
        empty, ``None`` or consists only of whitespace.
    """
    # Whitespace-only text carries no tokens, so treat it like empty input
    # instead of vectorizing it.
    if not user_content or (
        isinstance(user_content, str) and not user_content.strip()
    ):
        return {"error": "No content provided"}

    # NOTE: joblib files are pickles and can execute code when loaded; these
    # paths must only ever point at files this application wrote itself.
    vectorizer = joblib.load("vectorizer.pkl")
    existing_content = joblib.load("content_dataset.pkl")

    # With nothing to compare against there is no match; without this guard
    # the similarity computation below fails on an empty matrix.
    # len() is used rather than truthiness so array-like datasets work too.
    if len(existing_content) == 0:
        return {"plagiarism_score": 0.0}

    # Each row is transformed independently with the already-fitted
    # vectorizer, so vectorizing the query and the corpus separately gives
    # the same rows as stacking them into one list and slicing afterwards.
    user_vector = vectorizer.transform([user_content])
    corpus_matrix = vectorizer.transform(existing_content)

    similarity_scores = cosine_similarity(user_vector, corpus_matrix)
    return {"plagiarism_score": float(similarity_scores.max())}

if __name__ == "__main__":
    # Read one text from the console and print its plagiarism report.
    print(check_plagiarism(input("Enter text to check for plagiarism: ")))


Enter text to check for plagiarism:  Machine learning is useful for predictive analytics.


{'plagiarism_score': 0.7216878364870323}
