In [3]:
import os
import numpy as np
import language_tool_python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_document(file_path):
    """Reads a UTF-8 text file and returns its full contents as a string.

    If ``file_path`` does not point to an existing regular file, an error
    message is printed and ``None`` is returned instead.
    """
    # Guard clause: bail out early when there is nothing to read.
    if not os.path.isfile(file_path):
        print("Error: File not found.")
        return None

    with open(file_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def check_tfidf_cosine_similarity(input_text, database_text):
    """Scores lexical overlap between two texts with TF-IDF and cosine similarity.

    A fresh ``TfidfVectorizer`` is fitted on just the two texts (database
    first, input second) and the cosine similarity between their two TF-IDF
    rows is returned.
    """
    documents = [database_text, input_text]
    weights = TfidfVectorizer().fit_transform(documents)

    # Row 0 is the database document, row 1 the document being checked.
    database_row, input_row = weights[0], weights[1]
    pairwise = cosine_similarity(database_row, input_row)
    return pairwise[0][0]

# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# in the same kernel do not rebuild the model each time.
_SBERT_MODEL_CACHE = {}


def _get_sbert_model(model_name):
    """Returns the SentenceTransformer for ``model_name``, constructing it on first use only."""
    if model_name not in _SBERT_MODEL_CACHE:
        _SBERT_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SBERT_MODEL_CACHE[model_name]


def check_semantic_similarity(input_text, database_text):
    """Checks plagiarism using SBERT (Semantic Similarity).

    Both texts are encoded with the ``paraphrase-MiniLM-L6-v2`` sentence
    embedding model and the cosine similarity of the two embeddings is
    returned as a Python float.

    Args:
        input_text: Text of the document being checked.
        database_text: Text of the reference document.

    Returns:
        Cosine similarity between the two document embeddings.
    """
    model = _get_sbert_model('paraphrase-MiniLM-L6-v2')

    # NOTE(review): each document is encoded as a single input. Sentence
    # embedding models have a maximum sequence length, so long documents are
    # presumably truncated and only their beginning is compared -- confirm
    # against ``model.max_seq_length`` before trusting scores on long texts.
    embeddings = model.encode([database_text, input_text], convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
    return similarity_score

def check_plagiarism(input_text, database_text, tfidf_weight=0.5, semantic_weight=0.5):
    """Combines TF-IDF cosine similarity and semantic similarity into one score.

    Args:
        input_text: Text of the document being checked.
        database_text: Text of the reference document.
        tfidf_weight: Relative weight of the TF-IDF (lexical) score.
        semantic_weight: Relative weight of the SBERT (semantic) score.

    Returns:
        The weighted average of the two similarity scores. With the default
        weights this is ``0.5 * tfidf + 0.5 * semantic``.

    Raises:
        ValueError: If a weight is negative or both weights are zero.
    """
    total_weight = tfidf_weight + semantic_weight
    if tfidf_weight < 0 or semantic_weight < 0 or total_weight <= 0:
        raise ValueError("Weights must be non-negative and must not both be zero.")

    tfidf_score = check_tfidf_cosine_similarity(input_text, database_text)
    semantic_score = check_semantic_similarity(input_text, database_text)

    # Dividing by the weight sum keeps the result a true weighted average even
    # when the caller passes weights that do not add up to 1.
    weighted_sum = (tfidf_weight * tfidf_score) + (semantic_weight * semantic_score)
    return weighted_sum / total_weight

def check_grammar_mistakes(text):
    """Counts grammar mistakes in the text using language_tool_python.

    Args:
        text: The text to check with the ``en-US`` LanguageTool rules.

    Returns:
        The number of matches (reported issues) LanguageTool finds in ``text``.
    """
    tool = language_tool_python.LanguageTool('en-US')
    try:
        return len(tool.check(text))
    finally:
        # LanguageTool runs a background Java server; close it explicitly so
        # the process is not left running until garbage collection.
        tool.close()

# Loaded (tokenizer, model) pairs keyed by model name, so the detector weights
# are only read once per kernel instead of on every call.
_AI_DETECTOR_CACHE = {}


def _load_ai_detector(model_name):
    """Returns the cached ``(tokenizer, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _AI_DETECTOR_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        _AI_DETECTOR_CACHE[model_name] = (tokenizer, model)
    return _AI_DETECTOR_CACHE[model_name]


def _fake_label_index(model, default=0):
    """Returns the class index labelled "Fake" in the model config, or ``default`` if none is."""
    id2label = getattr(model.config, "id2label", None) or {}
    for index, label in id2label.items():
        if str(label).strip().lower() == "fake":
            return int(index)
    return default


def check_ai_generated(input_text, grammar_penalty=0.01):
    """Uses a RoBERTa-based model to detect AI-generated text and adjusts based on grammar mistakes.

    Args:
        input_text: Text to classify. It is truncated to 512 tokens for the
            classifier; the grammar check below runs on the full text.
        grammar_penalty: Amount subtracted from the AI probability for each
            grammar mistake found (more mistakes -> lower AI probability).

    Returns:
        The adjusted AI-generated probability as a float, floored at 0.0.
    """
    tokenizer, model = _load_ai_detector("roberta-base-openai-detector")

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    # A single text is passed in, so row 0 holds its class probabilities.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]

    # The machine-generated class is looked up by name rather than hard-coded.
    # NOTE(review): the upstream OpenAI GPT-2 output detector orders its
    # classes (fake, real), so index 1 would be the *human-written*
    # probability; index 0 is therefore the fallback. Confirm against
    # ``model.config.id2label`` for this checkpoint.
    ai_score = probabilities[_fake_label_index(model)].item()

    # The penalty is per mistake and not normalised by text length, so long
    # documents with many mistakes can drive the score all the way to 0.
    grammar_mistakes = check_grammar_mistakes(input_text)
    adjusted_score = max(0.0, ai_score - (grammar_penalty * grammar_mistakes))

    return adjusted_score

if __name__ == "__main__":
    # Interactive entry point: ask for the two file paths, then score the
    # second document against the first.
    database_file = input("Enter the path of the database document: ").strip()
    input_file = input("Enter the path of the document to check: ").strip()

    database_text = load_document(database_file)
    input_text = load_document(input_file)

    # load_document() returns None (after printing its own error) when a file
    # is missing, so only continue when both documents were actually read.
    if database_text is not None and input_text is not None:
        if not database_text.strip() or not input_text.strip():
            # Blank documents cannot be scored; report it instead of exiting silently.
            print("Error: Both documents must contain text.")
        else:
            plagiarism_score = check_plagiarism(input_text, database_text)
            ai_generated_score = check_ai_generated(input_text)

            print("\nPlagiarism Result:")
            print(f"Final Plagiarism Score (TF-IDF + Semantic): {plagiarism_score * 100:.2f}%")

            print("\nAI Detection Result:")
            print(f"AI-generated confidence score: {ai_generated_score * 100:.2f}%")

Enter the path of the database document: /content/shakespeare.txt.txt
Enter the path of the document to check: /content/manKa.txt


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading LanguageTool 6.5: 100%|██████████| 248M/248M [00:04<00:00, 54.6MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp20zbfk1z.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-6.5.zip to /root/


Plagiarism Result:
Final Plagiarism Score (TF-IDF + Semantic): 37.68%

AI Detection Result:
AI-generated confidence score: 0.00%


In [2]:
!pip install language_tool_python


Collecting language_tool_python
  Downloading language_tool_python-2.9.0-py3-none-any.whl.metadata (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading language_tool_python-2.9.0-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: language_tool_python
Successfully installed language_tool_python-2.9.0
