In [1]:
import os
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from sklearn.metrics import confusion_matrix

# Make sure the NLTK sentence-tokenizer data is available (downloaded only once).
# Newer NLTK releases load the 'punkt_tab' resource inside sent_tokenize, while
# older releases load 'punkt', so check each one and fetch whichever is missing.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)

# === PART 1: Read Text File ===
def read_text_file(file_path):
    """Read a UTF-8 text file and return its contents.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a string, or None when ``file_path`` does not
        refer to an existing regular file (a message is printed in that case).
    """
    # isfile (rather than exists) also rejects directories, which would
    # otherwise make open() raise instead of reporting the problem.
    if not os.path.isfile(file_path):
        print("❌ File not found:", file_path)
        return None
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# === PART 2: Sentence Tokenization ===
def get_sentences(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    If the tokenizer data is missing, it is downloaded and the call is
    retried once.

    Args:
        text: The text to split.

    Returns:
        The list of sentences produced by ``sent_tokenize``.
    """
    try:
        return sent_tokenize(text)
    except LookupError:
        # Newer NLTK releases need 'punkt_tab' while older ones need 'punkt';
        # fetching only one of them can leave the retry failing the same way.
        for resource in ("punkt", "punkt_tab"):
            nltk.download(resource)
        return sent_tokenize(text)

# === PART 3: Plagiarism Checker (TF-IDF + Cosine Similarity) ===
def check_plagiarism(sentences, threshold=0.8):
    """Find pairs of sentences whose TF-IDF cosine similarity reaches a threshold.

    Args:
        sentences: Sequence of sentence strings to compare with each other.
        threshold: Minimum cosine similarity (inclusive) for a pair to be reported.

    Returns:
        A list of ``(i, j, similarity)`` tuples with ``i < j``, ordered by ``i``
        and then ``j``. Empty when fewer than two sentences are given.
    """
    n = len(sentences)
    # With fewer than two sentences there is no pair to compare, and an empty
    # list would make TfidfVectorizer raise on an empty vocabulary.
    if n < 2:
        return []

    tfidf = TfidfVectorizer().fit_transform(sentences)
    cosine_sim = cosine_similarity(tfidf, tfidf)

    # Only the strict upper triangle is scanned: the matrix is symmetric and
    # the diagonal compares each sentence with itself.
    rows, cols = np.triu_indices(n, k=1)
    scores = cosine_sim[rows, cols]
    hits = scores >= threshold
    return [
        (int(i), int(j), float(score))
        for i, j, score in zip(rows[hits], cols[hits], scores[hits])
    ]

# === PART 4: Accuracy vs Iterations Graph ===
def plot_accuracy_graph(seed=None):
    """Plot, save, and show a placeholder "accuracy vs iterations" curve.

    The accuracy values are random dummy data, not measurements.

    Args:
        seed: Optional seed for the dummy values. When None (the default) the
            values come from NumPy's global random state, as before; passing
            an integer makes the figure reproducible.

    Side effects:
        Writes ``accuracy_graph.png`` to the current directory and shows the figure.
    """
    iterations = np.arange(1, 11)
    rng = np.random if seed is None else np.random.default_rng(seed)
    accuracy = rng.uniform(0.7, 0.95, size=10)  # Dummy accuracy values

    fig, ax = plt.subplots()
    ax.plot(iterations, accuracy, marker='o')
    ax.set_title("Accuracy vs Iterations")
    ax.set_xlabel("Iterations")
    ax.set_ylabel("Accuracy")
    ax.grid(True)
    fig.savefig("accuracy_graph.png")
    plt.show()

# === PART 5: Confusion Matrix Heatmap (Dummy Data) ===
def plot_confusion_matrix():
    """Draw, save, and show a heatmap of a confusion matrix built from fixed dummy labels.

    Side effects:
        Writes ``confusion_matrix_heatmap.png`` to the current directory and
        shows the figure.
    """
    class_ids = [0, 1, 2]
    actual = [0, 1, 2, 2, 0, 1, 2, 0, 1, 1]
    predicted = [0, 2, 2, 2, 0, 0, 2, 0, 1, 1]
    matrix = confusion_matrix(actual, predicted, labels=class_ids)

    fig, ax = plt.subplots()
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_ids,
        yticklabels=class_ids,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix Heatmap')
    fig.savefig("confusion_matrix_heatmap.png")
    plt.show()

# === MAIN DRIVER ===
if __name__ == "__main__":
    # Driver: read the paper, report sentence pairs with high TF-IDF cosine
    # similarity, then draw the two demo figures.
    file_path = "your_paper.txt"  # Change this to your paper path

    text = read_text_file(file_path)
    # read_text_file returns None when the file cannot be found. Comparing
    # against None (instead of truthiness) means an empty file is still
    # reported below rather than being skipped without any message.
    if text is not None:
        sentences = get_sentences(text)

        if len(sentences) > 1:
            plagiarized = check_plagiarism(sentences)

            if plagiarized:
                print("🔍 Plagiarized Sentence Pairs:")
                for i, j, sim in plagiarized:
                    print(f"Sentence {i} & {j} - Similarity: {sim:.2f}")
            else:
                print("✅ No plagiarized pairs found.")
        else:
            print("⚠️ Not enough sentences for plagiarism check.")

        # Plot graphs
        plot_accuracy_graph()
        plot_confusion_matrix()


❌ File not found: your_paper.txt


In [2]:
pip install PyPDF2 nltk googlesearch-python


Collecting googlesearch-python
  Downloading googlesearch_python-1.3.0-py3-none-any.whl.metadata (3.4 kB)
Downloading googlesearch_python-1.3.0-py3-none-any.whl (5.6 kB)
Installing collected packages: googlesearch-python
Successfully installed googlesearch-python-1.3.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import fitz  # PyMuPDF
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load PDF and extract text
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Any exception raised while opening or reading the document is printed
    and an empty string is returned instead.
    """
    try:
        with fitz.open(pdf_path) as pdf:
            return "".join(page.get_text() for page in pdf)
    except Exception as err:
        print("❌ Error reading PDF:", err)
        return ""

# Split text into sentences using regex
def split_into_sentences(text):
    """Split ``text`` into sentences with a simple regex heuristic.

    A boundary is whitespace that follows '.', '?' or '!' and precedes an
    uppercase ASCII letter. Each piece is stripped, and pieces of ten
    characters or fewer are dropped.
    """
    boundary = re.compile(r'(?<=[.?!])\s+(?=[A-Z])')
    kept = []
    for piece in boundary.split(text):
        candidate = piece.strip()
        # Filter short sentences
        if len(candidate) > 10:
            kept.append(candidate)
    return kept

# Plagiarism check function using cosine similarity
def check_plagiarism(sentences, threshold=0.8):
    """Report sentence pairs whose TF-IDF cosine similarity is at least ``threshold``.

    NOTE: this redefines the function of the same name from the first cell;
    whichever cell ran last determines the definition in use.

    Args:
        sentences: Sequence of sentence strings to compare pairwise.
        threshold: Inclusive lower bound on cosine similarity for a reported pair.

    Returns:
        A list of ``(i, j, similarity_score)`` tuples with ``i < j``, in
        row-major order. Empty when fewer than two sentences are supplied.
    """
    # Nothing to compare with fewer than two sentences; an empty list would
    # also make TfidfVectorizer raise on an empty vocabulary.
    if len(sentences) < 2:
        return []

    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # k=1 keeps only entries above the diagonal, so each unordered pair is
    # visited once and self-comparisons are excluded.
    above_threshold = np.triu(similarity_matrix >= threshold, k=1)
    return [
        (int(i), int(j), float(similarity_matrix[i, j]))
        for i, j in np.argwhere(above_threshold)
    ]

# Main function
def main(pdf_path=None, threshold=0.8):
    """Extract text from a PDF and print sentence pairs that look near-duplicate.

    Args:
        pdf_path: Path of the PDF to analyse. When None, the notebook's
            original hard-coded path is used so that ``main()`` behaves as before.
        threshold: Minimum cosine similarity for a pair to be reported.

    Returns:
        None. All results are printed.
    """
    if pdf_path is None:
        pdf_path = r"C:\Users\sagni\Downloads\Band Selection\paper\IIIT Paper.pdf"
    print("📄 Reading PDF:", pdf_path)

    text = extract_text_from_pdf(pdf_path)
    if not text:
        print("⚠️ No text extracted from the PDF.")
        return

    sentences = split_into_sentences(text)
    print(f"📝 Total sentences extracted: {len(sentences)}")

    if len(sentences) < 2:
        print("⚠️ Not enough sentences to compare for plagiarism.")
        return

    print("🔍 Checking for similar (possibly plagiarized) sentences...")
    results = check_plagiarism(sentences, threshold)

    if not results:
        print("✅ No plagiarism detected above threshold.")
        return

    print(f"⚠️ Detected {len(results)} possibly plagiarized sentence pairs:")
    for i, j, score in results:
        # Indices are shown 1-based for readability.
        print(f"\n🔁 Similarity Score: {score:.2f}")
        print(f"Sentence {i + 1}: {sentences[i]}")
        print(f"Sentence {j + 1}: {sentences[j]}")

# Run the PDF similarity check only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


📄 Reading PDF: C:\Users\sagni\Downloads\Band Selection\paper\IIIT Paper.pdf
📝 Total sentences extracted: 113
🔍 Checking for similar (possibly plagiarized) sentences...
✅ No plagiarism detected above threshold.
