<a href="https://colab.research.google.com/github/ranishrocks/cs367-ai-lab/blob/main/lab%202/%20plag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import heapq
import string
import numpy as np
import re


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def preprocess_text(doc):
    """Split a document into normalised sentences.

    The text is cut at sentence-ending punctuation (``.``, ``!`` or ``?``);
    each piece is stripped of surrounding whitespace and lower-cased, and
    pieces that are empty after stripping are dropped.

    Args:
        doc: The raw document text.

    Returns:
        A list of non-empty, lower-cased sentence strings without their
        terminating punctuation. An empty document yields an empty list.
    """
    # Splitting on '.' alone glued questions and exclamations onto the
    # following sentence. Runs of terminators ("...", "?!") count as a single
    # boundary. Abbreviations and decimals ("e.g.", "3.14") are still split,
    # as they were before.
    pieces = re.split(r'[.!?]+', doc)
    return [piece.strip().lower() for piece in pieces if piece.strip()]

def calculate_cosine_similarity(doc1_sentences, doc2_sentences):
    """Compute pairwise TF-IDF cosine similarity between two sentence lists.

    A single ``TfidfVectorizer`` is fitted on the sentences of both documents
    together, so both sides share one vocabulary and one set of IDF weights.

    Args:
        doc1_sentences: List of sentence strings from the first document.
        doc2_sentences: List of sentence strings from the second document.

    Returns:
        A 2-D array in which entry ``[i, j]`` is the similarity between
        ``doc1_sentences[i]`` and ``doc2_sentences[j]``. When either list is
        empty the result is an empty ``(len(doc1), len(doc2))`` array.
    """
    count1, count2 = len(doc1_sentences), len(doc2_sentences)

    # With no sentences on one side there is nothing to compare, and the
    # vectorizer / cosine_similarity would raise ValueError on the empty
    # input. Return an empty matrix of the matching shape instead.
    if count1 == 0 or count2 == 0:
        return np.zeros((count1, count2))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(doc1_sentences + doc2_sentences)

    # Rows [0, count1) belong to document 1, the remaining rows to document 2.
    return cosine_similarity(tfidf_matrix[:count1], tfidf_matrix[count1:])


In [3]:
def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    Uses the classic dynamic-programming table in which cell ``[r, c]`` holds
    the distance between the first ``r`` items of ``s1`` and the first ``c``
    items of ``s2``. Insertions, deletions and substitutions each cost 1.

    Args:
        s1: First sequence (indexed item by item and measured with ``len``).
        s2: Second sequence.

    Returns:
        The edit distance, as a NumPy float because the table is a float array.
    """
    rows, cols = len(s1) + 1, len(s2) + 1
    table = np.zeros((rows, cols))

    # Border cells: turning an empty prefix into a prefix of length k costs k.
    table[0, :] = np.arange(cols)
    table[:, 0] = np.arange(rows)

    for r in range(1, rows):
        for c in range(1, cols):
            if s1[r - 1] == s2[c - 1]:
                # Matching items extend the diagonal at no extra cost.
                table[r, c] = table[r - 1, c - 1]
            else:
                # Cheapest of deletion, insertion and substitution.
                table[r, c] = 1 + min(table[r - 1, c], table[r, c - 1], table[r - 1, c - 1])

    return table[-1, -1]


In [4]:
def heuristic(doc1_sentences, doc2_sentences, index1, index2):
    """Estimate the remaining alignment cost from position ``(index1, index2)``.

    The estimate is the absolute difference between how many sentences are
    still unprocessed in each document.

    Args:
        doc1_sentences: Sentences of the first document.
        doc2_sentences: Sentences of the second document.
        index1: Number of sentences of document 1 already consumed.
        index2: Number of sentences of document 2 already consumed.

    Returns:
        ``abs(remaining in doc1 - remaining in doc2)``.
    """
    left_in_doc1, left_in_doc2 = len(doc1_sentences) - index1, len(doc2_sentences) - index2
    return abs(left_in_doc1 - left_in_doc2)


In [5]:
def a_star_search(doc1_sentences, doc2_sentences):
    """Align the sentences of two documents with A* search.

    A search state ``(i, j)`` means the first ``i`` sentences of document 1
    and the first ``j`` sentences of document 2 have been consumed. From a
    state, up to three moves are available:

    * align sentence ``i`` with sentence ``j``
      (cost: ``levenshtein_distance`` of the two sentences),
    * skip sentence ``i`` of document 1 (cost: its length in characters),
    * skip sentence ``j`` of document 2 (cost: its length in characters).

    Args:
        doc1_sentences: Sentences of the first document.
        doc2_sentences: Sentences of the second document.

    Returns:
        A list of ``(i, j, align_cost)`` tuples, in document order, for the
        sentence pairs aligned on the lowest-cost path found. Skipped
        sentences do not appear. The list is empty when both documents are
        empty or when the cheapest path consists of skips only.
    """
    len1, len2 = len(doc1_sentences), len(doc2_sentences)
    goal = (len1, len2)

    # Heap entries: (f = g + h, state, g, tie, parent_state, alignment).
    # `tie` is a unique insertion counter, so the heap never has to compare
    # the trailing parent/alignment fields (which may be None).
    frontier = [(heuristic(doc1_sentences, doc2_sentences, 0, 0), (0, 0), 0, 0, None, None)]
    tie = 1

    # state -> (parent state, alignment produced by the move that reached it;
    # None for a skip move and for the start state). Doubles as the closed set.
    # NOTE(review): closing a state on first pop assumes the heuristic never
    # overestimates a single step, which needs every skipped sentence to have
    # length >= 1 - confirm callers never pass empty sentences.
    came_from = {}

    while frontier:
        _, state, cost_so_far, _, parent, made = heapq.heappop(frontier)

        if state in came_from:
            continue
        came_from[state] = (parent, made)

        if state == goal:
            # Walk the parent pointers back to the start so that only the
            # alignments on the chosen path are reported. The previous
            # version appended to one shared list during expansion and so
            # also returned alignments from abandoned branches.
            alignments = []
            node = state
            while node is not None:
                node, step_alignment = came_from[node]
                if step_alignment is not None:
                    alignments.append(step_alignment)
            alignments.reverse()
            return alignments

        i, j = state
        successors = []

        # Align sentence i with sentence j.
        if i < len1 and j < len2 and (i + 1, j + 1) not in came_from:
            align_cost = levenshtein_distance(doc1_sentences[i], doc2_sentences[j])
            successors.append(((i + 1, j + 1), align_cost, (i, j, align_cost)))

        # Skip a sentence in doc1.
        if i < len1 and (i + 1, j) not in came_from:
            successors.append(((i + 1, j), len(doc1_sentences[i]), None))

        # Skip a sentence in doc2.
        if j < len2 and (i, j + 1) not in came_from:
            successors.append(((i, j + 1), len(doc2_sentences[j]), None))

        for next_state, step_cost, step_alignment in successors:
            next_cost = cost_so_far + step_cost
            priority = next_cost + heuristic(doc1_sentences, doc2_sentences, next_state[0], next_state[1])
            heapq.heappush(frontier, (priority, next_state, next_cost, tie, state, step_alignment))
            tie += 1

    # The goal is reachable through skip moves alone, so this is a safeguard.
    return []


In [17]:
def detect_plagiarism_cosine(doc1_sentences, doc2_sentences, threshold=0.3):
    """Flag sentence pairs whose TF-IDF cosine similarity reaches ``threshold``.

    Every sentence of document 1 is compared with every sentence of
    document 2 using ``calculate_cosine_similarity``.

    Args:
        doc1_sentences: Sentences of the first document.
        doc2_sentences: Sentences of the second document.
        threshold: Minimum similarity (inclusive) for a pair to be reported.

    Returns:
        A list of ``(sentence1, sentence2, similarity)`` tuples, ordered by
        the index in document 1 and then the index in document 2. Empty when
        either document has no sentences.
    """
    # Nothing to compare; this also avoids handing empty input to the
    # similarity computation.
    if len(doc1_sentences) == 0 or len(doc2_sentences) == 0:
        return []

    similarities = calculate_cosine_similarity(doc1_sentences, doc2_sentences)

    return [
        (sentence1, sentence2, similarities[i, j])
        for i, sentence1 in enumerate(doc1_sentences)
        for j, sentence2 in enumerate(doc2_sentences)
        if similarities[i, j] >= threshold
    ]


def detect_plagiarism(alignments, doc1_sentences, doc2_sentences, threshold):
    """Flag aligned sentence pairs whose edit distance is within ``threshold``.

    NOTE(review): the test cells of this notebook call ``detect_plagiarism``
    but no cell defines it, so they fail with ``NameError`` on a fresh kernel.
    This definition is reconstructed from those call sites and their recorded
    outputs (each case unpacks as ``(sentence1, sentence2, cost)``; a cost of
    0 is flagged at threshold 1, a cost of 35 is not flagged at threshold 5).
    Confirm that ``<=`` is the intended comparison.

    Args:
        alignments: Iterable of ``(i, j, cost)`` tuples as returned by
            ``a_star_search``.
        doc1_sentences: Sentences of the first document.
        doc2_sentences: Sentences of the second document.
        threshold: Maximum edit distance (inclusive) for a pair to be reported.

    Returns:
        A list of ``(sentence1, sentence2, cost)`` tuples for the flagged
        alignments, in the order they appear in ``alignments``.
    """
    return [
        (doc1_sentences[i], doc2_sentences[j], cost)
        for i, j, cost in alignments
        if cost <= threshold
    ]


In [11]:
# Test Case 1: Identical Documents
# Both documents hold the same single sentence, so the aligned pair is
# expected to come back with an edit distance of zero.
doc1 = "Climate change is one of the most pressing issues of our time."
doc2 = "Climate change is one of the most pressing issues of our time."

# Sentence-split and normalise each document.
doc1_sentences, doc2_sentences = preprocess_text(doc1), preprocess_text(doc2)

# Align the sentences with A*, then keep the alignments within the threshold.
alignments = a_star_search(doc1_sentences, doc2_sentences)
plagiarism_cases = detect_plagiarism(alignments, doc1_sentences, doc2_sentences, threshold=1)

# Report.
print("Test Case 1: Identical Documents")
if not plagiarism_cases:
    print("No potential plagiarism detected.\n")
else:
    for sentence1, sentence2, cost in plagiarism_cases:
        print(
            f"Aligned Sentence 1: {sentence1}",
            f"Aligned Sentence 2: {sentence2}",
            f"Edit Distance: {cost}\n",
            sep="\n",
        )


Test Case 1: Identical Documents
Aligned Sentence 1: climate change is one of the most pressing issues of our time
Aligned Sentence 2: climate change is one of the most pressing issues of our time
Edit Distance: 0.0



In [12]:
# Test Case 2: Slightly Modified Document
# The second document rewords the first with different vocabulary.
doc1 = "Climate change poses a serious threat to the planet."
doc2 = "Global warming presents a significant risk to Earth."

# Sentence-split and normalise each document.
doc1_sentences, doc2_sentences = preprocess_text(doc1), preprocess_text(doc2)

# Align the sentences with A*, then keep the alignments within the threshold.
alignments = a_star_search(doc1_sentences, doc2_sentences)
plagiarism_cases = detect_plagiarism(alignments, doc1_sentences, doc2_sentences, threshold=5)

# Report. When nothing is flagged, the raw alignments and their costs are
# printed so the distances can still be inspected.
print("Test Case 2: Slightly Modified Document")
if not plagiarism_cases:
    print("No potential plagiarism detected. Here are the alignments with edit distances:")
    for i, j, cost in alignments:
        print(f"Alignment between Document 1 Sentence {i} and Document 2 Sentence {j}: Edit Distance: {cost}")
else:
    for sentence1, sentence2, cost in plagiarism_cases:
        print(
            f"Aligned Sentence 1: {sentence1}",
            f"Aligned Sentence 2: {sentence2}",
            f"Edit Distance: {cost}\n",
            sep="\n",
        )


Test Case 2: Slightly Modified Document
No potential plagiarism detected. Here are the alignments with edit distances:
Alignment between Document 1 Sentence 0 and Document 2 Sentence 0: Edit Distance: 35.0


In [13]:
# Test Case 3: Completely Different Sentences
# The two documents are about unrelated topics.
doc1 = "The moon shines brightly in the night sky."
doc2 = "Photosynthesis is essential for plant growth."

# Sentence-split and normalise each document.
doc1_sentences, doc2_sentences = preprocess_text(doc1), preprocess_text(doc2)

# Align the sentences with A*, then keep the alignments within the threshold.
alignments = a_star_search(doc1_sentences, doc2_sentences)
plagiarism_cases = detect_plagiarism(alignments, doc1_sentences, doc2_sentences, threshold=5)

# Report.
print("Test Case 3: Completely Different Sentences")
if not plagiarism_cases:
    print("No potential plagiarism detected.\n")
else:
    for sentence1, sentence2, cost in plagiarism_cases:
        print(
            f"Aligned Sentence 1: {sentence1}",
            f"Aligned Sentence 2: {sentence2}",
            f"Edit Distance: {cost}\n",
            sep="\n",
        )


Test Case 3: Completely Different Sentences
No potential plagiarism detected.



In [18]:
# Test Case 4: Partial Overlap
# This case uses the TF-IDF cosine-similarity detector rather than the
# A* / edit-distance pipeline.
doc1 = "Artificial Intelligence can improve efficiency in various tasks."
doc2 = "AI technology has the potential to enhance productivity in many areas."

# Sentence-split and normalise each document.
doc1_sentences, doc2_sentences = preprocess_text(doc1), preprocess_text(doc2)

# Flag every sentence pair whose cosine similarity reaches the threshold.
plagiarism_cases = detect_plagiarism_cosine(doc1_sentences, doc2_sentences, threshold=0.3)

# Report.
print("Test Case 4: Partial Overlap")
if not plagiarism_cases:
    print("No potential plagiarism detected.\n")
else:
    for sentence1, sentence2, sim in plagiarism_cases:
        print(
            f"Aligned Sentence 1: {sentence1}",
            f"Aligned Sentence 2: {sentence2}",
            f"Cosine Similarity: {sim}\n",
            sep="\n",
        )


Test Case 4: Partial Overlap
No potential plagiarism detected.

