<a href="https://colab.research.google.com/github/ronaksingh27/IR_assgn5_pb2/blob/master/IR_assignment5_pb1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install scikit-learn nltk



In [10]:
import os
import random
import math
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import nltk

In [15]:

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:

def preprocess_document(document):
    """Preprocess the text: tokenization, stopword removal, stemming."""
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Tokenize the document
    tokens = word_tokenize(document.lower())

    # Remove stopwords and apply stemming
    processed_tokens = [stemmer.stem(word) for word in tokens if word.isalnum() and word not in stop_words]

    return ' '.join(processed_tokens)


In [6]:

def build_index(corpus_dir):
    """Build a TF-IDF index from a corpus directory."""
    documents = []
    doc_names = []

    for filename in os.listdir(corpus_dir):
        if filename.endswith(".txt"):
            with open(os.path.join(corpus_dir, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                preprocessed_content = preprocess_document(content)
                documents.append(preprocessed_content)
                doc_names.append(filename)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    return tfidf_matrix, vectorizer, doc_names


In [7]:
def plagiarism_check(test_document, tfidf_matrix, vectorizer, doc_names):
    """Check plagiarism by comparing the test document to the corpus."""
    preprocessed_test_doc = preprocess_document(test_document)
    test_vector = vectorizer.transform([preprocessed_test_doc])

    # Calculate cosine similarity
    similarity_scores = cosine_similarity(test_vector, tfidf_matrix).flatten()

    # Rank documents by similarity
    similarity_ranking = sorted(zip(doc_names, similarity_scores), key=lambda x: x[1], reverse=True)

    # Calculate uniqueness as 1 - max similarity score
    max_similarity = max(similarity_scores)
    uniqueness = (1 - max_similarity) * 100

    return uniqueness, similarity_ranking

In [11]:
# Define directory for training documents
corpus_dir = "./corpus"
os.makedirs(corpus_dir, exist_ok=True)

# Sample text content for random document generation
sample_sentences = [
    "Artificial intelligence is transforming the world in various domains.",
    "Machine learning algorithms can detect patterns in data efficiently.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning models are inspired by the structure and function of the brain.",
    "Big data analytics provides insights into complex systems and trends.",
    "Computer vision allows machines to interpret and process visual data.",
    "Cloud computing offers scalable resources for software development.",
    "Cybersecurity measures are crucial in protecting sensitive information.",
    "Robotics combines mechanical engineering and artificial intelligence.",
    "Blockchain technology ensures transparency and security in transactions."
]

def generate_random_document(num_sentences=5):
    """Generate a random document with a specified number of sentences."""
    return " ".join(random.choices(sample_sentences, k=num_sentences))

# Generate random documents
for i in range(1, 6):  # Create 5 random documents
    document_content = generate_random_document()
    file_path = os.path.join(corpus_dir, f"doc{i}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(document_content)

print(f"Random documents have been generated in the '{corpus_dir}' directory.")


Random documents have been generated in the './corpus' directory.


In [18]:

def main():

    print("hello world")

    # Define paths
    corpus_dir = "./corpus"  # Directory containing training documents
    test_file = "./tests.txt"  # Test document path

    # Build index
    print("Building index from training corpus...")
    tfidf_matrix, vectorizer, doc_names = build_index(corpus_dir)

    # Load and preprocess test document
    with open(test_file, 'r', encoding='utf-8') as file:
        test_document = file.read()

    # Perform plagiarism check
    uniqueness, similarity_ranking = plagiarism_check(test_document, tfidf_matrix, vectorizer, doc_names)

    # Output results
    print(f"Uniqueness of the test document: {uniqueness:.2f}%")
    print("\nSimilarity ranking:")
    for rank, (doc_name, score) in enumerate(similarity_ranking, 1):
        print(f"{rank}. {doc_name}: Similarity Score = {score:.4f}")

if __name__ == "__main__":
    main()


hello world
Building index from training corpus...
Uniqueness of the test document: -0.00%

Similarity ranking:
1. doc1.txt: Similarity Score = 1.0000
2. doc3.txt: Similarity Score = 0.3878
3. doc2.txt: Similarity Score = 0.3502
4. doc4.txt: Similarity Score = 0.3342
5. doc5.txt: Similarity Score = 0.3042
