<a href="https://colab.research.google.com/github/rijulvohra04/File-sharing-system/blob/main/Plagiarism_Checker_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pandas numpy scikit-learn nltk



In [None]:
from google.colab import files
uploaded = files.upload()

Saving mit-plagairism-detection-dataset.zip to mit-plagairism-detection-dataset.zip


In [None]:
!unzip mit-plagiarism-detection-dataset.zip

unzip:  cannot find or open mit-plagiarism-detection-dataset.zip, mit-plagiarism-detection-dataset.zip.zip or mit-plagiarism-detection-dataset.zip.ZIP.


In [4]:
import pandas as pd

# Path to the dataset
dataset_path = r"/content/mit-plagairism-detection-dataset.zip"

df = pd.read_csv(dataset_path, sep='\t', header=0)
df.columns = ['label', 'source_text', 'suspicious_text']
print(df.head())

                                               label  \
0  A person on a horse jumps over a broken down a...   
1              Children smiling and waving at camera   
2              Children smiling and waving at camera   
3  A boy is jumping on skateboard in the middle o...   
4  A boy is jumping on skateboard in the middle o...   

                           source_text  suspicious_text  
0    A person is outdoors, on a horse.                1  
1           There are children present                1  
2                The kids are frowning                0  
3    The boy skates down the sidewalk.                0  
4  The boy does a skateboarding trick.                1  


In [5]:
df = df.dropna(subset=['source_text', 'suspicious_text'])

In [6]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

# Define a function to preprocess text
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    else:
        return ''

# Apply preprocessing to source and suspicious texts
df['source_text'] = df['source_text'].apply(preprocess_text)
df['suspicious_text'] = df['suspicious_text'].apply(preprocess_text)

# Display the preprocessed data
print(df[['source_text', 'suspicious_text']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


               source_text suspicious_text
0    person outdoors horse                
1         children present                
2            kids frowning                
3      boy skates sidewalk                
4  boy skateboarding trick                


In [7]:
# Check the columns in the DataFrame
print("Columns in df:", df.columns)

Columns in df: Index(['label', 'source_text', 'suspicious_text'], dtype='object')


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix

tfidf = TfidfVectorizer(max_features=5000)
combined_corpus = df['source_text'].tolist() + df['suspicious_text'].tolist()
tfidf.fit(combined_corpus)

source_tfidf = tfidf.transform(df['source_text'])
suspicious_tfidf = tfidf.transform(df['suspicious_text'])

dot_product = (source_tfidf.multiply(suspicious_tfidf)).sum(axis=1)

source_norm = np.sqrt((source_tfidf.multiply(source_tfidf)).sum(axis=1))
suspicious_norm = np.sqrt((suspicious_tfidf.multiply(suspicious_tfidf)).sum(axis=1))

epsilon = 1e-10
source_norm = np.array(source_norm).flatten()
suspicious_norm = np.array(suspicious_norm).flatten()

similarities = dot_product / (source_norm * suspicious_norm + epsilon)[:, None]

# Convert to a 1D array and handle any potential NaN values
similarities = np.array(similarities).flatten()
similarities = np.nan_to_num(similarities, 0.0)

df['similarity'] = similarities
print(df[['source_text', 'suspicious_text', 'similarity', 'label']].head())

               source_text suspicious_text  similarity  \
0    person outdoors horse                         0.0   
1         children present                         0.0   
2            kids frowning                         0.0   
3      boy skates sidewalk                         0.0   
4  boy skateboarding trick                         0.0   

                                               label  
0  A person on a horse jumps over a broken down a...  
1              Children smiling and waving at camera  
2              Children smiling and waving at camera  
3  A boy is jumping on skateboard in the middle o...  
4  A boy is jumping on skateboard in the middle o...  


In [9]:
print("Columns in df after adding similarity:", df.columns)
print(df[['source_text', 'suspicious_text', 'similarity', 'label']].head())

Columns in df after adding similarity: Index(['label', 'source_text', 'suspicious_text', 'similarity'], dtype='object')
               source_text suspicious_text  similarity  \
0    person outdoors horse                         0.0   
1         children present                         0.0   
2            kids frowning                         0.0   
3      boy skates sidewalk                         0.0   
4  boy skateboarding trick                         0.0   

                                               label  
0  A person on a horse jumps over a broken down a...  
1              Children smiling and waving at camera  
2              Children smiling and waving at camera  
3  A boy is jumping on skateboard in the middle o...  
4  A boy is jumping on skateboard in the middle o...  


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X = df[['similarity']]
y = df['label']  # Labels (1 for plagiarism, 0 for non-plagiarism)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
# Training accuracy
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Testing accuracy
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Testing Accuracy: {test_accuracy:.2f}")