In [12]:
import nltk
nltk.download('popular')
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [13]:
data=pd.read_csv('/content/dataset.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


In [14]:
data.value_counts('label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,187
1,183


In [15]:
#clean text
def preprocess_text(text):
  #removing punctuation
  text=text.translate(str.maketrans('','',string.punctuation))
  #conver to lower case
  text=text.lower()
  #removing stopwords
  text=[word for word in text.split() if word not in stopwords.words('english')]
  return " ".join(text)

preprocess_text('This is a test for /?/? dummy text,')

'test dummy text'

In [16]:

data['source_text']=data['source_text'].apply(preprocess_text)
data['plagiarized_text']=data['plagiarized_text'].apply(preprocess_text)

In [17]:
#vectorizer
tfidf_vectorizer=TfidfVectorizer()
X=tfidf_vectorizer.fit_transform(data['source_text']+ " " + data['plagiarized_text'])
Y=data['label']

In [18]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)


In [22]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(X_train,Y_train)
Y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score,classification_report

print('accuracy:',accuracy_score(Y_test,Y_pred))
print('classification report:',classification_report(Y_test,Y_pred))
print('confusion:',confusion_matrix(Y_test,Y_pred))

accuracy: 0.8243243243243243
classification report:               precision    recall  f1-score   support

           0       0.79      0.86      0.82        35
           1       0.86      0.79      0.83        39

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74

confusion: [[30  5]
 [ 8 31]]


In [24]:
#using random forest classifier
from sklearn.ensemble import RandomForestClassifier

model=RandomForestClassifier(n_estimators=100,random_state=42)
model.fit(X_train,Y_train)
Y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score,classification_report

print('accuracy:',accuracy_score(Y_test,Y_pred))
print('classification report:',classification_report(Y_test,Y_pred))
print('confusion:',confusion_matrix(Y_test,Y_pred))

accuracy: 0.7972972972972973
classification report:               precision    recall  f1-score   support

           0       0.71      0.97      0.82        35
           1       0.96      0.64      0.77        39

    accuracy                           0.80        74
   macro avg       0.83      0.81      0.79        74
weighted avg       0.84      0.80      0.79        74

confusion: [[34  1]
 [14 25]]


In [26]:
#using naive bayes
from sklearn.naive_bayes import MultinomialNB

model=MultinomialNB()
model.fit(X_train,Y_train)
Y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score,classification_report

print('accuracy:',accuracy_score(Y_test,Y_pred))
print('classification report:',classification_report(Y_test,Y_pred))
print('confusion:',confusion_matrix(Y_test,Y_pred))

accuracy: 0.8648648648648649
classification report:               precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74

confusion: [[30  5]
 [ 5 34]]


In [27]:
#using svm

from sklearn.svm import SVC

model=SVC()
model.fit(X_train,Y_train)
Y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score,classification_report

print('accuracy:',accuracy_score(Y_test,Y_pred))
print('classification report:',classification_report(Y_test,Y_pred))
print('confusion:',confusion_matrix(Y_test,Y_pred))

accuracy: 0.8378378378378378
classification report:               precision    recall  f1-score   support

           0       0.79      0.89      0.84        35
           1       0.89      0.79      0.84        39

    accuracy                           0.84        74
   macro avg       0.84      0.84      0.84        74
weighted avg       0.84      0.84      0.84        74

confusion: [[31  4]
 [ 8 31]]


In [28]:
#saving model and vectorizer
import pickle
pickle.dump(model,open('model.pkl','wb'))
pickle.dump(tfidf_vectorizer,open('tfidf_vectorizer.pkl','wb'))

In [32]:
#loading model and vectorizer
import pickle
import pickle

# Load the model
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

# Load the vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)


In [36]:
#detection system
def detect(input_text):
  input_text=preprocess_text(input_text)
  input_vector=tfidf_vectorizer.transform([input_text])
  prediction=model.predict(input_vector)
  return 'Plagiarism detected' if prediction[0]==1 else 'Plagiarism not detected'



In [37]:
#input
input_text= "	Researchers have discovered a new species of butterfly in the Amazon rainforest."
detect(input_text)

'Plagiarism detected'

In [38]:
#input
input_text= "	Data science often relies on Python as a widely used programming language."
detect(input_text)

'Plagiarism not detected'