Group No - 43 \
Topic - Plagiarism Checker

# Experiment No - 03

In [None]:
! pip install pyspellchecker
! pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## importing libraries

In [None]:
import pandas as pd
import re
import unicodedata
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from spellchecker import SpellChecker
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

## preparing dataset

In [None]:
# Read the first 587 rows, keep only the headline and body columns, and
# expose the body under the name `article`.
df = (
    pd.read_csv('drive/MyDrive/datasets/articles1.csv', nrows=587)
    [['title', 'content']]
    .rename(columns={'content': 'article'})
)
df.head()

Unnamed: 0,title,article
0,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...
1,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood..."
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri..."
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t..."
4,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ..."


## preprocessing data

In [None]:
from drive.MyDrive.datasets.contradictions import contractions_dict
# helper functions

def expand_contradiction(article):
  """Replace every contraction found in `article` with its expanded form.

  Contractions and their expansions are read from the module-level
  `contractions_dict` (contraction -> expansion).

  Args:
    article: text to process.

  Returns:
    The text with each matched contraction substituted. The text is returned
    unchanged when `contractions_dict` is empty.
  """
  if not contractions_dict:
    # An empty alternation matches the empty string at every position and the
    # lookup below would then fail with KeyError('').
    return article

  # Longest keys first: a regex alternation takes the first alternative that
  # matches, so a short key must not shadow a longer key that starts with it.
  # Keys are escaped so they are always matched literally.
  keys_longest_first = sorted(contractions_dict, key=len, reverse=True)
  contradiction_re = re.compile('|'.join(re.escape(key) for key in keys_longest_first))
  return contradiction_re.sub(lambda match: contractions_dict[match.group(0)], article)


def remove_punctuation(article):
  """Strip punctuation from `article`.

  Removes every character of `string.punctuation` plus the typographic
  quotes (both single and double, opening and closing) and the em dash.
  Newlines are turned into a space instead of being deleted, so the last
  word of one line is not glued to the first word of the next.

  Args:
    article: text to process.

  Returns:
    The text without punctuation; runs of spaces are left in place.
  """
  punctuation = string.punctuation + '‘' + '’' + '“' + '”' + '—'
  # maketrans(x, y, z): characters of x map to y, characters of z are deleted.
  return article.translate(str.maketrans('\n', ' ', punctuation))


def remove_emoji(article):
  """Delete emoji and pictographic symbols from `article`.

  Only emoji/symbol code-point ranges are listed. A single wide range
  (U+24C2 to U+1F251) is deliberately avoided because it also spans CJK,
  Hangul and most other non-Latin scripts and would delete ordinary text.

  Args:
    article: text to process.

  Returns:
    The text with every run of listed code points removed.
  """
  emoji_pattern = re.compile("["
                             "\U0001F600-\U0001F64F"  # emoticons
                             "\U0001F300-\U0001F5FF"  # symbols & pictographs
                             "\U0001F680-\U0001F6FF"  # transport & map symbols
                             "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
                             "\U00002702-\U000027B0"  # dingbats
                             "\U00002600-\U000026FF"  # miscellaneous symbols
                             "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                             "\U0001F200-\U0001F251"  # enclosed ideographic supplement
                             "\U000024C2"             # circled latin capital letter M
                             "\U0000FE0F"             # emoji variation selector
                             "]+", flags=re.UNICODE)
  return emoji_pattern.sub(r'', article)


def correct_spellings(article):
  """Replace words the spell checker does not know with its best suggestion.

  Args:
    article: text to process; it is split on whitespace.

  Returns:
    The words joined by single spaces. A word is kept as written when the
    checker knows it, or when the checker has no suggestion for it.
  """
  spell = SpellChecker()
  words = article.split()
  misspelled_words = spell.unknown(words)

  # word -> replacement, so each distinct word is sent to the checker once.
  replacements = {}
  corrected_words = []
  for word in words:
    if word in misspelled_words:
      if word not in replacements:
        suggestion = spell.correction(word)
        # correction() may return None; str(None) would insert the literal
        # text "None" into the article, so fall back to the original word.
        replacements[word] = word if suggestion is None else str(suggestion)
      corrected_words.append(replacements[word])
    else:
      corrected_words.append(str(word))
  return ' '.join(corrected_words)


# English stop words as a set, so the per-word membership test in
# preprocessing is a hash lookup.
STOPWORDS = {*stopwords.words('english')}

In [None]:
def preprocessing(articles):
  """Clean a Series of raw article texts.

  Each article is lower-cased; HTML tags, e-mail addresses and URLs are
  removed; contractions are expanded; punctuation and emoji are stripped;
  repeated spaces are collapsed; and stop words are dropped.

  Args:
    articles: pandas Series of article texts. Missing values are treated as
      empty articles.

  Returns:
    A pandas Series of cleaned texts with the same index.
  """
  def clean(text):
    text = text.lower()
    text = re.sub('<.*?>', '', text)                      # HTML tags
    text = re.sub(r'\S+@\S+', '', text)                   # e-mail addresses
    # No spaces around the '|': with them the pattern only matched a URL
    # followed by a space, so a URL at the end of the text was kept.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)     # URLs
    text = expand_contradiction(text)
    text = remove_punctuation(text)
    text = remove_emoji(text)
    text = re.sub(' +', ' ', text).strip()
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

  return articles.fillna('').apply(clean)


# Stemming
def stem_words(article):
  """Return `article` with every token replaced by its Porter stem.

  The text is split with `word_tokenize`, each token is stemmed, and the
  stems are joined with single spaces.
  """
  porter = PorterStemmer()
  stems = map(porter.stem, word_tokenize(article))
  return ' '.join(stems)


# Lemmatization
def lemmatize_words(article):
  """Return `article` with every token replaced by its WordNet lemma.

  Tokens are POS-tagged with `nltk.pos_tag` and the first letter of each tag
  selects the part of speech handed to the lemmatizer. Adjective tags start
  with 'J' in the Penn Treebank tag set while WordNet calls adjectives 'a',
  so the mapping is explicit; any other tag is lemmatized as a noun.

  Args:
    article: text to process.

  Returns:
    The lemmas joined by single spaces.
  """
  # First letter of the tagger's tag -> WordNet part-of-speech code.
  tag_to_wordnet = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
  lemmatizer = WordNetLemmatizer()
  tagged_tokens = nltk.pos_tag(word_tokenize(article))
  return ' '.join(
      lemmatizer.lemmatize(word, tag_to_wordnet.get(tag[:1], 'n'))
      for word, tag in tagged_tokens
  )

In [None]:
# Clean the raw text once, derive the stemmed and lemmatized variants from
# the cleaned column, then persist everything and preview a few rows.
df['article_preprocessed'] = preprocessing(df['article'])

for column, transform in (
    ('article_stemmed', stem_words),
    ('article_lemmatized', lemmatize_words),
):
  df[column] = df['article_preprocessed'].apply(transform)

df.to_csv('drive/MyDrive/datasets/articles.csv', index=False)
df.sample(5)

Unnamed: 0,title,article,article_preprocessed,article_stemmed,article_lemmatized
485,"Trump Inspires Cheers, and Alarm, Around the W...","SOFIA, Bulgaria — The inaugural meeting of ...",sofia bulgaria inaugural meeting european trum...,sofia bulgaria inaugur meet european trump soc...,sofia bulgaria inaugural meet european trump s...
279,"As Trump Berates News Media, a New Strategy Is...","Well, that sure escalated quickly. “That” was ...",well sure escalated quickly donald j trumps in...,well sure escal quickli donald j trump inaugur...,well sure escalate quickly donald j trump inau...
252,6 Volkswagen Executives Charged as Company Ple...,WASHINGTON — Federal prosecutors announced ...,washington federal prosecutors announced crimi...,washington feder prosecutor announc crimin cha...,washington federal prosecutor announce crimina...
295,Should Ivanka Trump the Woman Wear Ivanka Trum...,A few hours after her father’s news conference...,hours fathers news conference wednesday trump ...,hour father news confer wednesday trump tower ...,hour father news conference wednesday trump to...
90,Trump Said ‘Torture Works.’ An Echo Is Feared ...,When the United Nations’ top official tried ...,united nations top official tried inspect infa...,unit nation top offici tri inspect infam priso...,united nation top official tried inspect infam...


In [None]:
# `articles` is the preprocessed corpus that the checkers below search.
# `new_article` is the text being checked against it; it is also bound as the
# default `new_article` argument of get_ngram_score.
articles = df['article_preprocessed']
new_article = "mother obliged travel false identities state affairs known among chinese immigrants paper son hope circumventing chinese exclusion act 1882 signed law president chester arthur act drastically curtailed number chinese people allowed enter country among earliest united states laws impose severe restrictions immigration 1906 unforeseen loophole opened form san francisco earthquake fire huge number municipal documents including birth immigration records destroyed many newly arrived chinese capitalized loss maintaining born san francisco fire united states citizens entitled bring relatives case gen yeo father paper sons posing relatives attuned deception united states immigration officials put chinese arrivals formidable inquisition ensure claimed questions came like gunfire direction village face many windows house house rice bin wide well deep trees village lakes shops name sponsoring relative interrogated separately answers match new arrival major mistake series smaller ones could mean deportation stand chance passing aspirants memorized rigorous dossiers known coaching papers ensuing interrogation hard enough adults gen yeo would undergo alone dec 30 1920 month sea wongs landed angel island immigration station elder mr wong traveling merchant named look get son look tai yow angel island considered ellis island west coast lisa see author gold mountain 1995 nonfiction chronicle family said interview 2016 however continued goal really different ellis island supposed welcoming angel island opened specifically keep chinese mr wongs father previously lived united states look get able clear immigration quickly new arrival gen yeo detained island nearly month child among immigrants held scared half death cried mr wong recalled tyrus documentary directed pamela tom premiered 2015 every day miserable miserable hated place jan 27 1921 presence interpreter stenographer young gen yeo posing look tai yow interrogated three inspectors father already questioned gen 
yeo well prepared answered without error sacramento joined father schoolteacher americanized tai yow tyrus known tyrus wong ever soon afterward father son separated elder mr wong moved los angeles seek work reasons lost time could give son tyrus lived sacramento boardinghouse attending elementary school two years later possibly tyrus traveled los angeles join father found work"

## Plagiarism Checker using N-Gram Model

In [None]:
import re
from nltk.util import ngrams, pad_sequence, everygrams
from nltk.tokenize import word_tokenize
from nltk.lm import MLE, WittenBellInterpolated
import numpy as np
import plotly.graph_objects as go
from scipy.ndimage import gaussian_filter

def get_ngram_score(article, new_article = new_article, n = 4):
  """Score each token of `new_article` under an n-gram model of `article`.

  A Witten-Bell interpolated language model of order `n` is fitted on
  `article` alone. Every token of `new_article` is then scored given its
  `n - 1` preceding tokens.

  Args:
    article: text the language model is trained on.
    new_article: text to score. The default is the value the module-level
      `new_article` had when this function was defined.
    n: n-gram order (default 4, the value that used to be hard-coded).

  Returns:
    numpy array with one model score per token of `new_article`.
  """
  def tokenize_and_pad(text):
    # Left-pad with n-1 "<s>" symbols so the first real token has a full context.
    return list(pad_sequence(word_tokenize(text), n,
                             pad_left=True,
                             left_pad_symbol="<s>"))

  # Train on all 1..n-grams of the reference article.
  training_data = tokenize_and_pad(article)
  train_ngrams = list(everygrams(training_data, max_len=n))
  model = WittenBellInterpolated(n)
  model.fit([train_ngrams], vocabulary_text=training_data)

  # Score each token of the candidate text given the n-1 tokens before it.
  testing_data = tokenize_and_pad(new_article)
  scores = [
      model.score(token, testing_data[i:i + n - 1])
      for i, token in enumerate(testing_data[n - 1:])
  ]
  return np.array(scores)

In [None]:
# Find the corpus article whose highest single-token n-gram score against
# `new_article` is the largest; the first such article wins ties.
max_ngram_score = -1
article_idx = -1

for candidate_idx in range(len(articles)):
  candidate_score = max(get_ngram_score(articles[candidate_idx]))
  if not candidate_score > max_ngram_score:
    continue
  max_ngram_score, article_idx = candidate_score, candidate_idx

print("Ngram score :", max_ngram_score)
print("Plagiarised from article no : ", article_idx)

Ngram score : 0.9723873204556711
Plagiarised from article no :  2


In [None]:
# Re-score `new_article` against the best-matching article and draw the
# per-token scores as a heatmap, `width` tokens per row.
train_text = articles[article_idx]

# set ngram number
n = 4

# pad the text and tokenize
training_data = list(pad_sequence(word_tokenize(train_text), n,
                                  pad_left=True,
                                  left_pad_symbol="<s>"))

# generate ngrams; stored as `train_ngrams` so the `ngrams` function imported
# from nltk.util is not overwritten at notebook level
train_ngrams = list(everygrams(training_data, max_len=n))
print("Number of ngrams:", len(train_ngrams))

# build ngram language models
model = WittenBellInterpolated(n)
model.fit([train_ngrams], vocabulary_text=training_data)
print(model.vocab)


test_text = new_article

# Tokenize and pad the text
testing_data = list(pad_sequence(word_tokenize(test_text), n,
                                 pad_left=True,
                                 left_pad_symbol="<s>"))
print("Length of test data:", len(testing_data))

# assign scores: one per real token, given the n-1 tokens before it
scores = []
for i, item in enumerate(testing_data[n-1:]):
    s = model.score(item, testing_data[i:i+n-1])
    scores.append(s)

scores_np = np.array(scores)

# set width and height; the grid holds one cell per score, so the row count
# comes from the number of scores (the padded token list is n-1 longer and
# could add a row that has no label)
width = 8
height = int(np.ceil(len(scores_np) / width))
print("Width, Height:", width, ",", height)

# copy scores to rectangular blank array
a = np.zeros(width*height)
a[:len(scores_np)] = scores_np
diff = len(a) - len(scores_np)   # unused cells at the end of the last row

# apply gaussian smoothing for aesthetics
a = gaussian_filter(a, sigma=1.0)

# reshape to fit rectangle
a = a.reshape(-1, width)

# format labels: one label per row, made of the tokens scored in that row
labels = [" ".join(testing_data[i:i+width]) for i in range(n-1, len(testing_data), width)]
labels_individual = [x.split() for x in labels]
labels_individual[-1] += [""]*diff   # pad the hover data of the last row to full width
labels = [f"{x:60.60}" for x in labels]

# create heatmap
fig = go.Figure(data=go.Heatmap(
                z=a, x0=0, dx=1,
                y=labels, zmin=0, zmax=1,
                customdata=labels_individual,
                hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
                colorscale="burg"))
fig.update_layout({"height":height*28, "width":1000, "font":{"family":"Courier New"}})
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()

Number of ngrams: 5378
<Vocabulary with cutoff=1 unk_label='<UNK>' and 831 items>
Length of test data: 336
Width, Height: 8 , 42





## Plagiarism Checker using TF-IDF Model

In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import MiniBatchKMeans


# tfidf calculation
# Fit a TF-IDF model on the preprocessed articles.
text_content = df['article_preprocessed']

vector = TfidfVectorizer(
    max_df=0.3,            # ignore terms found in more than 30% of the documents
    stop_words='english',  # drop English stop words
    lowercase=True,        # lower-case the text before tokenizing
    use_idf=True,          # weight term frequencies by inverse document frequency
    norm='l2',             # scale every document vector to unit length
    smooth_idf=True,       # add one to document frequencies to avoid division by zero
)
tfidf = vector.fit_transform(text_content)


# Request function : search the top_n articles from a request ( request = string)
def search(tfidf_matrix,model,request, top_n = 5):
    """Return the indices of the `top_n` documents most similar to `request`.

    Similarity is the dot product between the vectorized request and each
    row of `tfidf_matrix`.

    Args:
        tfidf_matrix: sparse document-term matrix, one row per document.
        model: fitted vectorizer; `model.transform([request])` must return a
            sparse row with the same columns as `tfidf_matrix`.
        request: query text.
        top_n: number of results to return (at most the number of documents).

    Returns:
        numpy array of row positions, best match first.
    """
    request_transform = model.transform([request])
    # `@` is the matrix product for sparse operands.
    similarity = request_transform @ tfidf_matrix.T
    scores = similarity.toarray()[0]
    # Sort descending, then cut; max() keeps a negative top_n from slicing
    # off the tail instead of returning nothing.
    return np.argsort(scores)[::-1][:max(top_n, 0)]

# Find similar : get the top_n articles similar to an article 
# def find_similar(tfidf_matrix, index, top_n = 5):
#     cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
#     related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
#     return [index for index in related_docs_indices][0:top_n]    

# Print the result
def print_result(request_content,indices,X):
    """Print the query followed by the id and title of each result.

    Args:
        request_content: query text that was searched for.
        indices: iterable of integer row labels into `X`.
        X: DataFrame with a 'title' column; titles are looked up here rather
            than in a global frame.
    """
    print('\nsearch : ' + request_content)
    print('\nBest Results :')
    for i in indices:
        print('id = {0:5d} - title = {1}'.format(i, X['title'].loc[i]))

In [None]:
# Rank the corpus against the text under test and list the best-matching titles.
result = search(tfidf_matrix=tfidf, model=vector, request=new_article, top_n=5)
print_result(request_content=new_article, indices=result, X=df)


search : mother obliged travel false identities state affairs known among chinese immigrants paper son hope circumventing chinese exclusion act 1882 signed law president chester arthur act drastically curtailed number chinese people allowed enter country among earliest united states laws impose severe restrictions immigration 1906 unforeseen loophole opened form san francisco earthquake fire huge number municipal documents including birth immigration records destroyed many newly arrived chinese capitalized loss maintaining born san francisco fire united states citizens entitled bring relatives case gen yeo father paper sons posing relatives attuned deception united states immigration officials put chinese arrivals formidable inquisition ensure claimed questions came like gunfire direction village face many windows house house rice bin wide well deep trees village lakes shops name sponsoring relative interrogated separately answers match new arrival major mistake series smaller ones co

## Plagiarism Checker using Cosine Similarity

In [None]:
def cosine_similarity(X, Y):
  """Cosine similarity between the token sets of two texts.

  Each text is reduced to its set of distinct tokens, i.e. a binary
  bag-of-words vector, so the result is
  |X ∩ Y| / sqrt(|X| * |Y|).

  Args:
    X: first text.
    Y: second text.

  Returns:
    A float in [0, 1]; 0.0 when either text has no tokens.
  """
  X_set = set(word_tokenize(X))
  Y_set = set(word_tokenize(Y))

  if not X_set or not Y_set:
    # A zero-length vector has no direction; report no similarity instead of
    # dividing by zero.
    return 0.0

  # For 0/1 vectors the dot product is the number of shared tokens and each
  # squared norm is the number of distinct tokens.
  shared = len(X_set & Y_set)
  return shared / float((len(X_set) * len(Y_set)) ** 0.5)

In [None]:
# Find the corpus article with the highest token-set cosine similarity to
# `new_article`; the first such article wins ties.
max_cos_score = -1
idx = -1

for candidate_idx in range(len(articles)):
  candidate_score = cosine_similarity(articles[candidate_idx], new_article)
  if not candidate_score > max_cos_score:
    continue
  max_cos_score, idx = candidate_score, candidate_idx

print("Cosine Similarity score :", max_cos_score)
print("Plagiarised from article no : ", idx)

Cosine Similarity score : 0.5414140464019368
Plagiarised from article no :  2
