In [1]:
import constants
from IPython.display import Markdown, display
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from utils import (
    get_semantic_scholar_id,
    get_pdfs,
    load_data,
    split_text_to_chunks,
    get_paragraphs,
    get_titles_abstracts,
    get_embeddings,
    get_google_results,
    generate_answer,
    get_tldr,
    create_context_chatgpt,
    answer_question_chatgpt,
    get_semantic_scholar_pdf_id,
    controller_id_function,
    get_cosine_similarity,
    get_paper_info
)
import openai
from urllib.parse import urlparse
import requests
from tqdm.notebook import tqdm

# tqdm pandas progress
tqdm.pandas()

In [2]:
# Number of top-ranked text chunks to keep; passed to main() and reused as k when building the prompt.
K = 10
# Research question that drives the search, the query embedding and the final prompt.
query = "do language models plagiarize?"

In [6]:

def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

In [3]:
def extract_paper_ids(df):
    """Add a ``paperId`` column computed from each row's ``link``.

    Every value of ``df["link"]`` is passed through ``controller_id_function``
    and the results are stored in ``df["paperId"]``. The frame is modified in
    place and also returned.
    """
    paper_ids = df["link"].apply(controller_id_function)
    df["paperId"] = paper_ids
    return df


def add_paper_info(df):
    """Add a ``paperInfo`` column by calling ``get_paper_info`` on each ``paperId``.

    The frame is modified in place and also returned.
    """
    paper_infos = df["paperId"].apply(get_paper_info)
    df["paperInfo"] = paper_infos
    return df


def parse_paper_info(df):
    """Flatten the ``paperInfo`` records into one column per metadata field.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``paperId`` and ``paperInfo``. Each
        ``paperInfo`` entry is indexed with the keys ``title``, ``abstract``,
        ``venue``, ``year`` and ``openAccessPdf``.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the columns ``title``, ``abstract``,
        ``venue``, ``year``, ``openAccessPdf`` and ``paperId`` (in that
        order). The input frame is not modified.

    Notes
    -----
    The result is built on an explicit copy rather than a column-subset
    slice, so later pipeline steps can add columns to it without pandas
    raising ``SettingWithCopyWarning``. A record that cannot be indexed
    (for example ``None``) or that lacks a key yields ``None`` for the
    affected field instead of aborting the whole pipeline.
    """
    fields = ["title", "abstract", "venue", "year", "openAccessPdf"]

    def _getter(key):
        # Build a per-field extractor that tolerates missing records/keys.
        def _get(info):
            try:
                return info[key]
            except (KeyError, TypeError):
                return None

        return _get

    # Start from an owned copy of the id column, then place each metadata
    # column in front of it so the final column order matches the contract.
    parsed = df[["paperId"]].copy()
    for position, key in enumerate(fields):
        parsed.insert(position, key, df["paperInfo"].apply(_getter(key)))
    return parsed


def add_pdf_paths(df, folder):
    """Add a ``pdf_paths`` column of the form ``folder + paperId + ".pdf"``.

    ``folder`` is concatenated verbatim (no path separator is inserted), so it
    should already end with one. The frame is modified in place and returned.
    """

    def _pdf_path(paper_id):
        return folder + paper_id + ".pdf"

    df["pdf_paths"] = df["paperId"].apply(_pdf_path)
    return df


def read_pdfs(df):
    """Add a ``text`` column by calling ``load_data`` on every ``pdf_paths`` entry.

    Prints a short status line first. The frame is modified in place and
    also returned.
    """
    print("Reading pdfs...")
    texts = df["pdf_paths"].apply(load_data)
    df["text"] = texts
    return df


def preprocess_text(df):
    """Add a ``paragraphs`` column holding the chunked version of ``text``.

    Rows whose ``text`` is missing (``pd.isna``) get ``None``; every other row
    gets the result of ``split_text_to_chunks``. The frame is modified in
    place and also returned.
    """

    def _chunks_or_none(text):
        if pd.isna(text):
            return None
        return split_text_to_chunks(text)

    df["paragraphs"] = df["text"].apply(_chunks_or_none)
    return df


def get_all_text(df):
    """Return the paragraph records followed by the title/abstract records.

    The two parts come from ``get_paragraphs(df)`` and
    ``get_titles_abstracts(df)`` and are joined with ``+``.
    """
    return get_paragraphs(df) + get_titles_abstracts(df)


def create_text_df(all_text):
    """Wrap the collected records in a DataFrame with ``paperId`` and ``text`` columns."""
    column_names = ["paperId", "text"]
    return pd.DataFrame(data=all_text, columns=column_names)


def add_embeddings(df_text):
    """Add an ``embedding`` column by calling ``get_embeddings`` on each ``text``.

    The frame is modified in place and also returned.
    """
    embeddings = df_text["text"].apply(get_embeddings)
    df_text["embedding"] = embeddings
    return df_text


def get_cosine_similarities(df_text, query_embedding):
    """Add a ``cosine_similarity`` column scoring each row against the query.

    Each ``embedding`` value is compared with ``query_embedding`` through
    ``get_cosine_similarity(query_embedding, embedding)``. The frame is
    modified in place and also returned.
    """

    def _score(embedding):
        return get_cosine_similarity(query_embedding, embedding)

    df_text["cosine_similarity"] = df_text["embedding"].apply(_score)
    return df_text


def sort_by_cosine_similarity(df_text):
    """Return a new frame ordered by ``cosine_similarity``, highest score first."""
    return df_text.sort_values(by="cosine_similarity", ascending=False)


def get_top_results(df_text, K):
    """Return the first ``K`` rows of ``df_text`` as an independent frame.

    Parameters
    ----------
    df_text : pandas.DataFrame
        Frame to truncate; callers are expected to have sorted it already.
    K : int
        Number of leading rows to keep (same semantics as ``DataFrame.head``).

    Returns
    -------
    pandas.DataFrame
        An explicit copy of the leading rows. ``head`` alone returns a slice
        of the parent frame, and adding a column to such a slice (as the next
        pipeline step does) makes pandas emit ``SettingWithCopyWarning``.
    """
    return df_text.head(K).copy()


def add_tldrs(df_text):
    """Add a ``tldr`` column by calling ``get_tldr`` on each ``text``.

    The frame is modified in place and also returned.
    """
    summaries = df_text["text"].apply(get_tldr)
    df_text["tldr"] = summaries
    return df_text


def main(query, K=10, pdf_folder="pdfs/"):
    """Run the retrieval pipeline for ``query`` and return the top ``K`` text chunks.

    Parameters
    ----------
    query : str
        Question used for the search (``get_google_results``) and embedded
        with ``get_embeddings`` to rank the collected text.
    K : int, default 10
        Number of highest-scoring rows to keep.
    pdf_folder : str, default "pdfs/"
        Folder handed to ``get_pdfs`` and used as the prefix for the
        ``pdf_paths`` column. It is concatenated verbatim with the paper id,
        so it should end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The ``K`` rows with the highest ``cosine_similarity``, carrying the
        columns ``paperId``, ``text``, ``embedding``, ``cosine_similarity``
        and ``tldr``.
    """
    # Search results -> paper ids -> paper metadata.
    papers = get_google_results(query)
    papers = extract_paper_ids(papers)
    papers = add_paper_info(papers)
    papers = parse_paper_info(papers)

    # Fetch the PDFs, then read and chunk their text. The same folder must be
    # used for downloading and for building the paths that are read back.
    get_pdfs(papers, pdf_folder)
    papers = add_pdf_paths(papers, pdf_folder)
    papers = read_pdfs(papers)
    papers = preprocess_text(papers)

    # One row per text record (paragraph chunks plus titles/abstracts).
    all_text = get_all_text(papers)
    df_text = create_text_df(all_text)

    # Embed every record and rank it against the embedded query.
    df_text = add_embeddings(df_text)
    query_embedding = get_embeddings(query)
    df_text = get_cosine_similarities(df_text, query_embedding)
    df_text = sort_by_cosine_similarity(df_text)
    df_text = get_top_results(df_text, K)

    # Summarise only the rows that survived the cut.
    df_text = add_tldrs(df_text)
    return df_text


In [4]:
# Run the whole pipeline for the configured query and keep the K highest-scoring text chunks.
df_text = main(query, K)

  0%|          | 0/10 [00:00<?, ?it/s]

Reading pdfs...


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [10]:

# NOTE: this definition replaces the `answer_question_chatgpt` imported from
# `utils` in the first cell for the remainder of the notebook.
def answer_question_chatgpt(
    df,
    question="What is the impact of creatine on cognition?",
    k=5,
    instructions="Instructions: Using the provided web search results, write a comprehensive reply to the given query. If you find a result relevant definitely make sure to cite the result using [[number](URL)] notation after the reference. End your answer with a summary. A\nQuery:",
    max_len=3000,
    debug=False,
):
    """Build the prompt text for ``question`` from the first ``k`` rows of ``df``.

    Despite its name, this function does not query a model: it only assembles
    and returns the prompt string. Pass the result to an answer generator
    (the notebook uses ``generate_answer``) to obtain an actual answer.

    Parameters
    ----------
    df : pandas.DataFrame
        Ranked rows; forwarded to ``create_context_chatgpt``, which reads the
        ``tldr`` and ``paperId`` columns.
    question : str
        The query appended after the instructions.
    k : int, default 5
        Number of leading rows of ``df`` to include as numbered sources.
    instructions : str
        Text placed between the sources and the question.
    max_len : int, default 3000
        Currently unused; kept so existing callers keep working.
    debug : bool, default False
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    str
        ``"<context> \\n\\n<instructions> <question>\\nAnswer:"``.
    """
    context = create_context_chatgpt(question, df, k=k)
    # Plain string formatting; errors while building the context propagate to
    # the caller instead of being replaced by an empty prompt.
    return f"""{context} \n\n{instructions} {question}\nAnswer:"""


def create_context_chatgpt(question, df, k=5):
    """Format the first ``k`` rows of ``df`` as a numbered list of sources.

    Each row becomes ``"[n] <tldr>\\nURL: https://www.semanticscholar.org/paper/<paperId>"``
    with ``n`` counting from 1 in row order; entries are separated by a blank
    line. ``question`` is accepted but not used when building the context.
    """
    base_url = "https://www.semanticscholar.org/paper/"
    entries = []
    # Rows are taken in their existing order; numbering follows that order.
    for number, (_, row) in enumerate(df[:k].iterrows(), start=1):
        entry = "[" + str(number) + "] " + row["tldr"] + "\nURL: " + base_url + row["paperId"]
        entries.append(entry)
    return "\n\n".join(entries)



In [13]:
# Preview the highest-ranked rows of the result.
df_text.head()

Unnamed: 0,paperId,text,embedding,cosine_similarity,tldr
59,79bf58971512598569f3a0113b520a33e5176696,Towards the Exploitation of Statistical Langua...,"[[-0.2042403221130371, 2.177624464035034, 0.12...",0.86597,"In this work, we describe our first attempt to..."
58,fba934b8955022742636b211bf07fc1aada74ed4,Towards the Exploitation of Statistical Langua...,"[[-0.2042403221130371, 2.177624464035034, 0.12...",0.86597,"In this work, we describe our first attempt to..."
33,0c97903e7d85c05bcd2a7e26fc2a4a47998f5dde,##t - 3. it has learned to code ( and blog and...,"[[-0.5793511867523193, 1.6754060983657837, -0....",0.847437,A new study has found that the use of artifici...
15,64e994b6bd61ab6be34543f42bd043b8ebdde22f,", n., schubotz, m., gipp, b. : analyzing seman...","[[-0.27326077222824097, 1.6272714138031006, 0....",0.84691,A hybrid approach to academic plagiarism detec...
13,64e994b6bd61ab6be34543f42bd043b8ebdde22f,"6 ), 112 : 1 { 112 : 42 ( 2019 ). https : / / ...","[[-0.39709630608558655, 2.05199933052063, 0.53...",0.845356,A cross-language plagiarism detection system b...


In [20]:
# The paperId column contains duplicate values: assign one integer code per distinct paperId, so rows that share a paperId share the same code.

def unique_id(df):
    """Add a ``unique_id`` column holding the category code of each ``paperId``.

    Rows with the same ``paperId`` receive the same integer. The frame is
    modified in place and also returned.
    """
    paper_categories = df['paperId'].astype('category')
    df['unique_id'] = paper_categories.cat.codes
    return df

# Adds the unique_id column to df_text in place and displays the returned frame.
# NOTE(review): the saved output also shows `unique_index` and `id` columns that no
# visible cell creates — presumably leftover kernel state; confirm on a fresh run.
unique_id(df_text)

Unnamed: 0,paperId,text,embedding,cosine_similarity,tldr,unique_index,id,unique_id
59,79bf58971512598569f3a0113b520a33e5176696,Towards the Exploitation of Statistical Langua...,"[[-0.2042403221130371, 2.177624464035034, 0.12...",0.86597,"In this work, we describe our first attempt to...",79bf58971512598569f3a0113b520a33e5176696914,79bf58971512598569f3a0113b520a33e5176696,3
58,fba934b8955022742636b211bf07fc1aada74ed4,Towards the Exploitation of Statistical Langua...,"[[-0.2042403221130371, 2.177624464035034, 0.12...",0.86597,"In this work, we describe our first attempt to...",fba934b8955022742636b211bf07fc1aada74ed4682,fba934b8955022742636b211bf07fc1aada74ed4,5
33,0c97903e7d85c05bcd2a7e26fc2a4a47998f5dde,##t - 3. it has learned to code ( and blog and...,"[[-0.5793511867523193, 1.6754060983657837, -0....",0.847437,A new study has found that the use of artifici...,0c97903e7d85c05bcd2a7e26fc2a4a47998f5dde255,0c97903e7d85c05bcd2a7e26fc2a4a47998f5dde,0
15,64e994b6bd61ab6be34543f42bd043b8ebdde22f,", n., schubotz, m., gipp, b. : analyzing seman...","[[-0.27326077222824097, 1.6272714138031006, 0....",0.84691,A hybrid approach to academic plagiarism detec...,64e994b6bd61ab6be34543f42bd043b8ebdde22f148,64e994b6bd61ab6be34543f42bd043b8ebdde22f,2
13,64e994b6bd61ab6be34543f42bd043b8ebdde22f,"6 ), 112 : 1 { 112 : 42 ( 2019 ). https : / / ...","[[-0.39709630608558655, 2.05199933052063, 0.53...",0.845356,A cross-language plagiarism detection system b...,64e994b6bd61ab6be34543f42bd043b8ebdde22f686,64e994b6bd61ab6be34543f42bd043b8ebdde22f,2
9,64e994b6bd61ab6be34543f42bd043b8ebdde22f,"provides over - current pds, we compared it to...","[[-0.2686731815338135, 1.6583670377731323, 0.4...",0.844149,Researchers have developed a machine-to-machin...,64e994b6bd61ab6be34543f42bd043b8ebdde22f518,64e994b6bd61ab6be34543f42bd043b8ebdde22f,2
11,64e994b6bd61ab6be34543f42bd043b8ebdde22f,( 1991 ). https : / / doi. org / 10. 1007 / bf...,"[[-0.09546034783124924, 1.5630769729614258, -0...",0.843557,A new algorithm for automatic plagiarism detec...,64e994b6bd61ab6be34543f42bd043b8ebdde22f409,64e994b6bd61ab6be34543f42bd043b8ebdde22f,2
0,64e994b6bd61ab6be34543f42bd043b8ebdde22f,[CLS] detecting machine - obfuscated plagiaris...,"[[0.23195765912532806, 1.7406513690948486, 0.4...",0.835629,Researchers at the University of Wuppertal and...,64e994b6bd61ab6be34543f42bd043b8ebdde22f538,64e994b6bd61ab6be34543f42bd043b8ebdde22f,2
42,930f526de3b0fc3b7d9fab1bd56f36272ca86b17,##® ( n. d. ). the free online automated parap...,"[[-0.17010152339935303, 1.6247491836547852, 0....",0.832636,Free online paraphrasing tools rely principall...,930f526de3b0fc3b7d9fab1bd56f36272ca86b17671,930f526de3b0fc3b7d9fab1bd56f36272ca86b17,4
62,1936f64ef3d2a4e2e1dc89b40f3f40aecdeaf69b,Using Word Embedding for Cross-Language Plagia...,"[[0.3351580500602722, 1.5683748722076416, 0.88...",0.831746,This paper proposes to use distributed represe...,1936f64ef3d2a4e2e1dc89b40f3f40aecdeaf69b55,1936f64ef3d2a4e2e1dc89b40f3f40aecdeaf69b,1


In [11]:
# Assemble the prompt from the top-K summaries and render it as Markdown for inspection.
# The instructions string passed here is identical to the function's default value.
prompt = answer_question_chatgpt(
    df_text,
    question=query,
    k=K,
    instructions="Instructions: Using the provided web search results, write a comprehensive reply to the given query. If you find a result relevant definitely make sure to cite the result using [[number](URL)] notation after the reference. End your answer with a summary. A\nQuery:",
)
printmd(prompt)

[1] In this work, we describe our first attempt to detect plagiarised segments in a text employing statistical Language Models (LMs) and perplexity. The preliminary experiments carried out on two specialised and literary corpora (including original, part-of-speech and stemmed versions) show that perplexity of a text segment, given a Language Model calculated over an author text, could be a relevant feature in plagiarism detection.
URL: https://www.semanticscholar.org/paper/79bf58971512598569f3a0113b520a33e5176696

[2] In this work, we describe our first attempt to detect plagiarised segments in a text employing statistical Language Models (LMs) and perplexity. The preliminary experiments carried out on two specialised and literary corpora (including original, part-of-speech and stemmed versions) show that perplexity of a text segment, given a Language Model calculated over an author text, could be a relevant feature in plagiarism detection.
URL: https://www.semanticscholar.org/paper/fba934b8955022742636b211bf07fc1aada74ed4

[3] A new study has found that the use of artificial intelligence (AI) in data-driven artificial intelligence systems has led to bias in data-driven AI systems. The study found that the use of AI in data-driven artificial intelligence systems has led to bias in data-driven AI systems. It also found that the use of AI in data-driven artificial intelligence systems has led to bias in data-driven AI systems.
URL: https://www.semanticscholar.org/paper/0c97903e7d85c05bcd2a7e26fc2a4a47998f5dde

[4] A hybrid approach to academic plagiarism detection was presented at the 17th acm / ieee joint conference on digi- tal libraries ( jcdl ). In this paper, a hybrid approach to academic plagiarism detection was presented. The hybrid approach uses semantic concept patterns to detect academic plagiarism. It is based on the concept of semantic similarity detection based graph approach.
URL: https://www.semanticscholar.org/paper/64e994b6bd61ab6be34543f42bd043b8ebdde22f

[5] A cross-language plagiarism detection system based on a knowledge graph and based on explicit semantic analysis has been developed. The system can detect cross-language plagiarism over continuous space and knowledge graph based representations of language. It can also detect cross-language plagiarism over continuous space and knowledge graph based representations of language.
URL: https://www.semanticscholar.org/paper/64e994b6bd61ab6be34543f42bd043b8ebdde22f

[6] Researchers have developed a machine-to-machine (M2M) plagiarism detection system that can distinguish between original and machine-to-machine (M2M) paraphrased text. The system is based on a word2vec embedding model and anm classier. Researchers tested the system on ten machine-to-machine (M2M) paraphrased articles selected at random from a document test set. turnitin found the correct source in all cases.
URL: https://www.semanticscholar.org/paper/64e994b6bd61ab6be34543f42bd043b8ebdde22f

[7] A new algorithm for automatic plagiarism detection has been developed. The algorithm is based on a state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-art state-of-the-
URL: https://www.semanticscholar.org/paper/64e994b6bd61ab6be34543f42bd043b8ebdde22f

[8] Researchers at the University of Wuppertal and the University of Konstanz have developed a machine-to-machine (M2M) plagiarism detection system. The system uses word embedding models to detect machine-to-machine (M2M) plagiarism. The system is capable of detecting machine-to-machine (M2M) plagiarism tom a s folt ynek1 and tom a s folt ynek3 and tom a s folt ynek4 and tom a s folt ynek5 and tom a s folt ynek6.
URL: https://www.semanticscholar.org/paper/64e994b6bd61ab6be34543f42bd043b8ebdde22f

[9] Free online paraphrasing tools rely principally on synonym substitu - tion without the overall syntax of the sentence, resulting in language which is unidiomatic at best, incomprehensible at worst. When a sentence was taken from the assessment scenario, one day, while doug was out walking, he felt lightheaded and then lost consciousness and fell to the ground.
URL: https://www.semanticscholar.org/paper/930f526de3b0fc3b7d9fab1bd56f36272ca86b17

[10] This paper proposes to use distributed representation of words (word embeddings) in cross-language textual similarity detection. The main contributions of this paper are the following: (a) we introduce new cross-language similarity detection methods based on distributed representation of words; (b) we combine the different methods proposed to verify their complementarity and finally obtain an overall F1 score of 89.15% for English-French similarity detection at chunk level (88.5% at sentence level).
URL: https://www.semanticscholar.org/paper/1936f64ef3d2a4e2e1dc89b40f3f40aecdeaf69b 

Instructions: Using the provided web search results, write a comprehensive reply to the given query. If you find a result relevant definitely make sure to cite the result using [[number](URL)] notation after the reference. End your answer with a summary. A
Query: do language models plagiarize?
Answer:

In [7]:
# Pass the assembled prompt to generate_answer (imported from utils) and render
# whatever it returns as Markdown.
response = generate_answer(prompt)
printmd(response)

Language models, especially statistical ones, may be applied to detect plagiarized segments in a text by employing perplexity, as demonstrated in a study that used two specialized and literary corpora [[1](https://www.semanticscholar.org/paper/79bf58971512598569f3a0113b520a33e5176696)][[2](https://www.semanticscholar.org/paper/fba934b8955022742636b211bf07fc1aada74ed4)]. Another paper describes a hybrid approach to academic plagiarism detection using semantic concept patterns and detecting plagiarism based on semantic similarity detection through graph approach [[4](https://www.semanticscholar.org/paper/64e994b6bd61ab6be34543f42bd043b8ebdde22f)]. Furthermore, researchers have developed a machine-to-machine (M2M) plagiarism detection system that applies a word2vec embedding model for distinguishing between original and machine-to-machine (M2M) paraphrased text [[6](https://www.semanticscholar.org/paper/64e994b6bd61ab6be34543f42bd043b8ebdde22f)]. On the other hand, free online paraphrasing tools often rely on synonym substitution without considering sentence syntax and may result in unidiomatic or incomprehensible language at times [[9](https://www.semanticscholar.org/paper/930f526de3b0fc3b7d9fab1bd56f36272ca86b17)].

In summary, while language models and related techniques may be utilized for plagiarism detection, there are instances where they might be less effective, such as with paraphrasing tools.