In [27]:
import pandas as pd
import numpy as np

import re
import PyPDF2
import json
import openai
import nltk
import random

## Data Generation

### Non-AI

In [40]:
data = pd.DataFrame(columns=["Source" ,"Text" ,"IsAI"])

#### Reading sentences through PDF file

In [78]:
np.random.seed(0)

nltk.download('punkt')

sentences_list = []

def random_sentences_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Getting the number of pages in the PDF.
        num_pages = len(pdf_reader.pages)

        # Iterating thorugh every page except a few at the begginning an the last one.
        for page_number in range(5, num_pages - 1):
            # Reading the page.
            page = pdf_reader.pages[page_number]
            
            # Extracting text from the selected page.
            page_text = page.extract_text()
            
            # Splitting the text into sentences.
            sentences = nltk.sent_tokenize(page_text)
            
            # Selecting a starting point.
            index = 2
            # Ensuring the page is long enough.
            if index < len(sentences) - 4:
                random_start = random.randint(index, len(sentences) - 4)  # Ensuring at least 3 sentences left with this minus 4.
                
                # Extract the next three sentences
                # I want my texts to be at least 2 and maximum 3 sentences long.
                sentence_count = random.randint(2, 3)

                # Selecting the sentences as text.
                selected_sentences = sentences[random_start:random_start + sentence_count]

                # Storing the text.
                sentences_list.append(''.join(selected_sentences))
        
# Extracting texts from provided PDF.
pdf_path = "The Da Vinci Code.pdf"
random_sentences_from_pdf(pdf_path)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [75]:
# Our 2 to 3 sentence long texts are stored here.
sentences_list

['Langdon still felt fuzzy.A visitor?',
 '"Yes?"As expected, it was the concierge.',
 'Langdon felt a sudden surge of uneasiness.He and the revered curator Jacques \nSaunière had been slated to meet for drinks after Langdon\'s lecture tonight, but Saunière \nhad never shown up."Yes.',
 'His bedroom door was \nopen; locks were forbidden here.He entered, closing the door behind him.The room was spartan—hardwood floors, a pine dresser, a canvas mat in the corner \nthat served as his bed.',
 "All true followers of \nThe Way wore this device—a leather strap, studded with sharp metal barbs that cut into \nthe flesh as a perpetual reminder of Christ's suffering.The pain caused by the device also \nhelped counteract the desires of the flesh.Although Silas already had worn his cilice today longer than the requisite two hours, \nhe knew today was no ordinary day.",
 "His books on the secret codes hidden in the paintings of \nPoussin and Teniers were some of Langdon's favorite classroom texts.Ton

In [79]:
# Creating the data as a DataFrame.
new_data = pd.DataFrame({
    'Source': "PDF",
    'Text': sentences_list,
    'IsAI': 'no'
})

pdf = pd.concat([data, new_data], ignore_index=True)

In [80]:
pdf

Unnamed: 0,Source,Text,IsAI
0,PDF,"""Would you like to hear \nmore?""The crowd appl...",no
1,PDF,"""My capitaine is waiting, sir.""Langdon barely ...",no
2,PDF,You must retrieve the stone for me.Immediately...,no
3,PDF,"How will I enter?""With the confident tone of a...",no
4,PDF,Again the image of the curator's body flashed ...,no
...,...,...,...
1825,PDF,"She sighed in mock exasperation.""Why is it tha...",no
1826,PDF,"Everything speaks of Rosslyn.""""Very well, let ...",no
1827,PDF,"""When can I see you again?""Langdon reeled mome...",no
1828,PDF,"Moving onto it, Langdon scanned \nthe surface ...",no


#### Reading data through Youtube comments

In [None]:
from googleapiclient.discovery import build # pip install google-api-python-client
from time import sleep

# Replace 'YOUR_API_KEY' with your actual API key
api_key = 'YOUR_API_KEY'

# Replace 'VIDEO_ID' with the ID of the video you want to retrieve comments from

youtube = build('youtube', 'v3', developerKey=api_key)

videos = [
    'SVgVzEVeP4Q', 'JvEas_zZ4fM', '_FGUkxn5kZQ', '9Y-YJEtxHeo', '-8xTVMtkqv4',
    'SLD9xzJ4oeU', 'cfVY9wLKltA', '6ZfuNTqbHE8', 'kBdfcR-8hEY', 'MBRqu0YOH14',
    'fLJBzhcSWTk', 'qjfaoe847qQ', 'fSQgCy_iIcc', 'R9OCA6UFE-0', 'YaDvRdLMkHs',
    'Yocja_N5s1I', 'B3u4EFTwprM', 'ylWORyToTo4', '7eh4d6sabA0', 'Uq1ANdWRIh0',
    'RCXGpEmFbOw', 'tMWJGs3CQ_Q', 'vrl5PFB35Ec', 'c-JkrlVhs_0', 'LhQOn9IOmeE',
    'WrzFMlX1-dU', 'QqLgmequ7Bk', '88aDJFdUjH4', '4NnJ-7Y6qwo', 'p6ff-ShY5Bw',
    '9biIOtEYeHc', 'GhUfIXRkUF8', 'Piw67Dl_VaM', 'O88Mm06G9ko', 'jNQXAC9IVRw',
    'lb13ynu3Iac', 'mwKJfNYwvm8', 'QjvpjXdgugA', 'WdCRrcfan44', 'RYfmRTyl56w',
    'Gf7s9pbOEpk', 'd_9ZsZ3foho', 'PHgc8Q6qTjc', 'By_Cn5ixYLg', 'nfWlot6h_JM',
    'e-ORhEE9VVg', 'QcIy9NiNbmo', 'CAs_aX95tVQ', '7SWvDHvWXok', 'Uj3_KqkI9Zo'
]

def get_comments(video_id, min_length=10, max_results=50):
    comments = []
    next_page_token = None

    while len(comments) < max_results:
        # Retrieve the comment threads
        results = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            textFormat="plainText",
            maxResults=100,  # Fetch more and filter
            pageToken=next_page_token
        ).execute()

        for item in results['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            if len(comment.split()) >= min_length:
                comments.append(comment)
            if len(comments) >= max_results:
                break

        next_page_token = results.get('nextPageToken')
        if not next_page_token:
            break

    return comments[:max_results]

# Initialize an empty DataFrame
youtube_df = pd.DataFrame(columns=['Source', 'Text', 'IsAI'])

for video_id in videos:
    sleep(0.5) # To not exceed the quota
    video_comments = get_comments(video_id)
    temp_df = pd.DataFrame({'Source': 'YouTube - '+video_id, 'Text': video_comments, 'IsAI': 'no'})
    youtube_df = pd.concat([youtube_df, temp_df], ignore_index=True)

#### Concatenating the data

In [None]:
# Finalizing the human written data.
human = pd.concat([pdf, youtube_df], ignore_index=True)

In [88]:
# Here I am deleting emjoies since youtube comments has a lot of them.
def change_emogy(text):
    emojis = re.compile("["
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U0001F300-\U0001F5FF"  # Symbols
        u"\U0001F680-\U0001F6FF"  # Maps
        u"\U0001F1E0-\U0001F1FF"  # Flags
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)
    
    clean_text = emojis.sub(r'', text)
    """if len(clean_text) <= 50:
        return np.nan
    else:"""
    return clean_text 

human["Text"] = human["Text"].apply(change_emogy)

In [6]:
human = human.dropna()

In [None]:
human

### AI Generated

#### Reading sentences through PDF file

In [42]:
np.random.seed(1773)

def extract_sentences(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        all_sentences = []

        # Iterating through each page.
        for page_num in range(len(reader.pages)):
            # Selecting page.
            page = reader.pages[page_num]
            # Obtaining the text inside it.
            text = page.extract_text()

            # Using regular expression to split the text into sentences.
            sentences = re.split(r'(?<=[.!?]) +', text)

            sentence_count = 3
            # Grouping every three sentences.
            for i in range(0, len(sentences), sentence_count):
                group = ' '.join(sentences[i:i+sentence_count])
                if len(group) != 0:
                    all_sentences.append(group)

    return all_sentences

In [43]:
# Here, we are extracting 3 sentence long texts from our provided PDF file and we will use them as prompt to GPT 3.5.
pdf_path = './The Diary of a Young Girl.pdf'
texts = extract_sentences(pdf_path)
print(len(texts))

2472


In [46]:
for i,text in enumerate(texts):
    if len(text) <= 50:
        del texts[i]

#### Regenerating sentences with gpt 3.5 turbo.

In [55]:
generate_texts = []

# API KEY.
openai.api_key = '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'

# Generating the texts again from chat gpt.
def generate_text(messages):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # The chat model.
        messages=messages,
        max_tokens=150,
        temperature=0.7
    )
    return response['choices'][0]['message']['content'].strip()

In [None]:
# Rephrasing every text from Chat GPT.
for i in range(0, len(texts)):
    conversation = [ {"role": "user", "content": "{} \n Please rephrase this text.".format(texts[i])} ]
    generated_text = generate_text(conversation)

    generate_texts.append(generated_text)

In [104]:
ai = pd.DataFrame(columns=["Source" ,"Text" ,"IsAI"])

In [66]:
# Storing the data as DataFrame.
new_data = pd.DataFrame({
    'Source': "gpt-3.5-turbo",
    'Text': generate_texts,
    'IsAI': 'yes'
})

In [67]:
ai = pd.concat([ai, new_data], ignore_index=True)

In [68]:
ai

Unnamed: 0,Source,Text,IsAI
0,gpt-3.5-turbo,He writes that investigators are currently hin...,yes
1,gpt-3.5-turbo,The District Court determined that the petitio...,yes
2,gpt-3.5-turbo,The second aim of this legislation is to safeg...,yes
3,gpt-3.5-turbo,"In a situation like this, it is futile for a c...",yes
4,gpt-3.5-turbo,The opinion of the court was delivered by Mr. ...,yes
...,...,...,...
4173,gpt-3.5-turbo,Are all non-military German women being evacua...,yes
4174,gpt-3.5-turbo,He had the opportunity to do that in Russia mu...,yes
4175,gpt-3.5-turbo,"Dear Kitty, \n\nI am curious to know how much ...",yes
4176,gpt-3.5-turbo,The weather has been consistently bad from the...,yes


### Available dataset

This data was available publicly within a paper that focused on the same topic with us.

Here is the link: https://github.com/dukeraphaelng/synth_detectives

In [80]:
ready = pd.read_csv("./pre-made.csv")

In [81]:
# We did some pre-cleaning and adjusted the columns like we wanted.
ready

Unnamed: 0,Source,Text,IsAI
0,LLM,Have you ever heard of the Crusades? A time in...,yes
1,LLM,"The professors, who likely have nearly a decad...",yes
2,LLM,Kemba Walker does a good job of defending Foye...,yes
3,LLM,"Ganias' lawyer, Stanley Twardy, urged the gove...",yes
4,PDF,The Circuit Court of Appeals of New Jersey had...,no
...,...,...,...
17995,LLM,Toner cited a $38 billion defense assistance a...,yes
17996,PDF,In determining whether a government regulation...,no
17997,PDF,Mr. A.B. Burdett for appellee. Mr. Henry S. Bu...,no
17998,LLM,"After this flag was called on Jo-Lonn Dunbar, ...",yes


### Final Data

In [12]:
# Joining all the data we have generated and adjusted so far.
final_data = pd.concat([ready, human, ai], ignore_index=True)