# Data collection from public sources:

The Yelp data is retrieved from: https://business.yelp.com/data/resources/open-dataset/

In [None]:
import tarfile

tar_path = "yelp_dataset.tar"

# Collect the member paths while the archive is open ("r:*" lets tarfile
# pick the compression scheme on its own) ...
with tarfile.open(tar_path, "r:*") as archive:
    members = archive.getnames()

# ... then show at most the first 50 of them, one per line.
for name in members[:50]:
    print(name)


Dataset_User_Agreement.pdf
yelp_academic_dataset_business.json
yelp_academic_dataset_checkin.json
yelp_academic_dataset_review.json
yelp_academic_dataset_tip.json
yelp_academic_dataset_user.json


In [None]:
import pandas as pd
import tarfile
import json
from itertools import islice

tar_path = "yelp_dataset.tar"
review_member = "yelp_academic_dataset_review.json"
n_reviews = 100_000  # how many reviews to take from the start of the file

# The review file is JSON Lines (one JSON object per line), so it can be
# streamed straight out of the archive without unpacking it to disk.
with tarfile.open(tar_path, "r:*") as tar:
    f = tar.extractfile(review_member)
    if f is None:
        # extractfile() returns None when the member is not a regular file or link.
        raise ValueError(f"{review_member!r} in {tar_path!r} is not a regular file")
    with f:
        # islice stops after exactly n_reviews lines (or earlier at end of file).
        rows = [json.loads(line) for line in islice(f, n_reviews)]

df = pd.DataFrame(rows)
print(df.head())


                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   

   stars  useful  funny  cool  \
0    3.0       0      0     0   
1    5.0       1      0     1   
2    3.0       0      0     0   
3    5.0       1      0     1   
4    4.0       1      0     1   

                                                text                 date  
0  If you decide to eat here, just be aware it is...  2018-07-07 22:09:11  
1  I've taken a lot of spin classes over the year...  2012-01-03 15:28:18  
2  Family diner. Had the buffet. Eclectic assortm...  2014-02-05 20:30:30  
3  Wow!  Yummy, different,  delici

In [None]:
 # Count missing values per column of the Yelp review sample.
 df.isna().sum()

review_id      0
user_id        0
business_id    0
stars          0
useful         0
funny          0
cool           0
text           0
date           0
dtype: int64

HC3 Dataset – Human ChatGPT Comparison Corpus, which pairs AI-generated responses with human-written ones.
Link: https://huggingface.co/datasets/Hello-SimpleAI/HC3


In [None]:
# Load the local CSV copy of the HC3 corpus and display it as read.
df_hc = pd.read_csv("HC3.csv")
df_hc

Unnamed: 0,train
0,"{'id': '0', 'question': 'Why is every book I h..."
1,"{'id': '1', 'question': ""If salt is so bad for..."
2,"{'id': '2', 'question': ""Why do we still have ..."
3,"{'id': '3', 'question': ""Why has nobody assass..."
4,"{'id': '4', 'question': ""How was airplane tech..."
...,...
24317,"{'id': '24317', 'question': 'Is rise in pressu..."
24318,"{'id': '24318', 'question': 'What could cause ..."
24319,"{'id': '24319', 'question': 'Can Acutret be gi..."
24320,"{'id': '24320', 'question': 'Are BP of 119/65 ..."


In [None]:
import ast

# Each cell of the raw "train" column is the string repr of a Python dict.
# Parse and flatten only while that raw column is still present: after the
# first run df_hc holds the flattened frame, so re-running the cell is a
# no-op instead of failing with KeyError: 'train'.
if "train" in df_hc.columns:
    # literal_eval only evaluates Python literals, so no code in the file is executed.
    hc_records = df_hc["train"].map(ast.literal_eval).tolist()

    # Expand the dict keys into separate columns.
    df_hc = pd.json_normalize(hc_records)

print(df_hc.head())

  id                                           question  \
0  0  Why is every book I hear about a " NY Times # ...   
1  1  If salt is so bad for cars , why do we use it ...   
2  2  Why do we still have SD TV channels when HD lo...   
3  3  Why has nobody assassinated Kim Jong - un He i...   
4  4  How was airplane technology able to advance so...   

                                       human_answers  \
0  [Basically there are many categories of " Best...   
1  [salt is good for not dying in car crashes and...   
2  [The way it works is that old TV stations got ...   
3  [You ca n't just go around assassinating the l...   
4  [Wanting to kill the shit out of Germans drive...   

                                     chatgpt_answers       source  
0  [There are many different best seller lists th...  reddit_eli5  
1  [Salt is used on roads to help melt ice and sn...  reddit_eli5  
2  [There are a few reasons why we still have SD ...  reddit_eli5  
3  [It is generally not acceptable o

In [None]:
# Count missing values per column of the flattened HC3 frame.
df_hc.isna().sum()

id                 0
question           0
human_answers      0
chatgpt_answers    0
source             0
dtype: int64

In [None]:
# Load AI_Human.csv and show the rows whose `generated` flag equals 1.
# NOTE(review): presumably 1 marks AI-generated text and 0 human-written text — confirm against the dataset description.
df_ai = pd.read_csv("AI_Human.csv")
df_ai[df_ai["generated"]==1]

Unnamed: 0,text,generated
704,"This essay will analyze, discuss and prove one...",1.0
740,I strongly believe that the Electoral College ...,1.0
1262,"Limiting car use causes pollution, increases c...",1.0
1378,Car-free cities have become a subject of incre...,1.0
1379,"Car Free Cities Car-free cities, a concept ga...",1.0
...,...,...
487208,I agree that online or video Conferencing dis...,1.0
487211,The Benefits of Limiting Car Usage \n\nMany ci...,1.0
487217,"As an eighth grade student, I believe that att...",1.0
487222,It is a common belief that having a broad know...,1.0


In [None]:
# Count missing values per column of the AI_Human frame.
df_ai.isna().sum()

text         0
generated    0
dtype: int64

In [None]:
# (rows, columns) of the subset whose `generated` flag equals 1.
df_ai[df_ai["generated"]==1].shape

(181438, 2)

In [None]:
import bz2
from itertools import islice
import pandas as pd

# Number of reviews to load from the start of the archive. The name
# `chunk_size` is kept for compatibility with any cell that still reads it.
chunk_size = 100000
path = "train.ft.txt.bz2"

rows = []
with bz2.open(path, "rt", encoding="utf-8", errors="ignore") as f:
    # islice reads exactly `chunk_size` lines (fewer if the file is shorter),
    # so no extra lines are decompressed and thrown away.
    for line in islice(f, chunk_size):
        # Each line looks like "__label__<n> <review text>". partition() keeps
        # working on a line without a space (text becomes "") where indexing
        # the result of split() would raise IndexError.
        label, _, text = line.strip().partition(" ")
        rows.append((label.replace("__label__", ""), text))

# Build the frame unconditionally so df_amz exists even for a short file.
df_amz = pd.DataFrame(rows, columns=["label", "text"])
print(f"Loaded {len(df_amz)} rows")


Loaded 100000 rows


Amazon Reviews (Kaggle) – includes detailed product reviews across categories.
Link: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews


In [None]:
# Display the loaded Amazon review frame (columns: label, text).
df_amz

Unnamed: 0,label,text
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
99995,2,Good: very good excelent fantastic wonderful m...
99996,1,THE REAL SUPERMAN: THESE ARE THE REAL ADVENTUR...
99997,1,It gets one star because zero was not an optio...
99998,1,Don't Spend Your Money: I think I now know mor...


In [None]:
# Count missing values per column of the Amazon review frame.
df_amz.isna().sum()

label    0
text     0
dtype: int64

Segregated human and AI data

In [None]:
## Human Dataset
# NOTE: a bare expression that is not the last line of a cell is evaluated but
# not rendered, so the first three frames below produce no output.
df
df_amz

## AI Dataset
df_ai
df_hc  # last expression in the cell: the only frame Jupyter displays

In [None]:
import json

# Read the whole JSON document into memory and decode it in one go.
with open("Final_reviews.json", "r", encoding="utf-8") as fh:
    results = json.loads(fh.read())

# Echo the decoded object as the cell output.
results

['The Palace Hotel Dubai Downtown is a beautiful property in every way, but what truly made our stay unforgettable was the pool area. The pool itself is immaculate, always clean and well-kept, but the real magic comes from the people who look after it.||From the moment we arrived, the team around the pool went above and beyond. Bhuwan, Susil and Furba are some of the friendliest and kindest people you could ever meet. They couldn’t do enough for us, always attentive, always with a smile, and they made me and my entire family feel completely relaxed.||Thanks to them, this felt like the best holiday we’ve ever had. Their service, warmth, and genuine care turned a great hotel stay into something truly special. Palace Downtown is stunning, but for us, it was the pool team who made the experience unforgettable',
 "The Hotel have the best place to stay Downtown. We get a very nice room and Service big thanks to Lisha and Waseem from the Reception. The Service everyone is perfect. Nadab makes

In [None]:
# Check which container type json.load produced for Final_reviews.json.
type(results)

list

In [None]:
# Wrap the list of review strings in a one-column frame named "text",
# the column name shared by the other text sources.
df_ext = pd.DataFrame({"text": results}, columns=["text"])
df_ext

Unnamed: 0,text
0,The Palace Hotel Dubai Downtown is a beautiful...
1,The Hotel have the best place to stay Downtown...
2,I had an exceptional experience at The Palace ...
3,A luxurious entrance to this beautiful hotel w...
4,This charming establishment positively oozes c...
...,...
855,My mother recently traveled to Dubai for the v...
856,"Our time at JA Beach Hotel was wonderful, and ..."
857,Our stay at JA Beach Hotel was a truly relaxin...
858,We had a fantastic 2 week stay at the JA Beach...


In [None]:
# Print the column index of every source frame, tagged with a running number:
#   1, 2 -> human datasets (Yelp, Amazon)
#   3, 4 -> AI datasets (AI_Human, HC3)
labelled_frames = (
    ("1", df),
    ("2", df_amz),
    ("3", df_ai),
    ("4", df_hc),
)
for tag, frame in labelled_frames:
    print(tag, frame.columns)

1 Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')
2 Index(['label', 'text'], dtype='object')
3 Index(['text', 'generated'], dtype='object')
4 Index(['id', 'question', 'human_answers', 'chatgpt_answers', 'source'], dtype='object')


In [None]:
# Boolean mask of the rows whose `generated` flag is 0, then the matching subset.
human_mask = df_ai["generated"] == 0
df_ai_h = df_ai[human_mask]
df_ai_h.columns

Index(['text', 'generated'], dtype='object')

Only human data is extracted.

In [None]:
# Stack the "text" column of every human-written source, in this order:
# Yelp, Amazon, Final_reviews.json, and the generated == 0 rows of AI_Human.
human_sources = [df, df_amz, df_ext, df_ai_h]
df_human = pd.concat(
    [source[["text"]] for source in human_sources],
    ignore_index=True,
)

df_human.head()


Unnamed: 0,text
0,"If you decide to eat here, just be aware it is..."
1,I've taken a lot of spin classes over the year...
2,Family diner. Had the buffet. Eclectic assortm...
3,"Wow! Yummy, different, delicious. Our favo..."
4,Cute interior and owner (?) gave us tour of up...


In [None]:
# (rows, columns) of the combined human-text frame.
print(df_human.shape)

(506657, 1)


In [None]:
# Draw a fixed-size random subset (seeded for reproducibility) and renumber
# the rows from 0.
# NOTE(review): 205760 equals the row count printed for the AI frame further
# down — presumably chosen to balance the two classes; confirm.
df_human = (
    df_human
    .sample(n=205760, random_state=42)
    .reset_index(drop=True)
)


In [None]:
# (rows, columns) of the human-text frame after subsampling.
df_human.shape

(205760, 1)

In [None]:
# Count missing values in the sampled human-text frame.
df_human.isna().sum()

text    0
dtype: int64

Only AI-written data is extracted here.

In [None]:
# Keep only the rows of AI_Human.csv whose `generated` flag is 1.
df_ai_ai = df_ai[df_ai["generated"] == 1]

# In HC3 each `chatgpt_answers` value is a list of answer strings. Join every
# list into one plain string so this column has the same type as the essay
# text above. Without this the CSV export would write the list repr, brackets
# and quotes included, which is a formatting artefact only these rows carry.
# The column is also renamed to "text" so both sources line up in the concat.
# An empty list becomes an empty string, so the row count is unchanged.
df_hc_renamed = (
    df_hc["chatgpt_answers"]
    .map(
        lambda answers: " ".join(map(str, answers))
        if isinstance(answers, (list, tuple))
        else answers
    )
    .to_frame(name="text")
)

# Stack both AI sources into a single one-column frame.
df_aii = pd.concat(
    [df_ai_ai[["text"]], df_hc_renamed],
    ignore_index=True
)

df_aii.shape


(205760, 1)

In [None]:
# Count missing values in the combined AI-text frame.
df_aii.isna().sum()

text    0
dtype: int64

In [None]:
# Rename the single column from 'text' to 'generated', so the exported AI CSV
# has a 'generated' header.
# NOTE(review): in df_ai, 'generated' is the 0/1 flag column, while here the
# same name holds the text itself — confirm downstream code expects this.
df_aii = df_aii.rename(columns={'text': 'generated'})
df_aii

Unnamed: 0,generated
0,"This essay will analyze, discuss and prove one..."
1,I strongly believe that the Electoral College ...
2,"Limiting car use causes pollution, increases c..."
3,Car-free cities have become a subject of incre...
4,"Car Free Cities Car-free cities, a concept ga..."
...,...
205755,[It's not uncommon for blood pressure to fluct...
205756,[There are several possible causes of a painle...
205757,[It is not appropriate for me to recommend a s...
205758,[It is not uncommon for people with rheumatoid...


In [None]:
# (rows, columns) of the AI-text frame after the column rename.
df_aii.shape

(205760, 1)

Saving the human-written data and the AI-written data as separate CSV files.

In [None]:
# Write each frame to its own CSV file, without the index column.
exports = (
    (df_human, "df_Human.csv"),
    (df_aii, "df_ai.csv"),
)
for frame, csv_name in exports:
    frame.to_csv(csv_name, index=False)