## Loading the Data

In [None]:
import pandas as pd

# Read the two source corpora (human first, then AI) from the working directory.
df_human, df_ai = (pd.read_csv(path) for path in ("df_human.csv", "df_ai.csv"))

In [None]:
# Row/column count of the human frame as loaded.
df_human.shape

(205760, 3)

In [None]:
# Row/column count of the AI frame as loaded.
df_ai.shape

(205758, 3)

##  Data Pre-Processing

In [None]:
import re
import spacy
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tqdm import tqdm
import pandas as pd

# Register `progress_apply` on pandas objects so long `.apply` calls show a bar.
tqdm.pandas()

# NLTK's English stop-word list, held as a set for membership tests.
stop_words = set(stopwords.words("english"))

# Small English spaCy pipeline with the parser and NER components switched off.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess_text(text):
    """Normalize one raw document into lowercase, letters-only text.

    Steps, in order: strip HTML markup, lowercase, delete URLs and e-mail
    addresses, delete apostrophes, replace every remaining non-letter with a
    space, and collapse whitespace runs to single spaces.

    Args:
        text: The raw document. Anything that is not a non-blank ``str``
            (e.g. ``None`` or ``NaN``) is treated as empty.

    Returns:
        The cleaned string, or ``""`` for non-string / blank input.
    """
    if not isinstance(text, str) or text.strip() == "":
        return ""

    # Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # Lowercase
    text = text.lower()

    # Remove URLs and emails. `http\S+` already covers "https...", so the
    # separate `https\S+` alternative was unreachable and has been dropped.
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # Delete apostrophes outright so contractions stay one token
    # ("don't" -> "dont"), exactly as before.
    text = re.sub(r"['’]", '', text)

    # Replace every other special character / digit with a space. Deleting them
    # fused neighbouring words whenever punctuation was not followed by a space
    # ("rock.their" -> "rocktheir").
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Collapse the whitespace runs left by the removals (and any newlines in the
    # source) so they do not reach the tokenizer as standalone tokens.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# First cleaning pass over the raw human text (HTML, URLs, e-mails, non-letters).
# progress_apply is the tqdm-enabled variant of Series.apply.
df_human = df_human.assign(preprocessed=df_human['text'].progress_apply(preprocess_text))

100%|████████████████████████████████████████████████████████████████████████| 205760/205760 [01:50<00:00, 1860.24it/s]


In [None]:
# Same cleaning pass for the AI frame, whose raw text lives in the 'generated' column.
df_ai = df_ai.assign(preprocessed=df_ai['generated'].progress_apply(preprocess_text))

100%|████████████████████████████████████████████████████████████████████████| 205758/205758 [02:15<00:00, 1517.21it/s]


In [None]:

def lemmatize_in_chunks(texts, batch_size=500, chunk_size=10000):
    """Lemmatize `texts` slice by slice with the module-level spaCy pipeline.

    The input is handled `chunk_size` items at a time (the original author's
    stated goal is to avoid a MemoryError); each slice is streamed through
    `nlp.pipe` with its own tqdm progress bar.

    Args:
        texts: Sliceable sequence of strings.
        batch_size: Batch size forwarded to `nlp.pipe`.
        chunk_size: Number of texts per slice / progress bar.

    Returns:
        A list with one string per input text: the lemmas of all tokens whose
        surface form is not in `stop_words`, joined by single spaces.
    """
    cleaned = []
    for offset in range(0, len(texts), chunk_size):
        piece = texts[offset:offset + chunk_size]
        docs = nlp.pipe(piece, batch_size=batch_size, disable=["parser", "ner"])
        label = f"Processing rows {offset}-{offset+len(piece)}"
        cleaned.extend(
            " ".join(tok.lemma_ for tok in doc if tok.text not in stop_words)
            for doc in tqdm(docs, total=len(piece), desc=label)
        )
    return cleaned

In [None]:
# Lemmatize both corpora with the helper defined above. The previous call
# target, `lemmatize_texts`, is not defined anywhere in this notebook and
# raised NameError on a fresh kernel.
df_human['clean_text'] = lemmatize_in_chunks(df_human['preprocessed'].tolist())
df_ai['clean_text'] = lemmatize_in_chunks(df_ai['preprocessed'].tolist())

# The 'preprocessed' column is deliberately NOT dropped here: later cells still
# read df_ai['preprocessed'] and drop 'preprocessed' from both frames themselves,
# so removing it at this point made those cells fail with KeyError.

100%|████████████████████████████████████████████████████████████████████████| 205760/205760 [3:11:44<00:00, 17.89it/s]
 18%|█████████████▎                                                           | 37498/205758 [42:41<1:13:40, 38.06it/s]

In [None]:
# Persist both processed frames to CSV without the index column.
# NOTE(review): "df_ai.csv" is the same path the raw AI data is read from at the
# top of the notebook, so this overwrites the input; "df_Human.csv" differs from
# the "df_human.csv" input only by letter case — confirm both are intended.
df_human.to_csv("df_Human.csv", index=False)
df_ai.to_csv("df_ai.csv", index=False)

In [None]:
# Write the human frame to "df_Human.csv" again (same call as in the previous cell).
df_human.to_csv("df_Human.csv", index=False)

In [None]:
# Lemmatize the AI corpus chunk by chunk. Skipped when 'clean_text' already
# exists, so re-running the notebook neither repeats this long step (the recorded
# run took roughly 10+ minutes per 10,000 rows) nor fails on a 'preprocessed'
# column that an earlier cell may already have removed.
if 'clean_text' not in df_ai.columns:
    df_ai['clean_text'] = lemmatize_in_chunks(df_ai['preprocessed'].tolist())

Processing rows 0-10000: 100%|███████████████████████████████████████████████████| 10000/10000 [09:44<00:00, 17.12it/s]
Processing rows 10000-20000: 100%|███████████████████████████████████████████████| 10000/10000 [12:27<00:00, 13.38it/s]
Processing rows 20000-30000: 100%|███████████████████████████████████████████████| 10000/10000 [12:12<00:00, 13.65it/s]
Processing rows 30000-40000: 100%|███████████████████████████████████████████████| 10000/10000 [13:24<00:00, 12.42it/s]
Processing rows 40000-50000: 100%|███████████████████████████████████████████████| 10000/10000 [11:33<00:00, 14.42it/s]
Processing rows 50000-60000: 100%|███████████████████████████████████████████████| 10000/10000 [10:19<00:00, 16.14it/s]
Processing rows 60000-70000: 100%|███████████████████████████████████████████████| 10000/10000 [10:17<00:00, 16.19it/s]
Processing rows 70000-80000: 100%|███████████████████████████████████████████████| 10000/10000 [10:05<00:00, 16.52it/s]
Processing rows 80000-90000: 100%|██████

In [None]:
# Save the lemmatized AI frame without the index column.
# NOTE(review): this path is also the raw input read at the top of the notebook,
# so the source file is overwritten — confirm that is intended.
df_ai.to_csv("df_ai.csv", index=False)

In [None]:
# Down-sample each corpus to 100,000 rows; random_state is fixed for reproducibility.
df_human = df_human.sample(n=100000, random_state=42).reset_index(drop=True)

# Reset the index of the AI sample itself. This line previously assigned
# df_human.reset_index(...) to df_ai, which silently replaced the AI sample with
# a copy of the human rows (so every row later labelled 1 was human text).
df_ai = df_ai.sample(n=100000, random_state=42).reset_index(drop=True)

# The AI frame keeps its raw text under 'generated' (see the preprocessing cell).
# Give it the same 'text' column name as df_human so the later cell that drops
# 'text' from both frames keeps working now that df_ai really holds AI data.
if "text" not in df_ai.columns:
    df_ai = df_ai.rename(columns={"generated": "text"})

In [None]:
# Dimensions of the human frame after sampling.
df_human.shape

(100000, 3)

In [None]:
# Dimensions of the AI frame after sampling.
df_ai.shape

(100000, 3)

In [None]:
# Class labels: 0 marks rows from df_human, 1 marks rows from df_ai.
df_human = df_human.assign(label=0)
df_ai = df_ai.assign(label=1)

In [None]:
# Keep only the columns used downstream. Selecting them (instead of dropping
# 'text' and 'preprocessed' by name) gives the same two-column result and also
# works for the AI frame, whose raw text column is 'generated' rather than
# 'text', so a hard-coded drop of 'text' raises KeyError on genuine AI data.
df_human = df_human[['clean_text', 'label']].copy()
df_ai = df_ai[['clean_text', 'label']].copy()

In [None]:
# Columns remaining in the human frame.
df_human.columns

Index(['clean_text', 'label'], dtype='object')

In [None]:
# Columns remaining in the AI frame.
df_ai.columns

Index(['clean_text', 'label'], dtype='object')

In [None]:
# Expose the lemmatized text under a common 'text' column in both frames.
rename_map = {"clean_text": "text"}
df_human, df_ai = (frame.rename(columns=rename_map) for frame in (df_human, df_ai))

# Stack the two frames, shuffle every row with a fixed seed, and renumber the index.
df = (
    pd.concat([df_human, df_ai], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df.shape

(200000, 2)

In [None]:
# Preview the combined, shuffled frame.
df

Unnamed: 0,text,label
0,really rocktheir gluten free classic combogene...,1
1,car use main mean transportation pong time cou...,0
2,dear mrs \n\n\n\n think go first policy first ...,1
3,ever force something not want well principal o...,0
4,state senator electoral college complex proces...,0
...,...,...
199995,light car usage cause drop amount pollution ...,1
199996,wolrd war ii cheapen know tom brokaw secondrat...,1
199997,one good restaurant every thing use recipe sup...,1
199998,great earring price can not beat earring price...,1


In [None]:

# Store the text column with pandas' dedicated string dtype, then list the dtypes.
df = df.astype({"text": "string"})
print(df.dtypes)


text     string
label     int64
dtype: object


Down-sample the data to 50,000 records for our analysis.

In [None]:
# Keep a reproducible 50,000-row random subset; the sampled rows keep their
# existing index labels (no reset_index here).
df = df.sample(random_state=42, n=50000)

In [None]:
# Number of rows per label in the sampled frame.
df["label"].value_counts()

0    25184
1    24816
Name: label, dtype: int64

In [None]:
# Save the sampled, labelled dataset without the index column.
df.to_csv("df.csv", index=False)

In [None]:
# Confirm the dimensions of the saved frame.
df.shape

(50000, 2)

In [None]:
# Draw a reproducible 500-row subset of df and write it to its own CSV.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is
# not passed here, so the row index is written as an extra column — confirm intended.
df_sample = df.sample(n=500,random_state=42)
df_sample.to_csv("sample_data_for_analysis.csv")