# **Dataset Gathering**

In [None]:
import os
from google.colab import userdata

os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')


!kaggle datasets download -d shanegerami/ai-vs-human-text


Dataset URL: https://www.kaggle.com/datasets/shanegerami/ai-vs-human-text
License(s): other
Downloading ai-vs-human-text.zip to /content
100% 349M/350M [00:02<00:00, 137MB/s]
100% 350M/350M [00:02<00:00, 128MB/s]


In [None]:
# Unzipping the downloaded dataset
!unzip -q ai-vs-human-text.zip -d ./ai-vs-human-text

In [None]:
import pandas as pd

# Read the extracted CSV into a DataFrame and print its first five rows.
data = pd.read_csv(filepath_or_buffer='/content/ai-vs-human-text/AI_Human.csv')
print(data.head(n=5))

                                                text  generated
0  Cars. Cars have been around since they became ...        0.0
1  Transportation is a large necessity in most co...        0.0
2  "America's love affair with it's vehicles seem...        0.0
3  How often do you ride in a car? Do you drive a...        0.0
4  Cars are a wonderful thing. They are perhaps o...        0.0


In [None]:
# (number of rows, number of columns) of the full dataset.
data.shape

(487235, 2)

In [None]:
# Row count for each distinct value of the 'generated' column.
counts = data.groupby(by='generated').size()
print("Counts:", counts, sep="\n")

Counts:
generated
0.0    305797
1.0    181438
dtype: int64


In [None]:
# Number of missing values in each column.
data.isnull().sum()

text         0
generated    0
dtype: int64

In [None]:
# Copy of the full dataset with the target column renamed from 'generated' to 'label'.
# NOTE(review): the later cells visible in this notebook keep using `data` and the
# 'generated' column, so this frame appears to be unused — confirm before relying on it.
data_collected = data.rename(columns={'generated': 'label'})

# **Limiting dataset**

In [None]:
import pandas as pd

# Build a class-balanced subset of `data`: the same number of rows with
# generated == 0.0 and with generated == 1.0.
n_samples = 1000  # Number of samples per category

df_0 = data[data['generated'] == 0.0]
df_1 = data[data['generated'] == 1.0]

# Fail fast when either class cannot supply n_samples rows.
if len(df_0) < n_samples or len(df_1) < n_samples:
    raise ValueError("Not enough data to sample from one or both categories.")

# A fixed seed keeps the draw reproducible.
df_0_sampled = df_0.sample(n=n_samples, random_state=42)
df_1_sampled = df_1.sample(n=n_samples, random_state=42)

# Stack both samples, shuffle the rows, then renumber the index from 0.
balanced_ds = (
    pd.concat([df_0_sampled, df_1_sampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Quick look at the result and its class distribution.
print(balanced_ds.head())
print("Distribution of 'generated':", balanced_ds['generated'].value_counts())


                                                text  generated
0  Hey there! \n\nSo, you know how people say "ki...        1.0
1  As a child I can remember the many times my te...        0.0
2  While distance learning offers convenience and...        1.0
3  Dear Principal,\n\nI do not think that you sho...        0.0
4   I agree with the prompt that change is possib...        1.0
Distribution of 'generated': generated
1.0    1000
0.0    1000
Name: count, dtype: int64


# **Data Preprocessing**

### **Tokenization**

**NLTK tokenize**

In [None]:
!pip -q install nltk
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Unnamed: 0,text,generated,tokens
0,Is important for young people to attend.\n\nGr...,1.0,"[Is important for young people to attend., Gra..."
1,As the world beys moved into a technological e...,0.0,[As the world beys moved into a technological ...
2,"Dear, senator I hereby to inform you that I'm ...",0.0,"[Dear, senator I hereby to inform you that I'm..."
3,"As a student Myself, I think students would be...",0.0,"[As a student Myself, I think students would b..."
4,"Dear Principal,\n\nI understand your concern ...",1.0,"[ Dear Principal,\n\nI understand your concern..."


**Spacy tokenize**

In [None]:
import spacy

# spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

text = "Hello there! How are you doing today?"
doc = nlp(text)

# Collect the text of every sentence span found in the parsed document.
sentences = list(map(lambda span: span.text, doc.sents))
print("Sentence Tokens:", sentences)

Sentence Tokens: ['Hello there!', 'How are you doing today?']


In [None]:
from transformers import BertTokenizer

# Demo of subword tokenization: load the pre-trained tokenizer that belongs to
# the 'bert-base-uncased' checkpoint (files are downloaded on first use).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = "Hello there! How are you doing today?"
# Calling the tokenizer encodes the text; the result is indexed by 'input_ids' below.
encoded_input = tokenizer(text)

# Map the token IDs back to their token strings so the split is human-readable.
# The printed list starts with '[CLS]' and ends with '[SEP]'.
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])
print("Subword Tokens:", tokens)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Subword Tokens: ['[CLS]', 'hello', 'there', '!', 'how', 'are', 'you', 'doing', 'today', '?', '[SEP]']


### **Lowercasing**

In [None]:
# Lower-case every text, overwriting the 'text' column in place, then preview it.
balanced_ds['text'] = balanced_ds['text'].str.lower()
balanced_ds['text'].head()

0    is important for young people to attend.\n\ngr...
1    as the world beys moved into a technological e...
2    dear, senator i hereby to inform you that i'm ...
3    as a student myself, i think students would be...
4     dear principal,\n\ni understand your concern ...
Name: text, dtype: object

### **Removing Punctuation and Special Characters**

In [None]:
import re

def remove_punctuation(text):
    """Delete punctuation and special characters from ``text``.

    Letters, digits and whitespace are kept. The underscore is removed as
    well, even though the regex engine counts it as a word character.

    Args:
        text: Value to clean. Any ``str`` (including subclasses) is processed.

    Returns:
        The cleaned string, or ``text`` itself, unchanged, when it is not a
        string (for example a missing value in a DataFrame column).
    """
    if not isinstance(text, str):
        return text
    # [^\w\s] on its own would keep "_" because \w matches it, so name it explicitly.
    return re.sub(r'[^\w\s]|_', '', text)

# Run remove_punctuation on every row, overwriting the 'text' column in place.
balanced_ds['text'] = balanced_ds['text'].apply(remove_punctuation)

### **Removing Stop Words**

In [None]:
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus (a no-op when it is already present).
nltk.download('stopwords')

# Keep the English stop words in a set so that the membership test in
# remove_stopwords is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with English stop words dropped.

    The text is split on whitespace, every word whose lower-cased form is in
    the module-level ``stop_words`` set is discarded, and the remaining words
    are joined with single spaces. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return " ".join(token for token in text.split() if token.lower() not in stop_words)


# Run remove_stopwords on every row, overwriting the 'text' column in place.
balanced_ds['text'] = balanced_ds['text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Preview the first rows of the cleaned 'text' column.
print(balanced_ds['text'].head())

0    important young people attend graduating early...
1    world beys moved technological era multitude o...
2    dear senator hereby inform im favor changing e...
3    student think students would benefit attending...
4    dear principal understand concern new school p...
Name: text, dtype: object


In [None]:
from nltk.tokenize import sent_tokenize

# Split each text into sentences with NLTK and store the list in a new 'tokens' column.
# NOTE(review): the punctuation-removal cell earlier in this notebook has already
# stripped sentence-ending marks from 'text', so sent_tokenize has little left to
# split on — consider tokenizing before cleaning. Confirm the intended order.
# Disabled variant that tokenized the full, unbalanced frame instead:
#data_collected['tokens'] = data_collected['text'].apply(sent_tokenize)
balanced_ds['tokens'] = balanced_ds['text'].apply(sent_tokenize)
balanced_ds.head()

Unnamed: 0,text,generated,tokens
0,important young people attend graduating early...,1.0,[important young people attend graduating earl...
1,world beys moved technological era multitude o...,0.0,[world beys moved technological era multitude ...
2,dear senator hereby inform im favor changing e...,0.0,[dear senator hereby inform im favor changing ...
3,student think students would benefit attending...,0.0,[student think students would benefit attendin...
4,dear principal understand concern new school p...,1.0,[dear principal understand concern new school ...


### **Stemming/Lemmatization**

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Setup: make sure the 'punkt' tokenizer resource is downloaded, and create one
# Porter stemmer instance that process_text reuses for every token.
nltk.download('punkt')
stemmer = PorterStemmer()

# Load the English stop words once and keep them in a set. Asking the corpus
# reader for the list again for every single token repeats the same work and
# then scans the list linearly. Requires the 'stopwords' corpus to be downloaded.
_PROCESS_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))


def process_text(text):
    """Lower-case, tokenize, filter and stem one piece of text.

    Steps, in order:
      1. lower-case the text and split it with NLTK's ``word_tokenize``;
      2. keep only purely alphabetic tokens (drops punctuation and also any
         token that contains a digit or other non-letter);
      3. drop English stop words;
      4. reduce each remaining token to its Porter stem.

    Args:
        text: The string to process.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmed_words = [
        stemmer.stem(word)
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in _PROCESS_STOP_WORDS
    ]
    return ' '.join(stemmed_words)

# Run process_text on every row of 'text' and store the result in a new
# 'processed_text' column; the 'text' column itself is left as it is.
balanced_ds['processed_text'] = balanced_ds['text'].apply(process_text)

# Print the input and output columns side by side for a visual check.
print(balanced_ds[['text', 'processed_text']])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                   text  \
0     Hey there! \n\nSo, you know how people say "ki...   
1     As a child I can remember the many times my te...   
2     While distance learning offers convenience and...   
3     Dear Principal,\n\nI do not think that you sho...   
4      I agree with the prompt that change is possib...   
...                                                 ...   
1995  Hey there!  So, I've been thinking a lot about...   
1996  There are several advantages to using electron...   
1997  Yes S identify with Churchill and his statemen...   
1998  [Your Name]\n[Your Address]\n[City, State, Zip...   
1999  Code]\n\nDear Senator [Senator's Last Name],\n...   

                                         processed_text  
0     hey know peopl say kind goe long way yeah talk...  
1     child rememb mani time teacher would convers p...  
2     distanc learn offer conveni flexibl believ stu...  
3     dear princip think make ct mandatori student p...  
4

# **Feature Extraction**

## **TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Initialize the TF-IDF vectorizer (not fitted yet).
# max_features=10000 caps the size of the vocabulary; min_df=1 keeps every term
# that occurs in at least one document.
tfidf_vectorizer = TfidfVectorizer(min_df=1, max_features=10000)



In [None]:
# Learn the vocabulary and IDF weights from 'processed_text' and build the
# document-term matrix in one step.
# NOTE(review): this fit uses every row of balanced_ds, including rows that are
# later held out for testing, so a model evaluated on slices of this matrix has
# already seen test-set vocabulary and IDF statistics.
tfidf_matrix = tfidf_vectorizer.fit_transform(balanced_ds['processed_text'])



In [None]:
# Optional sanity check: list the first 20 terms of the fitted vocabulary.
feature_names = tfidf_vectorizer.get_feature_names_out()
print("Some feature names:", feature_names[:20])

Some feature names: ['aa' 'aaa' 'aachieveour' 'aactiv' 'aad' 'aaddingmor' 'aadvantageof' 'aae'
 'aafter' 'aai' 'aain' 'aaldbefor' 'aaldyou' 'aaltitud' 'aalwayssay' 'aan'
 'aaowour' 'aaowsom' 'aap' 'aaptitud']


# **Machine Learning (Classical vs Deep Learning)**

In [None]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # imported but not used in this cell
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

y = balanced_ds['generated']

# Split the processed texts BEFORE vectorizing. Fitting TF-IDF on all rows and
# splitting afterwards lets the held-out documents shape the vocabulary and the
# IDF weights, which leaks test information into training.
text_train, text_test, y_train, y_test = train_test_split(
    balanced_ds['processed_text'], y, test_size=0.20, random_state=42
)

# clone() returns an unfitted vectorizer with the same settings, so the
# notebook-level `tfidf_vectorizer` is not refitted by this cell.
train_vectorizer = clone(tfidf_vectorizer)
X_train = train_vectorizer.fit_transform(text_train)  # learn vocabulary/IDF from training texts only
X_test = train_vectorizer.transform(text_test)        # reuse them, unchanged, for the test texts

# max_iter is raised above the default to give the solver room to converge.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model.predict(X_test)

# Report overall accuracy and per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.94
              precision    recall  f1-score   support

         0.0       0.95      0.92      0.94       197
         1.0       0.93      0.96      0.94       203

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.94      0.94      0.94       400

