In [1]:
!pip install transformers datasets torch scikit-learn pandas accelerate matplotlib seaborn numpy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shanegerami/ai-vs-human-text")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/ai-vs-human-text


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



# Load the dataset
df = pd.read_csv(f"{path}/AI_Human.csv")

# Preview the data
print(df.head())
print(df.shape)

                                                text  generated
0  Cars. Cars have been around since they became ...        0.0
1  Transportation is a large necessity in most co...        0.0
2  "America's love affair with it's vehicles seem...        0.0
3  How often do you ride in a car? Do you drive a...        0.0
4  Cars are a wonderful thing. They are perhaps o...        0.0
(487235, 2)


In [4]:
import pandas as pd


# Split manually into train and test
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['generated'])


train_df_small = train_df.groupby('generated').apply(
    lambda x: x.sample(frac=0.03, random_state=42)
)

test_df_small = test_df.groupby('generated').apply(
    lambda x: x.sample(frac=0.03, random_state=42)
)

# Preview the reduced datasets
print("Train Shape:", train_df_small.shape)
print("Test Shape:", test_df_small.shape)
print("\nTrain Class Distribution:")
print(train_df_small['generated'].value_counts(normalize=True))
print("\nTest Class Distribution:")
print(test_df_small['generated'].value_counts(normalize=True))


Train Shape: (11693, 2)
Test Shape: (2924, 2)

Train Class Distribution:
generated
0.0    0.62764
1.0    0.37236
Name: proportion, dtype: float64

Test Class Distribution:
generated
0.0    0.627565
1.0    0.372435
Name: proportion, dtype: float64


  train_df_small = train_df.groupby('generated').apply(
  test_df_small = test_df.groupby('generated').apply(


A training set of 11,693 samples and a testing set of 2,924 samples have been created from the downsampled dataset.  With around 63% of the text in both datasets being human-generated (class 0.0) and 37% being AI-generated (class 1.0), the class distribution is still unbalanced.

In [5]:
# Check class distribution before undersampling
print("Train class distribution before undersampling:")
print(train_df_small['generated'].value_counts())

# Separate human and ai generated classes
human = train_df_small[train_df_small['generated'] == 0.0]
ai = train_df_small[train_df_small['generated'] == 1.0]

# Separate human and ai generated for test data
human_test = test_df_small[test_df_small['generated'] == 0.0]
ai_test = test_df_small[test_df_small['generated'] == 1.0]

# Undersample human class in train data to match ai class size
human_undersampled = human.sample(n=len(ai), random_state=42)

# Combine ai class with downsampled human class for train data
train_df_balanced = pd.concat([human_undersampled, ai])

# Shuffle the balanced train dataset
train_df_balanced = train_df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Undersample human class in test data to match ai class size
human_undersampled_test = human_test.sample(n=len(ai_test), random_state=42)

# Combine ai class with downsampled human class for test data
test_df_balanced = pd.concat([human_undersampled_test, ai_test])

# Shuffle the balanced test dataset
test_df_balanced = test_df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)


# Check class distribution after undersampling for train data
print("\nTrain class distribution after undersampling:")
print(train_df_balanced['generated'].value_counts(normalize=True))

# Check class distribution after undersampling for test data
print("\nTest class distribution after undersampling:")
print(test_df_balanced['generated'].value_counts(normalize=True))

Train class distribution before undersampling:
generated
0.0    7339
1.0    4354
Name: count, dtype: int64

Train class distribution after undersampling:
generated
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64

Test class distribution after undersampling:
generated
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64


In [6]:

df = train_df_balanced.reset_index(drop=True)

# Character length summary by class
char_length_stats = df.groupby('generated')['text'].apply(lambda x: x.str.len().describe())
print("Character Length Summary by Class:\n", char_length_stats)

# Word count summary by class
word_count_stats = df.groupby('generated')['text'].apply(lambda x: x.str.split().apply(len).describe())
print("\nWord Count Summary by Class:\n", word_count_stats)


Character Length Summary by Class:
 generated       
0.0        count    4354.000000
           mean     2336.546624
           std      1052.047430
           min       269.000000
           25%      1541.000000
           50%      2140.000000
           75%      2891.000000
           max      9457.000000
1.0        count    4354.000000
           mean     2106.435921
           std       773.818839
           min       272.000000
           25%      1632.000000
           50%      2030.000000
           75%      2457.000000
           max      7075.000000
Name: text, dtype: float64

Word Count Summary by Class:
 generated       
0.0        count    4354.000000
           mean      418.721635
           std       181.447852
           min        61.000000
           25%       280.250000
           50%       389.000000
           75%       515.000000
           max      1366.000000
1.0        count    4354.000000
           mean      342.249655
           std       115.780798
        

The balanced dataset clearly distinguishes between texts created by AI and those authored by humans, according to the exploratory data analysis.  With an average of 418.7 words and 2336.5 characters, human-written texts (Class 0.0) are often lengthier.  AI-generated texts (Class 1.0), on the other hand, have a mean character count of 2106.4 and a mean word count of 342.2, making them typically shorter and less diversified.

In [7]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from collections import Counter
import re

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Set max length for tokens per sample according to EDA 75% of the dataset has text length of 500
max_text_length = 500

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text preprocessing function
def preprocess_text(text):
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Lowercase, lemmatize, remove non-alphabetic and stopwords
    processed = [
        lemmatizer.lemmatize(word.lower())
        for word in tokens
        if word.isalpha() and word.lower() not in stop_words
    ]
    return processed[:max_text_length]

# Apply preprocessing to balanced train and test data
train_df_balanced['tokens'] = train_df_balanced['text'].apply(preprocess_text)
test_df_balanced['tokens'] = test_df_balanced['text'].apply(preprocess_text)

# Flatten all tokens from balanced train data to build vocabulary
all_tokens = [token for tokens_list in train_df_balanced['tokens'] for token in tokens_list]

# Count word frequency
word_freq = Counter(all_tokens)

# Vocabulary size
vocab_size = len(word_freq)
print(f"Vocabulary size after preprocessing: {vocab_size}")

# Top 10 most common words
print("\nTop 10 most common words:")
for word, count in word_freq.most_common(10):
    print(f"{word}: {count}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Vocabulary size after preprocessing: 28668

Top 10 most common words:
student: 26200
car: 19834
people: 19081
would: 16175
school: 12499
also: 11533
help: 11432
like: 10893
time: 10494
electoral: 10030


Initially, a function to remove stopwords, lowercase, lemmatize, and tokenize text is defined.  The training and testing datasets are then subjected to this function.  The final size of the vocabulary, which is constructed from the processed training data, is presented along with the ten most often used terms.  After preprocessing, there are 49,697 words in the vocabulary.  Among the most commonly used terms are "car," "people," "student," and "would."  Words like "student," "school," and "electoral" indicate that the dataset probably includes content that is academic or essay-style.

In [8]:
print(train_df_balanced['generated'].value_counts())


generated
0.0    4354
1.0    4354
Name: count, dtype: int64


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
# max_features=5000 limits vocab to 5000 most frequent terms (including unigrams and bigrams)

# Fit on train tokens joined as strings
X_train = vectorizer.fit_transform(train_df_balanced['tokens'].apply(lambda tokens: ' '.join(tokens)))

# Transform test tokens joined as strings with the same vectorizer
X_test = vectorizer.transform(test_df_balanced['tokens'].apply(lambda tokens: ' '.join(tokens)))


In [10]:
print("Training data shape:", X_train.shape)
print("First few feature names:", vectorizer.get_feature_names_out()[:10])

Training data shape: (8708, 5000)
First few feature names: ['aad' 'ability' 'able' 'able attend' 'able drive' 'able get' 'able go'
 'able learn' 'able make' 'able see']


In [11]:
from sklearn.linear_model import LogisticRegression

# Initialize Logistic Regression classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, train_df_balanced['generated'])

# Predict on test set
test_preds = clf.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

# Calculate accuracy
acc = accuracy_score(test_df_balanced['generated'], test_preds)
print(f"Logistic Regression Accuracy: {acc:.4f}")

# Display precision, recall, F1 for each class and overall
print("Classification Report:\\n", classification_report(test_df_balanced['generated'], test_preds, target_names=['0','1']))

# Compute confusion matrix
cm = confusion_matrix(test_df_balanced['generated'], test_preds)
print("Confusion Matrix:\\n", cm)

Logistic Regression Accuracy: 0.9766
Classification Report:\n               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1089
           1       0.98      0.98      0.98      1089

    accuracy                           0.98      2178
   macro avg       0.98      0.98      0.98      2178
weighted avg       0.98      0.98      0.98      2178

Confusion Matrix:\n [[1062   27]
 [  24 1065]]


With an overall accuracy of 97.66%, the model performs admirably.  For both human (class 0) and AI (class 1) texts, the classification report shows strong and balanced accuracy, recall, and F1-score values of 0.98.  The model accurately detected 1062 human texts and 1065 AI-generated texts, with a low amount of misclassifications (27 false positives and 24 false negatives), according to the confusion matrix, which provides more specific information about this performance.

In [13]:
sentence="In the ever-evolving landscape of technology, artificial intelligence continues to redefine the boundaries of possibility. From healthcare to finance, AI-driven innovations are enhancing decision-making processes, automating complex tasks, and transforming user experiences. As algorithms become more sophisticated, ethical considerations and transparency remain crucial to ensure equitable progress. The future promises even greater integration of AI in our daily lives, reshaping how we interact with the world around us."
# Preprocess and transform
tokens = preprocess_text(sentence)
tfidf_vector = vectorizer.transform([' '.join(tokens)])

# Predict using the trained classifier
predicted_label = clf.predict(tfidf_vector)[0]
predicted_prob = clf.predict_proba(tfidf_vector)[0]

# Output
print("Predicted label:", predicted_label)
print("Prediction confidence (class probabilities):", predicted_prob)

Predicted label: 1.0
Prediction confidence (class probabilities): [0.03342533 0.96657467]


In [14]:

sentence="""
Dear senator, Retain the Electoral College. The Electoral College consists of 538 electors and a majority of 270 electors is is required to elect the President. Each state has hisher own electors which are chosen by the candidate political party. You should keep the Electoral College because you have certainty of outcome, and the President is everyones not just yours.

The first reason why you should stay with the Electoral College is because you are certain that the outcome will be in favor of one of the candidates. A tie in the nationwide electoral vote may happen but it is very unlikely that it will even though that 538number of electors in the Electoral College is a even numberS.3.For example in 2012's election, Obama received 61.7 percent of the electoral votes compared to 51.3 percent of the popular cast for him and rodney because all states award electoral votes on a winnertakeall basis even a slight plurality in a state creates a landslide electoralvote victory in that stateS.3. However,because of the winnertakeall system in each state,candidates dont spend time in staes they know they have no chance of winning, they only focus on the close,tight races in the "swing"statesS.2. But, the winning candidates share of the Electoral College invariably exceeds his share of the popular vote.

The second reason you should keep the Electoral College is because the president is everyone's. The Electoral College requires a presidential candidate to have transregional appeal. No region has enough electoral votes to elect a president by themselves. So for example,a solid regional favorite,such as rodney was in the South,has no incentive to campaign heavily in those states for he gains no electoral votes by increasing his plurality in states he knows for sure that he will winS.3.A president with only his regional apppeal is very unlikely to be a successful president. The residents of the other regions may feel like there votes dont count or that he really isnt there president.

In conclusion, you should stay with the Electoral College simply because you most likely not going to have a tie and because the president is everyone's.
"""



tokens = preprocess_text(sentence)
cleaned_text = ' '.join(tokens)

tfidf_vector = vectorizer.transform([cleaned_text])


predicted_label = clf.predict(tfidf_vector)[0]
predicted_prob = clf.predict_proba(tfidf_vector)[0]


print("Predicted label:", predicted_label)
print("Prediction confidence (class probabilities):", predicted_prob)


Predicted label: 0.0
Prediction confidence (class probabilities): [0.82220408 0.17779592]


In [15]:

sentence="""
Dear Senator,

Retain the Electoral College. The Electoral College consists of 538 electors, and a majority of 270 electors is required to elect the President. Each state has its own electors, which are chosen by the candidate’s political party. You should keep the Electoral College because it provides certainty of outcome, and the President represents everyone, not just one group.

The first reason why you should stay with the Electoral College is because you are certain that the outcome will be in favor of one of the candidates. A tie in the nationwide electoral vote may happen, but it is very unlikely, even though the 538 electors in the Electoral College is an even number. For example, in the 2012 election, Obama received 61.7 percent of the electoral votes compared to 51.3 percent of the popular vote cast for him. This is because all states award electoral votes on a winner-take-all basis — even a slight plurality in a state creates a landslide electoral vote victory in that state. However, because of the winner-take-all system in each state, candidates don’t spend time in states they know they have no chance of winning; they only focus on the close, tight races in the “swing” states. But the winning candidate’s share of the Electoral College invariably exceeds his share of the popular vote.

The second reason you should keep the Electoral College is because the President is everyone’s President. The Electoral College requires a presidential candidate to have transregional appeal. No region has enough electoral votes to elect a president by itself. For example, a solid regional favorite, such as Rodney was in the South, has no incentive to campaign heavily in those states, for he gains no additional electoral votes by increasing his plurality in states he knows for sure he will win. A president with only regional appeal is very unlikely to be a successful president. The residents of other regions may feel like their votes don’t count or that he really isn’t their president.

In conclusion, you should stay with the Electoral College simply because it is very unlikely that there will be a tie, and because the President is everyone’s.
"""



tokens = preprocess_text(sentence)
cleaned_text = ' '.join(tokens)


tfidf_vector = vectorizer.transform([cleaned_text])

predicted_label = clf.predict(tfidf_vector)[0]
predicted_prob = clf.predict_proba(tfidf_vector)[0]


print("Predicted label:", predicted_label)
print("Prediction confidence (class probabilities):", predicted_prob)


Predicted label: 0.0
Prediction confidence (class probabilities): [0.79946925 0.20053075]


The model overfits, it has learnt dataset so well, even when a text with typos and grammatical error is entered it shows AI generated.