## AI Text Detector

### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk
import re
from spellchecker import SpellChecker
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import pandas as pd

### Preprocessing

In [2]:
# Fetch the NLTK data used below: 'punkt' backs the word/sentence tokenizers and
# 'averaged_perceptron_tagger' backs nltk.pos_tag.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

class TextPreprocessor:
    """Derive hand-crafted text statistics and a typo-masked copy of each text.

    The main entry point is :meth:`preprocess_data`, which adds one column per
    statistic to a DataFrame that has a ``'text'`` column. The remaining
    methods are the per-text helpers it is built from.
    """

    def __init__(self):
        # Spell checker consulted by check_typos_and_replace.
        self.spell = SpellChecker()

    @staticmethod
    def _mean_and_median(lengths):
        """Return ``(mean, median)`` of *lengths*, or ``(0.0, 0.0)`` if empty.

        The guard avoids the ``nan`` (and RuntimeWarning) that NumPy produces
        when asked to average an empty sequence.
        """
        if not lengths:
            return 0.0, 0.0
        return np.mean(lengths), np.median(lengths)

    def word_length(self, tex):
        """Return (mean, median) length of the whitespace-separated words.

        Punctuation attached to a word counts towards its length. Empty or
        whitespace-only text yields ``(0.0, 0.0)``.
        """
        lengths = [len(word) for word in tex.split()]
        return self._mean_and_median(lengths)

    def word_length2(self, tex):
        """Return (mean, median) length of the NLTK word tokens of *tex*.

        Tokens that are a substring of ``string.punctuation`` (in practice,
        single punctuation marks) are ignored. Text without any remaining
        token yields ``(0.0, 0.0)``.
        """
        lengths = [
            len(word)
            for word in nltk.word_tokenize(tex)
            if word not in string.punctuation
        ]
        return self._mean_and_median(lengths)

    def count_numbers(self, tex):
        """Return how many integer or decimal literals appear in *tex*."""
        # Non-capturing group: findall then yields the matched numbers
        # themselves rather than the (possibly empty) fractional parts.
        return len(re.findall(r'\b\d+(?:\.\d+)?\b', tex))

    def count_proper_nouns(self, tex):
        """Return the number of tokens POS-tagged as proper nouns (NNP/NNPS)."""
        tagged = nltk.pos_tag(nltk.word_tokenize(tex))
        return sum(1 for _, tag in tagged if tag in ('NNP', 'NNPS'))

    def check_typos_and_replace(self, tex):
        """Return *tex* with every token judged misspelled replaced by 'TYPO'.

        Misspellings are detected on a cleaned copy of the text, but the
        replacement is applied to the tokens of the original text. The result
        is those tokens joined by single spaces, so the original spacing is
        not preserved.
        """
        # --- Cleaned copy: strip things the spell checker should not judge ---
        # Hyphenated pairs such as 'car-centric'.
        tex_clean = re.sub(r'\b\w+-\w+\b', '', tex)
        # Possessives such as "John's" (the whole word is dropped).
        tex_clean = re.sub(r'\w+\'s\b', '', tex_clean)
        # Digit runs.
        tex_clean = re.sub(r'\d+', '', tex_clean)
        # A word plus its period when the period is immediately followed by
        # another word character (e.g. the 'e.' of 'e.g'); a period followed
        # by whitespace or end-of-text does not match this pattern.
        tex_clean = re.sub(r'\b\w+\.\b', '', tex_clean)
        # Any remaining punctuation.
        tex_clean = re.sub(r'[^\w\s]', '', tex_clean)

        # Proper nouns are excluded so that names are not reported as typos.
        tagged_words = nltk.pos_tag(nltk.word_tokenize(tex_clean))
        candidates = [
            word.lower()
            for word, tag in tagged_words
            if tag not in ('NNP', 'NNPS') and word not in string.punctuation
        ]

        # Words the spell checker does not recognise.
        misspelled = self.spell.unknown(candidates)

        # Mask matching tokens of the ORIGINAL text (case-insensitively).
        masked_tokens = [
            'TYPO' if word.lower() in misspelled else word
            for word in nltk.word_tokenize(tex)
        ]
        return ' '.join(masked_tokens)

    def preprocess_data(self, df):
        """Add the feature columns to *df* and return it.

        *df* must have a ``'text'`` column of strings. The columns are added
        to the DataFrame that was passed in (it is modified in place) and the
        same object is returned.

        Columns added: ``text_length``, ``mean_word_length``,
        ``max_word_length``, ``unique_word_count``, ``sentences``,
        ``sentence_length`` (number of sentences), ``mean_sentence`` (mean
        sentence length in characters), ``number_count``,
        ``proper_noun_count`` and ``text_with_typos_replaced``.

        Texts with no words or no sentences get 0 for the corresponding
        mean/max statistics instead of raising or producing ``nan``.
        """
        print("Starting preprocessing...")

        # Character count of each text.
        df['text_length'] = df['text'].str.len()

        # Word-length statistics; split once and reuse for both columns.
        word_lengths = df['text'].map(lambda tex: [len(word) for word in tex.split()])
        df['mean_word_length'] = word_lengths.map(
            lambda lengths: np.mean(lengths) if lengths else 0.0
        )
        df['max_word_length'] = word_lengths.map(
            lambda lengths: np.max(lengths) if lengths else 0
        )

        # Number of distinct lower-cased tokens, ignoring punctuation marks.
        df['unique_word_count'] = df['text'].map(
            lambda tex: len({
                word.lower()
                for word in nltk.word_tokenize(tex)
                if word not in string.punctuation
            })
        )

        # Sentence statistics.
        df['sentences'] = df['text'].map(nltk.sent_tokenize)
        df['sentence_length'] = df['sentences'].map(len)
        df['mean_sentence'] = df['sentences'].map(
            lambda sents: np.mean([len(sent) for sent in sents]) if sents else 0.0
        )

        # Count numbers
        df['number_count'] = df['text'].map(self.count_numbers)

        # Count proper nouns
        df['proper_noun_count'] = df['text'].map(self.count_proper_nouns)

        # Mask suspected typos with the literal token 'TYPO'.
        df['text_with_typos_replaced'] = df['text'].map(self.check_typos_and_replace)

        print("Preprocessing completed.")
        return df


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sahanamanjunath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sahanamanjunath/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Loading tokenizer and model

In [3]:
# Load the pretrained "bert-base-uncased" tokenizer. The dataset classes in this
# notebook read this module-level `tokenizer` inside their __getitem__ methods.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Labelled dataset: typo-masked text + numeric features + 'generated' label.
class TextDataset(Dataset):
    """Torch dataset over a preprocessed DataFrame that includes labels.

    Expects the columns ``text_with_typos_replaced``, ``generated`` and the
    seven numeric feature columns listed in ``__init__``. Tokenization uses
    the module-level ``tokenizer``.
    """

    def __init__(self, df):
        # Order matters: it fixes the layout of the numeric feature vector.
        feature_columns = [
            'text_length',
            'mean_word_length',
            'sentence_length',
            'mean_sentence',
            'unique_word_count',
            'proper_noun_count',
            'number_count',
        ]
        self.texts = df['text_with_typos_replaced'].tolist()
        self.labels = df['generated'].tolist()
        self.features = df[feature_columns].values

    def __len__(self):
        """Number of rows (texts) in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, features, label)`` for row *idx*."""
        # Every text is padded/truncated to exactly 128 tokens.
        encoded = tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt',
        )

        numeric = torch.tensor(self.features[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)

        # squeeze(0) drops the leading dimension added by return_tensors='pt'.
        return (
            encoded['input_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            numeric,
            target,
        )

# BERT encoder plus a single linear head over [pooled output, numeric features].
class BertClassifier(nn.Module):
    """Two-class classifier combining BERT's pooled output with numeric features.

    Attribute names (``bert``, ``fc``, ``dropout``, ``relu``) are the keys of
    the state dict, so they must stay as they are for saved weights to load.
    """

    def __init__(self, num_features):
        super().__init__()
        bert_hidden_size = 768  # width of the bert-base pooled output
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Input: pooled BERT vector concatenated with the numeric features;
        # output: two class logits.
        self.fc = nn.Linear(bert_hidden_size + num_features, 2)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask, numerical_features):
        """Return the class logits for a batch."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoder_out.pooler_output

        # Concatenate along the feature dimension, then ReLU -> dropout -> linear.
        # Note the ReLU is applied to the numeric features as well.
        fused = torch.cat((pooled, numerical_features), dim=1)
        activated = self.relu(fused)
        logits = self.fc(self.dropout(activated))
        return logits

# Build the classifier. Its constructor already loads the pretrained
# "bert-base-uncased" encoder into model.bert.
model = BertClassifier(num_features=7)  # 7 features as per your dataset

# NOTE(review): this reloads the same pretrained checkpoint into model.bert and
# therefore looks redundant; it is kept so that `bert_model` stays defined for
# any later cell that uses it.
bert_model = BertModel.from_pretrained("bert-base-uncased")
model.bert.load_state_dict(bert_model.state_dict(), strict=False)

# Load the trained weights (fine-tuned encoder and the classification head).
# NOTE(security): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
state_dict = torch.load("model_weights.pth", map_location=torch.device('cpu'))

# strict=False tolerates harmless mismatches (e.g. 'bert.embeddings.position_ids'),
# but it would also silently accept a checkpoint that lacks the classifier
# head, leaving `fc` randomly initialised. Inspect the result to rule that out.
load_result = model.load_state_dict(state_dict, strict=False)

missing_head_keys = [
    key for key in load_result.missing_keys if not key.startswith("bert.")
]
if missing_head_keys:
    raise RuntimeError(
        "model_weights.pth does not provide weights for: "
        f"{missing_head_keys}; the classifier head would stay randomly initialised."
    )

# Keep the load report as the cell's displayed value.
load_result

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relations

_IncompatibleKeys(missing_keys=['bert.embeddings.position_ids'], unexpected_keys=[])

In [4]:
# Switch to evaluation mode so that dropout layers are inactive during inference
model.eval()

# Run inference on the CPU (the checkpoint was also mapped to the CPU when loaded).
# model.to(device) returns the model, which is why the cell prints its structure.
device = torch.device('cpu')
model.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

### Inference based on single example

In [5]:
# Sample test essay (replace this with your actual test essay).
# The misspellings are part of the sample input.
sample_essay = "Artificial Intelligence has become soo popular now! I remember doing all my homeworks myself early 2000s but now I take help of GPT"

# Wrap the essay in a one-row DataFrame with the 'text' column that
# preprocess_data reads
test_df = pd.DataFrame({'text': [sample_essay]})

# Instantiate the preprocessor
preprocessor = TextPreprocessor()

# Run preprocessing on the single essay. preprocess_data adds the feature
# columns to the DataFrame it is given and returns that DataFrame.
processed_test_df = preprocessor.preprocess_data(test_df)



Starting preprocessing...
Preprocessing completed.


In [6]:
# Display the preprocessed row to inspect the derived feature columns
processed_test_df

Unnamed: 0,text,text_length,mean_word_length,max_word_length,unique_word_count,sentences,sentence_length,mean_sentence,number_count,proper_noun_count,text_with_typos_replaced
0,Artificial Intelligence has become soo popular...,131,4.73913,12,21,[Artificial Intelligence has become soo popula...,2,65.0,0,2,Artificial Intelligence has become soo popular...


In [8]:
class TextDatasetTest(Dataset):
    """Unlabelled torch dataset over a preprocessed DataFrame, for inference.

    Expects the column ``text_with_typos_replaced`` and the seven numeric
    feature columns listed in ``__init__``; no label column is read.
    Tokenization uses the module-level ``tokenizer``.
    """

    def __init__(self, df):
        # Order matters: it fixes the layout of the numeric feature vector.
        feature_columns = [
            'text_length',
            'mean_word_length',
            'sentence_length',
            'mean_sentence',
            'unique_word_count',
            'proper_noun_count',
            'number_count',
        ]
        self.texts = df['text_with_typos_replaced'].tolist()
        self.features = df[feature_columns].values

    def __len__(self):
        """Number of rows (texts) in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, features)`` for row *idx*."""
        # Every text is padded/truncated to exactly 128 tokens.
        encoded = tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt',
        )

        numeric = torch.tensor(self.features[idx], dtype=torch.float32)

        # squeeze(0) drops the leading dimension added by return_tensors='pt'.
        return (
            encoded['input_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            numeric,
        )


In [9]:
# Tokenizer for BERT
# NOTE(review): the same "bert-base-uncased" tokenizer is already loaded earlier
# in this notebook; this rebinding looks redundant.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the preprocessed essay in the unlabelled dataset and serve it in order
# (shuffle=False) in batches of up to 32 rows.
val_dataset = TextDatasetTest(processed_test_df)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [10]:
from tqdm import tqdm

# Classify the sample(s) served by a DataLoader and print the first prediction
def classify_single_sample(model, test_loader, device):
    """Run the model over *test_loader* and print the label of the first sample.

    Args:
        model: module called as ``model(input_ids, attention_mask,
            numerical_features)`` and returning per-class scores with the
            class along dimension 1.
        test_loader: iterable yielding ``(input_ids, attention_mask,
            numerical_features)`` batches.
        device: device the batches are moved to before the forward pass.

    Prints ``Prediction: AI`` when the first predicted class is 1 and
    ``Prediction: Human`` otherwise. Nothing is printed if the loader yields
    no batches. Batches of any size are accepted.
    """
    model.eval()  # Set the model to evaluation mode

    # Predicted class index of every sample seen, in loader order
    predictions = []

    with torch.no_grad():  # No need to track gradients during inference
        # Wrap test_loader with tqdm for progress bar
        progress_bar = tqdm(test_loader, desc="Classifying Sample", leave=True)

        for input_ids, attention_mask, numerical_features in progress_bar:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            numerical_features = numerical_features.to(device)

            # Forward pass to get the model's output
            outputs = model(input_ids, attention_mask, numerical_features)

            # Index of the highest score per sample (0 for human, 1 for AI).
            # .tolist() works for any batch size, whereas .item() is only
            # valid for a one-element tensor.
            batch_predictions = torch.argmax(outputs, dim=1).tolist()
            predictions.extend(batch_predictions)

            # Show the most recent prediction next to the progress bar
            if batch_predictions:
                progress_bar.set_postfix(prediction=batch_predictions[-1])

    # Output the final prediction for the (first) sample
    if predictions:
        print(f"Prediction: {'AI' if predictions[0] == 1 else 'Human'}")

In [11]:
# Classify the preprocessed essay served by val_loader with the loaded model;
# the function prints the resulting label ("AI" or "Human")
classify_single_sample(model, val_loader, device)

Classifying Sample: 100%|███████████| 1/1 [00:00<00:00,  5.85it/s, prediction=0]

Prediction: Human



