In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
train_essays = pd.read_csv("/content/drive/MyDrive/train_essays.csv")
test_essays = pd.read_csv("/content/drive/MyDrive/test_essays.csv")

In [4]:
train_essays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1378 non-null   object
 1   prompt_id  1378 non-null   int64 
 2   text       1378 non-null   object
 3   generated  1378 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 43.2+ KB


In [5]:
train_essays.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
    words = text.split()  # Tokenize
    words = [word.lower() for word in words if word.isalpha()]  # Lowercase and remove non-alphabetic words
    words = [word for word in words if word not in stop_words]  # Remove stop words
    return ' '.join(words)

train_essays['clean_text'] = train_essays['text'].apply(clean_text)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(train_essays['clean_text'], train_essays['generated'], test_size=0.2, random_state=42)


In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, padding=True, truncation=True, max_length=128)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
encoded_train = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='pt')
encoded_val = tokenizer(X_val.tolist(), padding=True, truncation=True, return_tensors='pt')

In [11]:
train_labels = torch.tensor(y_train.values)
val_labels = torch.tensor(y_val.values)

In [12]:
train_dataset = TensorDataset(encoded_train['input_ids'], encoded_train['attention_mask'], train_labels)
val_dataset = TensorDataset(encoded_val['input_ids'], encoded_val['attention_mask'], val_labels)

In [13]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [14]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
epochs = 20



In [16]:
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping to avoid exploding gradients
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss:.2f}")

Epoch 1/20, Average Training Loss: 0.04
Epoch 2/20, Average Training Loss: 0.02
Epoch 3/20, Average Training Loss: 0.01
Epoch 4/20, Average Training Loss: 0.01
Epoch 5/20, Average Training Loss: 0.01
Epoch 6/20, Average Training Loss: 0.00
Epoch 7/20, Average Training Loss: 0.00
Epoch 8/20, Average Training Loss: 0.00
Epoch 9/20, Average Training Loss: 0.00
Epoch 10/20, Average Training Loss: 0.00
Epoch 11/20, Average Training Loss: 0.00
Epoch 12/20, Average Training Loss: 0.00
Epoch 13/20, Average Training Loss: 0.00
Epoch 14/20, Average Training Loss: 0.00
Epoch 15/20, Average Training Loss: 0.00
Epoch 16/20, Average Training Loss: 0.00
Epoch 17/20, Average Training Loss: 0.00
Epoch 18/20, Average Training Loss: 0.00
Epoch 19/20, Average Training Loss: 0.00
Epoch 20/20, Average Training Loss: 0.00


In [17]:
model.save_pretrained('/content/drive/My Drive/bert_model')

# Save the tokenizer to Google Drive
tokenizer.save_pretrained('/content/drive/My Drive/bert_tokenizer')

('/content/drive/My Drive/bert_tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/bert_tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/bert_tokenizer/vocab.txt',
 '/content/drive/My Drive/bert_tokenizer/added_tokens.json')

In [18]:
model.eval()
val_preds = []
val_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        val_labels.extend(labels.cpu().numpy())

In [19]:
val_accuracy = accuracy_score(val_labels, val_preds)
print(f"Validation Accuracy: {val_accuracy:.2f}")

Validation Accuracy: 1.00


In [25]:
import pandas as pd
import torch
from transformers import BertTokenizer


def manual_testing(text):

    testing_essay = {"text": [text]}
    test_df = pd.DataFrame(testing_essay)
    test_df['text'] = test_df['text'].apply(clean_text)


    test_inputs = tokenizer(test_df['text'].tolist(), padding=True, truncation=True, return_tensors='pt')


    test_inputs = {key: value.to(device) for key, value in test_inputs.items()}


    with torch.no_grad():
        outputs = model(**test_inputs)
        logits = outputs.logits


    probabilities = torch.softmax(logits, dim=1)[:, 1].cpu().numpy() * 100
    return print(f"Predicted Probability (AI-Generated): {probabilities[0]:.2f}%")




In [26]:
text = str(input())
manual_testing(text)


Developed a comprehensive travel solution featuring a Travel Itinerary Planner, Group Travel Planner, Travel Community, and other key features, prioritizing user convenience and engagement based on extensive market research and feature prioritization metrics.
Predicted Probability (AI-Generated): 1.81%
