In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.5 MB/s[0m eta [36m0:00:0

In [None]:
import spacy
from collections import Counter, defaultdict
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import sent_tokenize, word_tokenize
import nltk
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import xgboost as xgb
from scipy.stats import zscore
import pandas as pd


# Load the spaCy pipeline named 'en_core_web_sm' once, at notebook start-up.
# NOTE(review): `nlp` is not referenced by any cell visible in this section — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [None]:
def load_and_preprocess_data(path):
    """Read a CSV and split its rows by the ``label`` column.

    Args:
        path: Anything ``pd.read_csv`` accepts (file path or file-like object).
            The file must contain a ``label`` column holding the values 0 and 1.

    Returns:
        tuple: ``(Human_df, GPT_df)`` -- the rows whose label is 0 and the rows
        whose label is 1, each with a fresh 0..n-1 index.

    Raises:
        KeyError: if the ``label`` column, or either label value, is missing.
    """
    frame = pd.read_csv(path)

    # One sub-frame per distinct label value
    parts_by_label = {label: part for label, part in frame.groupby('label')}

    human_part = parts_by_label[0].reset_index(drop=True)
    gpt_part = parts_by_label[1].reset_index(drop=True)
    return human_part, gpt_part

def preprocess_dataframe(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps: rename ``answer`` to ``text``, drop the ``id`` and ``question``
    columns, and keep only rows whose text has at least 10
    whitespace-separated tokens.

    Args:
        df: DataFrame containing ``id``, ``question`` and ``answer`` columns.

    Returns:
        DataFrame: the filtered rows, keeping their original index labels.
    """
    renamed = df.rename(columns={"answer": "text"})
    trimmed = renamed.drop(columns=['id', 'question'])

    # Whitespace token count per row
    token_counts = trimmed['text'].str.split().str.len()
    return trimmed[token_counts >= 10]

def split_train_val(df, n_val=7000, random_state=42):
    """Split ``df`` into disjoint training and validation frames.

    Args:
        df: DataFrame to split. Its index labels should be unique, because the
            sampled rows are removed from the training part by index label.
        n_val: Number of rows to sample for validation (default 7000).
        random_state: Seed passed to ``DataFrame.sample`` so the split is
            reproducible (default 42).

    Returns:
        tuple: ``(df_train, df_val)``. ``df_train`` has a fresh 0..n-1 index;
        ``df_val`` keeps the index labels it had in ``df``.

    Raises:
        ValueError: raised by ``DataFrame.sample`` when ``n_val`` exceeds the
            number of rows in ``df``.
    """
    # Sample the validation rows
    df_val = df.sample(n=n_val, random_state=random_state)

    # Remove these rows from the original data to avoid overlap
    df_train = df.drop(df_val.index).reset_index(drop=True)

    return df_train, df_val

# Read the train and test CSVs; each call yields a (human, GPT) pair of frames
Human_df_train, GPT_df_train = load_and_preprocess_data(
    '/content/drive/MyDrive/HC3 - filtered/en_train.csv'
)
Human_df_test, GPT_df_test = load_and_preprocess_data(
    '/content/drive/MyDrive/HC3 - filtered/en_test.csv'
)

# Carve a validation set out of each training frame
(Human_df_train, Human_df_val), (GPT_df_train, GPT_df_val) = (
    split_train_val(Human_df_train),
    split_train_val(GPT_df_train),
)

# Clean all six frames with the same routine
(
    Human_df_train,
    GPT_df_train,
    Human_df_val,
    GPT_df_val,
    Human_df_test,
    GPT_df_test,
) = map(
    preprocess_dataframe,
    (
        Human_df_train,
        GPT_df_train,
        Human_df_val,
        GPT_df_val,
        Human_df_test,
        GPT_df_test,
    ),
)

In [None]:
# Tag every frame with the split it belongs to
Human_df_train['source'] = GPT_df_train['source'] = 'train'
Human_df_test['source'] = GPT_df_test['source'] = 'test'
Human_df_val['source'] = GPT_df_val['source'] = 'val'

# Stack the three splits of each class into a single frame with a fresh index
GPT_df = pd.concat(
    [GPT_df_train, GPT_df_test, GPT_df_val],
    ignore_index=True,
)
Human_df = pd.concat(
    [Human_df_train, Human_df_test, Human_df_val],
    ignore_index=True,
)

In [None]:
import re
from tqdm import tqdm
tqdm.pandas()

class TextPreprocessor:
    """Regex-based text normaliser applied to every answer before tokenisation.

    All patterns are compiled once in ``__init__``; ``preprocess`` applies them
    in a fixed order (the order matters, e.g. whitespace is collapsed last).
    """

    def __init__(self):
        # Compile regex patterns for performance
        self.space_before_punct = re.compile(r'\s+([.,?!;:])')
        self.space_after_punct = re.compile(r'([.,?!;:])\s+')
        self.contractions = re.compile(r"(\w) n\'t")
        self.double_dashes = re.compile(r"\s*--\s*")
        self.hyphens = re.compile(r"\s+-\s+")
        self.single_quotes_start = re.compile(r"(\w)'(\s)")
        self.single_quotes_end = re.compile(r"(\s)'(\w)")
        self.mentions = re.compile(r'(@.*?)[\s]')
        self.links = re.compile(r'https?:\/\/[^\s\n\r]+')
        self.embedded_quotes = re.compile(r'"\s([^"]+)\s"')
        self.single_quotes_embedded = re.compile(r"'\s([^']+)\s'")
        self.space_after_open_parenthesis = re.compile(r'\(\s')
        self.space_before_close_parenthesis = re.compile(r'\s\)')
        self.multi_spaces = re.compile(r'\s+')

    def remove_unicode(self, text):
        """Return ``text`` with every character outside 7-bit ASCII removed."""
        return ''.join(char for char in text if ord(char) < 128)

    def preprocess(self, text):
        """Normalise one string and return the cleaned result.

        Args:
            text: Raw text.

        Returns:
            str: ``text`` with punctuation spacing tidied, newlines, backslashes
            and asterisks removed, non-ASCII characters dropped, "n't"
            contractions re-attached, @mentions and http(s) links blanked,
            '#' replaced by a space, "&amp;" decoded, padding inside quotes
            and parentheses removed, and whitespace runs collapsed to one space.
        """
        text = self.space_before_punct.sub(r'\1', text)
        text = self.space_after_punct.sub(r'\1 ', text)
        text = text.replace("\\'", "'")
        text = text.replace("\n", " ").replace("\\", "").replace('*', '')
        text = self.remove_unicode(text)
        text = self.contractions.sub(r"\1n't", text)
        text = self.double_dashes.sub("--", text)
        text = self.hyphens.sub("-", text)
        # Drops the whitespace that follows a word-final apostrophe
        text = self.single_quotes_start.sub(r"\1'", text)
        # Keep the word character that follows the apostrophe (group 2).
        # Substituting group 1 (the whitespace) here deleted that character,
        # e.g. "We 're" became "We ' e".
        text = self.single_quotes_end.sub(r" '\2", text)
        text = self.mentions.sub(' ', text)
        text = self.links.sub(' ', text)
        text = text.replace('#', ' ').replace("&amp;", "&")
        text = self.embedded_quotes.sub(r'"\1"', text)
        text = self.single_quotes_embedded.sub(r"\1", text)
        text = self.space_after_open_parenthesis.sub('(', text)
        text = self.space_before_close_parenthesis.sub(')', text)
        text = self.multi_spaces.sub(' ', text)
        return text


# Clean the text column of both frames with one shared preprocessor instance
preprocessor = TextPreprocessor()
GPT_df['text'] = GPT_df['text'].progress_map(preprocessor.preprocess)
Human_df['text'] = Human_df['text'].progress_map(preprocessor.preprocess)

100%|██████████| 26752/26752 [00:07<00:00, 3712.53it/s]
100%|██████████| 53857/53857 [00:10<00:00, 5063.87it/s]


In [None]:
# Join every answer of each class into one space-separated string
gpt_concatenated_string = ' '.join(GPT_df['text'].tolist())
human_concatenated_string = ' '.join(Human_df['text'].tolist())

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources named 'wordnet' and 'punkt'; the captured output
# shows them being downloaded into /root/nltk_data.
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


bat


In [None]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.14.

In [None]:
from torch.utils.data import Dataset
import numpy as np
import re
import nltk
import string

class TextDataset(Dataset):
    """Dataset that tokenises a frame's ``text`` column once, up front.

    Args:
        dataframe: Frame with a ``text`` column and, optionally, a ``label``
            column.
        tokenizer: Callable invoked as ``tokenizer(texts, ...)`` whose result
            can be indexed with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Every sequence is padded/truncated to this many tokens.

    Each item is ``(features, label)`` where ``features`` is a dict with the
    ``input_ids`` and ``attention_mask`` rows for that text, and ``label`` is
    the row's label, or -1 when the frame has no ``label`` column.
    """

    def __init__(self, dataframe, tokenizer, max_length=500):
        texts = dataframe.text.values.tolist()
        self._print_random_samples(texts)

        self.texts = tokenizer(texts,
                              add_special_tokens=True,
                              max_length=max_length,
                              padding='max_length',
                              return_token_type_ids=True,  # Include this only if necessary for your task/model
                              truncation=True,
                              return_tensors="pt")

        if 'label' in dataframe:
            self.labels = dataframe.label.values.tolist()

    def _print_random_samples(self, texts):
        """Print up to five randomly chosen texts as a quick sanity check."""
        # Sampling without replacement needs a population at least as large as
        # the sample, so cap the sample size for small frames.
        sample_size = min(5, len(texts))
        if sample_size:
            random_entries = np.random.choice(len(texts), sample_size, replace=False)
            for i in random_entries:
                print(f"Entry {i}: {texts[i]}")
        print()

    def __len__(self):
        return len(self.texts['input_ids'])

    def __getitem__(self, idx):
        text = {'input_ids': self.texts['input_ids'][idx],
                'attention_mask': self.texts['attention_mask'][idx]}
        # -1 marks "no label available"
        label = -1
        if hasattr(self, 'labels'):
            label = self.labels[idx]
        return text, label

In [None]:
from torch import nn

class Classifier(nn.Module):
    """Binary classification head on top of a transformer encoder.

    Args:
        base_model: Encoder module. It is called as
            ``base_model(input_ids=..., attention_mask=...)`` and element 0 of
            its output is indexed as ``[:, 0]`` to obtain one vector per
            sequence (the first token's position).

    The head is ``Linear(hidden, 32) -> ReLU -> Linear(32, 1) -> Sigmoid``, so
    ``forward`` returns probabilities of shape ``(batch, 1)``.
    """

    def __init__(self, base_model):
        super(Classifier, self).__init__()

        self.bert = base_model

        # Size the first layer from the encoder's own config when it exposes
        # one, so encoders of other widths work too; 768 remains the fallback.
        hidden_size = getattr(getattr(base_model, "config", None), "hidden_size", 768)
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per input sequence, shape ``(batch, 1)``."""
        # First-position vector of the encoder's first output
        bert_out = self.bert(input_ids=input_ids,
                             attention_mask=attention_mask)[0][:, 0]
        x = self.fc1(bert_out)
        x = self.relu(x)

        x = self.fc2(x)
        x = self.sigmoid(x)

        return x

In [None]:
import torch
from torch.optim import AdamW
from tqdm import tqdm

def train(model, train_dataloader, val_dataloader, learning_rate, epochs,
          patience=1, checkpoint_path="best_model.pt"):
    """Fine-tune ``model`` with AdamW and binary cross-entropy.

    The model is trained in place. After every epoch the validation loss is
    computed; whenever it improves, the model is saved to ``checkpoint_path``.
    Training stops early once the validation loss has failed to improve for
    ``patience`` consecutive epochs.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns probabilities of shape ``(batch, 1)`` (``nn.BCELoss``
            expects values in [0, 1]).
        train_dataloader: Loader yielding ``(features, labels)`` batches where
            ``features`` has ``'input_ids'`` and ``'attention_mask'`` tensors.
            Must expose ``.dataset`` (used for the accuracy denominator).
        val_dataloader: Loader of the same form for validation.
        learning_rate: AdamW learning rate.
        epochs: Maximum number of epochs.
        patience: Consecutive non-improving epochs tolerated before stopping
            (default 1).
        checkpoint_path: Where to save the best model (default
            ``"best_model.pt"``). Pass ``None`` to skip saving.

    Returns:
        None.
    """
    best_val_loss = float('inf')
    early_stopping_threshold_count = 0

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.BCELoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    model = model.to(device)
    criterion = criterion.to(device)

    for epoch in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_input, train_label in tqdm(train_dataloader):
            attention_mask = train_input['attention_mask'].to(device)
            input_ids = train_input['input_ids'].squeeze(1).to(device)

            train_label = train_label.to(device)

            # Clear gradients BEFORE the forward/backward pass so that anything
            # left on the parameters (e.g. by an interrupted earlier run in the
            # same kernel) cannot leak into this update.
            optimizer.zero_grad()

            output = model(input_ids, attention_mask)

            loss = criterion(output, train_label.float().unsqueeze(1))

            loss.backward()
            optimizer.step()

            total_loss_train += loss.item()

            acc = ((output >= 0.5).int() == train_label.unsqueeze(1)).sum().item()
            total_acc_train += acc

        total_acc_val = 0
        total_loss_val = 0

        model.eval()

        with torch.no_grad():
            for val_input, val_label in tqdm(val_dataloader):
                attention_mask = val_input['attention_mask'].to(device)
                input_ids = val_input['input_ids'].squeeze(1).to(device)

                val_label = val_label.to(device)

                output = model(input_ids, attention_mask)

                loss = criterion(output, val_label.float().unsqueeze(1))

                total_loss_val += loss.item()

                acc = ((output >= 0.5).int() == val_label.unsqueeze(1)).sum().item()
                total_acc_val += acc

        # Losses are averaged per batch, accuracies per example
        print(f'Epochs: {epoch + 1} '
              f'| Train Loss: {total_loss_train / len(train_dataloader): .3f} '
              f'| Train Accuracy: {total_acc_train / (len(train_dataloader.dataset)): .3f} '
              f'| Val Loss: {total_loss_val / len(val_dataloader): .3f} '
              f'| Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}')

        if best_val_loss > total_loss_val:
            best_val_loss = total_loss_val
            if checkpoint_path is not None:
                torch.save(model, checkpoint_path)
                print("Saved model")
            early_stopping_threshold_count = 0
        else:
            early_stopping_threshold_count += 1

        if early_stopping_threshold_count >= patience:
            print("Early stopping")
            break

In [None]:
import torch
import gc

# Collect garbage first so that tensors kept alive only by unreachable Python
# objects are actually freed ...
gc.collect()

# ... and only then hand the now-unused cached GPU blocks back to the driver.
# Emptying the cache before collecting would leave that memory cached.
torch.cuda.empty_cache()

42

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

# Seed torch and NumPy before anything below draws random numbers
# (TextDataset prints random samples via np.random; the training loader shuffles).
torch.manual_seed(0)
np.random.seed(0)

# Pretrained encoder checkpoint and its tokenizer.
# Note: despite the BERT_* name, the checkpoint loaded here is "roberta-base".
BERT_MODEL = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
base_model = AutoModel.from_pretrained(BERT_MODEL)

# Stack human and GPT rows into one frame (the row index is not reset here)
df = pd.concat([Human_df, GPT_df])

# Select the training and validation rows by their 'source' tag
df_train = df[df['source'] == 'train']
df_val = df[df['source'] == 'val']

# Build the data loaders; only the training loader shuffles
train_dataloader = DataLoader(TextDataset(df_train, tokenizer), batch_size=16, shuffle=True, num_workers=0)
val_dataloader = DataLoader(TextDataset(df_val, tokenizer), batch_size=16, num_workers=0)

# Wrap the encoder in the classification head
model = Classifier(base_model)  # Classifier is defined in an earlier cell

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Entry 32129: Idaho is famous for its potatoes because the state has ideal growing conditions for potatoes. The soil in Idaho is rich and fertile, and the state's climate is perfect for growing potatoes. Idaho also has a long history of potato farming, which has helped to make it well-known as a source of high-quality potatoes. In addition, the state has done a good job of promoting its potatoes through marketing and branding efforts, which has helped to increase awareness of Idaho potatoes around the country and the world. So, Idaho is famous for its potatoes because it has the right conditions for growing them, and because it has worked hard to promote its potatoes to the public.
Entry 9020: They e designed to hold up our abdomens, and that about it. We ' e supposed to use out legs and arms for strength
Entry 35572: If you were to jump into a pool of jello, you would most likely sink to the bottom because jello is a type of soft, solid food that is made from gelatin and often has frui

In [None]:
# Hyperparameters for the fine-tuning run
learning_rate, epochs = 1e-5, 1

# Fine-tune the classifier; `train` is defined in an earlier cell
train(
    model,
    train_dataloader,
    val_dataloader,
    learning_rate=learning_rate,
    epochs=epochs,
)

100%|██████████| 2669/2669 [19:40<00:00,  2.26it/s]
100%|██████████| 860/860 [02:04<00:00,  6.92it/s]


Epochs: 1 | Train Loss:  0.035 | Train Accuracy:  0.988 | Val Loss:  0.035 | Val Accuracy:  0.990
Saved model


In [None]:
def get_text_predictions(model, loader):
    """Run ``model`` over ``loader`` and return hard 0/1 predictions.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one probability per example.
        loader: Iterable of ``(features, label)`` batches where ``features``
            has ``'input_ids'`` and ``'attention_mask'`` tensors; the labels
            are ignored.

    Returns:
        numpy.ndarray: integer predictions, the per-batch outputs concatenated
        along the first axis (one row per example).
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    model = model.to(device)

    results_predictions = []
    with torch.no_grad():
        model.eval()
        for data_input, _ in tqdm(loader):
            attention_mask = data_input['attention_mask'].to(device)
            input_ids = data_input['input_ids'].squeeze(1).to(device)

            output = model(input_ids, attention_mask)

            # Same decision rule as the accuracy computed in `train`
            # (probability >= 0.5 counts as class 1).
            output = (output >= 0.5).int()

            # Keep only CPU copies so predictions do not pile up on the GPU
            results_predictions.append(output.cpu())

    return torch.cat(results_predictions).numpy()

In [None]:
import torch
# Persist the fine-tuned classifier to Google Drive. This passes the whole
# `model` object to torch.save (not just its state_dict).
# NOTE(review): loading it back presumably requires the Classifier class to be importable — confirm.
torch.save(model, "/content/drive/MyDrive/Project/roberta_gpt3_filtered.pt")


In [None]:
#model = torch.load("/content/drive/MyDrive/Project/roberta_gpt3_unfiltered.pt",map_location=torch.device('cuda') )

#test_dataloader = DataLoader(Dataset(web_df.iloc[0:10], tokenizer),
#	batch_size=8, shuffle=False, num_workers=0)

In [None]:
from sklearn.metrics import classification_report

# Evaluate on the rows tagged as the test split
df_test = df[df['source'] == 'test']
test_dataset = TextDataset(df_test, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0)

# 1. True labels, straight from the frame. The loader does not shuffle, so
#    predictions come back in this same row order; there is no need for an
#    extra pass over the DataLoader just to collect labels.
true_labels = df_test['label'].tolist()

# Predictions arrive with one row per example; flatten them to 1-D
predictions = np.asarray(get_text_predictions(model, test_dataloader)).ravel()

# 2. Generate classification report
report = classification_report(true_labels, predictions)
print(report)

Entry 13768: lay down, put a blanket on you, now another blanket, now another one. Eventually you l be like "shit this is heavy". That because all of those objects are pushing down on you. Water does the same thing, air does too but we ' e used to it.
Entry 16437: Diesel fuel and gasoline are two different types of fuel that are used in vehicles. Diesel fuel is a type of fuel that is made from crude oil and is used mainly in diesel engines. Gasoline is also made from crude oil, but it is a different type of fuel that is used mainly in gasoline engines. The reason why gas stations have "caution diesel"signs on the pumps is because diesel fuel is not suitable for use in gasoline engines. If someone accidentally puts diesel fuel into a gasoline engine, it can cause serious damage to the engine. The"caution diesel" signs are there to remind people to make sure they are putting the right type of fuel into their vehicle. Gasoline engines and diesel engines are designed differently, and they 

100%|██████████| 3020/3020 [03:39<00:00, 13.79it/s]


              precision    recall  f1-score   support

           0       1.00      0.98      0.99     16148
           1       0.96      1.00      0.98      8012

    accuracy                           0.99     24160
   macro avg       0.98      0.99      0.98     24160
weighted avg       0.99      0.99      0.99     24160

