In [11]:
import pandas as pd
# Read the train / test / valid JSON-lines files of each corpus into frames.
# Human_* frames come from the webtext files.
Human_df_train = pd.read_json("/content/drive/MyDrive/data/webtext.train.jsonl", lines=True)
Human_df_test = pd.read_json("/content/drive/MyDrive/data/webtext.test.jsonl", lines=True)
Human_df_val = pd.read_json("/content/drive/MyDrive/data/webtext.valid.jsonl", lines=True)

# GPT_* frames come from the medium-345M-k40 files.
GPT_df_train = pd.read_json("/content/drive/MyDrive/data/medium-345M-k40.train.jsonl", lines=True)
GPT_df_test = pd.read_json("/content/drive/MyDrive/data/medium-345M-k40.test.jsonl", lines=True)
GPT_df_val = pd.read_json("/content/drive/MyDrive/data/medium-345M-k40.valid.jsonl", lines=True)

In [2]:
def add_label_column(df, label):
    """Return a copy of ``df`` that carries a ``label`` column.

    The input frame is left untouched, so re-running a cell that calls this
    function never changes frames created by earlier cells.

    Args:
        df: pandas DataFrame to label.
        label: value stored in the ``label`` column (a scalar is broadcast to
            every row).

    Returns:
        A new DataFrame with the columns of ``df`` plus ``label``; an existing
        ``label`` column is overwritten in the copy.
    """
    return df.assign(label=label)

# Attach the class label to every split: 1 for the GPT frames, 0 for the Human frames.
GPT_df_train, GPT_df_test, GPT_df_val = (
    add_label_column(frame, 1)
    for frame in (GPT_df_train, GPT_df_test, GPT_df_val)
)

Human_df_train, Human_df_test, Human_df_val = (
    add_label_column(frame, 0)
    for frame in (Human_df_train, Human_df_test, Human_df_val)
)

In [3]:
def remove_short_texts(df, lenght):
    """Keep only the rows of ``df`` whose ``length`` column is at least ``lenght``.

    Args:
        df: pandas DataFrame with a ``length`` column.
        lenght: inclusive lower bound compared against ``length``.

    Returns:
        The filtered rows, with their original index labels.
    """
    long_enough = df['length'].ge(lenght)
    return df.loc[long_enough]

# Record which split every row came from before the frames are merged.
Human_df_train['source'] = GPT_df_train['source'] = 'train'
Human_df_test['source'] = GPT_df_test['source'] = 'test'
Human_df_val['source'] = GPT_df_val['source'] = 'val'

# Training frames only: drop rows whose length is below 10, then draw a
# reproducible sample of 25000 rows from each.
Human_df_train = remove_short_texts(Human_df_train, 10).sample(n=25000, random_state=42)
GPT_df_train = remove_short_texts(GPT_df_train, 10).sample(n=25000, random_state=42)

# Stack train, test and val (in that order) into one frame per class,
# renumbering the index from 0.
Human_df = pd.concat(
    [Human_df_train, Human_df_test, Human_df_val],
    ignore_index=True,
)
GPT_df = pd.concat(
    [GPT_df_train, GPT_df_test, GPT_df_val],
    ignore_index=True,
)

In [4]:
# Discard the id, length and ended columns from both frames.
GPT_df = GPT_df.drop(columns=['id', 'length', 'ended'])
Human_df = Human_df.drop(columns=['id', 'length', 'ended'])

In [5]:
import re
from tqdm import tqdm
# Register tqdm's progress-bar variants of the pandas methods (e.g. `progress_map`).
tqdm.pandas()

class TextPreprocessor:
    """Regex-based clean-up applied to raw text before tokenization.

    ``preprocess`` tightens spacing around punctuation, quotes, dashes and
    parentheses, re-joins split "n't" contractions, removes @-mentions,
    http(s) links, non-ASCII characters and a few markup characters, and
    collapses every run of whitespace into a single space.
    """

    def __init__(self):
        # Compile regex patterns once so each preprocess() call reuses them.
        # Whitespace directly before / after sentence punctuation.
        self.space_before_punct = re.compile(r'\s+([.,?!;:])')
        self.space_after_punct = re.compile(r'([.,?!;:])\s+')
        # A word character, one space, then "n't" (e.g. "do n't").
        self.contractions = re.compile(r"(\w) n\'t")
        # "--" and "-" with surrounding whitespace.
        self.double_dashes = re.compile(r"\s*--\s*")
        self.hyphens = re.compile(r"\s+-\s+")
        # Apostrophe between a word character and whitespace, in either order.
        self.single_quotes_start = re.compile(r"(\w)'(\s)")
        self.single_quotes_end = re.compile(r"(\s)'(\w)")
        # "@..." up to and including the next whitespace character.
        self.mentions = re.compile(r'(@.*?)[\s]')
        # http:// or https:// followed by non-whitespace characters.
        self.links = re.compile(r'https?:\/\/[^\s\n\r]+')
        # Quoted spans padded with one whitespace character on each side.
        self.embedded_quotes = re.compile(r'"\s([^"]+)\s"')
        self.single_quotes_embedded = re.compile(r"'\s([^']+)\s'")
        # Whitespace just inside parentheses.
        self.space_after_open_parenthesis = re.compile(r'\(\s')
        self.space_before_close_parenthesis = re.compile(r'\s\)')
        # Any run of whitespace.
        self.multi_spaces = re.compile(r'\s+')

    def remove_unicode(self, text):
        """Return ``text`` without the characters whose code point is 128 or above."""
        return ''.join(char for char in text if ord(char) < 128)

    def preprocess(self, text):
        """Return the cleaned-up version of ``text``.

        Args:
            text: raw input string.

        Returns:
            The string after all substitutions described on the class.
        """
        # Punctuation: no whitespace before, exactly one space after.
        text = self.space_before_punct.sub(r'\1', text)
        text = self.space_after_punct.sub(r'\1 ', text)
        # Unescape \' first, then drop newlines, remaining backslashes and asterisks.
        text = text.replace("\\'", "'")
        text = text.replace("\n", " ").replace("\\", "").replace('*', '')
        text = self.remove_unicode(text)
        text = self.contractions.sub(r"\1n't", text)
        text = self.double_dashes.sub("--", text)
        text = self.hyphens.sub("-", text)
        # Drop the whitespace that follows an apostrophe attached to a word.
        text = self.single_quotes_start.sub(r"\1'", text)
        # Keep the word character after the quote (group 2). Substituting
        # group 1 here re-inserted the whitespace and deleted the first letter
        # of the quoted word ("'hello" became "' ello").
        text = self.single_quotes_end.sub(r" '\2", text)
        text = self.mentions.sub(' ', text)
        text = self.links.sub(' ', text)
        text = text.replace('#', ' ').replace("&amp;", "&")
        # Remove the padding inside double quotes; single quotes padded the
        # same way are removed together with their padding.
        text = self.embedded_quotes.sub(r'"\1"', text)
        text = self.single_quotes_embedded.sub(r"\1", text)
        text = self.space_after_open_parenthesis.sub('(', text)
        text = self.space_before_close_parenthesis.sub(')', text)
        # Finally collapse whatever whitespace runs the steps above left behind.
        text = self.multi_spaces.sub(' ', text)
        return text


# One shared preprocessor cleans the text column of both frames;
# progress_map behaves like map but reports progress while it runs.
preprocessor = TextPreprocessor()
GPT_df['text'] = GPT_df['text'].progress_map(preprocessor.preprocess)
Human_df['text'] = Human_df['text'].progress_map(preprocessor.preprocess)

100%|██████████| 35000/35000 [00:41<00:00, 842.85it/s]
100%|██████████| 35000/35000 [00:34<00:00, 1029.06it/s]


In [1]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [17]:
from torch.utils.data import Dataset
import numpy as np
import re
import nltk
import string

class TextDataset(Dataset):
    """Map-style dataset of tokenized texts with optional labels.

    Args:
        dataframe: frame with a ``text`` column and, optionally, a ``label``
            column.
        tokenizer: callable invoked once on the full list of texts as
            ``tokenizer(texts, padding='max_length', max_length=max_length,
            truncation=True, return_tensors="pt")``; its result must be
            indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_length=500):
        texts = dataframe.text.values.tolist()
        self._print_random_samples(texts)

        # The whole frame is tokenized eagerly; __getitem__ only slices the result.
        self.texts = tokenizer(texts, padding='max_length',
                               max_length=max_length,
                               truncation=True,
                               return_tensors="pt")

        # Labels are optional so the dataset also works for unlabeled frames.
        if 'label' in dataframe:
            self.labels = dataframe.label.values.tolist()

    def _print_random_samples(self, texts, n_samples=5):
        """Print up to ``n_samples`` randomly chosen entries of ``texts``.

        Uses numpy's global random state. The sample size is capped at
        ``len(texts)`` so small frames no longer raise ``ValueError``.
        """
        n_samples = min(n_samples, len(texts))
        if n_samples > 0:
            random_entries = np.random.choice(len(texts), n_samples, replace=False)
            for i in random_entries:
                print(f"Entry {i}: {texts[i]}")
        print()

    def __len__(self):
        """Number of tokenized texts."""
        return len(self.texts['input_ids'])

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` for row ``idx``.

        ``inputs`` is a dict with the ``input_ids`` and ``attention_mask``
        entries of that row; ``label`` is -1 when the frame had no ``label``
        column.
        """
        text = {'input_ids': self.texts['input_ids'][idx],
                'attention_mask': self.texts['attention_mask'][idx]}
        label = -1
        if hasattr(self, 'labels'):
            label = self.labels[idx]
        return text, label

In [8]:
from torch import nn

class Classifier(nn.Module):
    """Binary classifier: an encoder followed by a two-layer head with sigmoid output.

    Args:
        base_model: encoder module called as
            ``base_model(input_ids=..., attention_mask=...)``; element ``[0]``
            of its result is indexed as ``[:, 0]`` and fed to the head.
        hidden_size: width of the encoder output fed to the first linear layer.
            When omitted it is read from ``base_model.config.hidden_size`` and
            falls back to 768 if that attribute does not exist.
    """

    def __init__(self, base_model, hidden_size=None):
        super().__init__()

        if hidden_size is None:
            # Prefer the width advertised by the encoder over a hard-coded 768.
            config = getattr(base_model, 'config', None)
            hidden_size = getattr(config, 'hidden_size', 768)

        self.bert = base_model
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid score for each input sequence.

        The result has one column per row of the encoder slice (``fc2`` maps
        to a single unit), with values in the range of the sigmoid.
        """
        # Element 0 of the encoder output, position 0 along dim 1
        # (presumably the first-token embedding — confirm for other encoders).
        bert_out = self.bert(input_ids=input_ids,
                             attention_mask=attention_mask)[0][:, 0]
        x = self.fc1(bert_out)
        x = self.relu(x)

        x = self.fc2(x)
        x = self.sigmoid(x)

        return x

In [13]:
import torch
from torch.optim import AdamW
from tqdm import tqdm

def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader`` and return ``(total_loss, total_correct)``.

    Weights are updated after every batch only when ``optimizer`` is given;
    without it the pass runs with gradient tracking disabled.

    Returns:
        total_loss: sum of the per-batch loss values.
        total_correct: number of outputs whose ``>= 0.5`` decision equals the label.
    """
    is_training = optimizer is not None
    total_loss = 0
    total_correct = 0

    with torch.set_grad_enabled(is_training):
        for batch_input, batch_label in tqdm(dataloader):
            attention_mask = batch_input['attention_mask'].to(device)
            input_ids = batch_input['input_ids'].squeeze(1).to(device)
            batch_label = batch_label.to(device)

            output = model(input_ids, attention_mask)

            # Labels are reshaped to a column to match the model output.
            loss = criterion(output, batch_label.float().unsqueeze(1))
            total_loss += loss.item()

            correct = ((output >= 0.5).int() == batch_label.unsqueeze(1)).sum().item()
            total_correct += correct

            if is_training:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

    return total_loss, total_correct


def train(model, train_dataloader, val_dataloader, learning_rate, epochs,
          patience=1, save_path="best_model.pt"):
    """Fine-tune ``model`` with BCE loss and AdamW, with early stopping on validation loss.

    After every epoch the summed validation loss is compared with the best one
    seen so far: on improvement the whole model object is saved to
    ``save_path``; otherwise a counter is incremented and training stops once
    it reaches ``patience``.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` whose
            output is valid input for ``nn.BCELoss``.
        train_dataloader: yields ``(inputs, labels)`` batches, where ``inputs``
            has ``'input_ids'`` and ``'attention_mask'`` entries.
        val_dataloader: same batch format, used for validation.
        learning_rate: learning rate passed to AdamW.
        epochs: maximum number of epochs.
        patience: number of consecutive non-improving epochs that triggers
            early stopping (default 1, i.e. stop at the first one).
        save_path: destination of the best checkpoint.

    Returns:
        None. Progress is printed and the best model is written to ``save_path``.
    """
    best_val_loss = float('inf')
    early_stopping_threshold_count = 0

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    criterion = nn.BCELoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    model = model.to(device)
    criterion = criterion.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss_train, total_acc_train = _run_epoch(
            model, train_dataloader, criterion, device, optimizer)

        model.eval()
        total_loss_val, total_acc_val = _run_epoch(
            model, val_dataloader, criterion, device)

        # Losses are averaged per batch, accuracies per sample.
        print(f'Epochs: {epoch + 1} '
              f'| Train Loss: {total_loss_train / len(train_dataloader): .3f} '
              f'| Train Accuracy: {total_acc_train / (len(train_dataloader.dataset)): .3f} '
              f'| Val Loss: {total_loss_val / len(val_dataloader): .3f} '
              f'| Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}')

        if best_val_loss > total_loss_val:
            best_val_loss = total_loss_val
            torch.save(model, save_path)
            print("Saved model")
            early_stopping_threshold_count = 0
        else:
            early_stopping_threshold_count += 1

        if early_stopping_threshold_count >= patience:
            print("Early stopping")
            break

In [21]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

# Fix the torch and numpy seeds before anything random happens.
torch.manual_seed(0)
np.random.seed(0)

# Pretrained checkpoint shared by the tokenizer and the encoder.
BERT_MODEL = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
base_model = AutoModel.from_pretrained(BERT_MODEL)

# Stack the human and GPT frames, then pull the train / val rows back out
# through the 'source' column.
df = pd.concat([Human_df, GPT_df], axis=0)
df_train = df.loc[df['source'] == 'train']
df_val = df.loc[df['source'] == 'val']

# Batches of 8; only the training loader shuffles.
train_dataloader = DataLoader(
    dataset=TextDataset(df_train, tokenizer),
    batch_size=8,
    shuffle=True,
    num_workers=0,
)
val_dataloader = DataLoader(
    dataset=TextDataset(df_val, tokenizer),
    batch_size=8,
    shuffle=False,
    num_workers=0,
)

# Put the classification layers on top of the pretrained encoder.
model = Classifier(base_model)



Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
import gc
import torch
# Run Python's garbage collector first, then ask PyTorch to release its
# cached CUDA memory; the order matters because collection frees tensors
# whose memory can only then be returned by empty_cache().
gc.collect()
torch.cuda.empty_cache()

In [31]:
# Hyperparameters for the fine-tuning run.
learning_rate, epochs = 1e-5, 1

# Fine-tune `model` on the training loader, validating after each epoch.
train(
    model,
    train_dataloader,
    val_dataloader,
    learning_rate=learning_rate,
    epochs=epochs,
)

100%|██████████| 6250/6250 [24:51<00:00,  4.19it/s]
100%|██████████| 1250/1250 [01:35<00:00, 13.08it/s]


Epochs: 1 | Train Loss:  0.035 | Train Accuracy:  0.989 | Val Loss:  0.074 | Val Accuracy:  0.976
Saved model


In [16]:
def get_text_predictions(model, loader, threshold=0.5):
    """Return hard 0/1 predictions of ``model`` for every batch in ``loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        loader: iterable of ``(inputs, label)`` batches, where ``inputs`` has
            ``'input_ids'`` and ``'attention_mask'`` entries; labels are ignored.
        threshold: outputs greater than or equal to this value are predicted
            as 1. The comparison is inclusive so it matches the ``>= 0.5``
            rule used for the accuracy figures in ``train``.

    Returns:
        A numpy integer array with the per-batch predictions concatenated in
        loader order. An empty loader yields an array of shape ``(0, 1)``
        (assumes the model emits one column per sample, as ``Classifier`` does).
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    model = model.to(device)
    model.eval()

    results_predictions = []
    with torch.no_grad():
        for data_input, _ in tqdm(loader):
            attention_mask = data_input['attention_mask'].to(device)
            input_ids = data_input['input_ids'].squeeze(1).to(device)

            output = model(input_ids, attention_mask)

            # Move each batch to the CPU right away so predictions do not
            # accumulate in GPU memory.
            results_predictions.append((output >= threshold).int().cpu())

    if not results_predictions:
        # torch.cat raises on an empty list.
        return torch.empty((0, 1), dtype=torch.int32).numpy()

    return torch.cat(results_predictions).numpy()

In [3]:
import torch
#torch.save(model, "/content/drive/MyDrive/Project/roberta_gpt2.pt")


In [9]:
# Load the fine-tuned classifier from Drive onto the GPU when one is
# available, falling back to the CPU instead of failing on a hard-coded 'cuda'.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; newer PyTorch releases may also require `weights_only=False`
# for a checkpoint that stores a whole model object — confirm against the
# installed version.
model = torch.load(
    "/content/drive/MyDrive/Project/roberta_gpt2.pt",
    map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

#test_dataloader = DataLoader(Dataset(web_df.iloc[0:10], tokenizer),
#	batch_size=8, shuffle=False, num_workers=0)

In [22]:
# Held-out rows: everything tagged 'test' in the combined frame.
df_test = df.loc[df['source'] == 'test']

# No shuffling, so predictions come back in the row order of df_test.
test_dataloader = DataLoader(
    dataset=TextDataset(df_test, tokenizer),
    batch_size=8,
    shuffle=False,
    num_workers=0,
)

Entry 9394: This is a rush transcript from "Hannity," April 15, 2011. This copy may not be in its final form and may be updated. SEAN HANNITY, HOST: Tonight is part two of my interview with potential 2012 presidential candidate Donald Trump who has been making headlines for the controversial rebukes that he has issued President Obama. And that's where we begin tonight. Let's take a look. (BEGIN VIDEO CLIP) HANNITY: A lot has been made over the birth certificate issue. And you apparently, you have said in the previous interviews that you have a team of investigators in Hawaii now looking into it. DONALD TRUMP, BUSINESSMAN: Correct. HANNITY: It has a lot of press. Everyone is asking you about it. TRUMP: Right. HANNITY: And what have you come up with your investigators? TRUMP: Well, I don't want to say that now. But it is going to be very interesting. But I don't want to say it now, Sean. But I will say this, I don't love this issue. I'd much rather be talking about how China is ripping u

In [37]:
# Write the test split to an HDF5 file under key 'df'; mode 'w' replaces any existing file.
df_test.to_hdf(
    "/content/drive/MyDrive/Project/gpt2_test.hd5",
    mode='w',
    key='df',
)

In [25]:
# Hard 0/1 predictions for every row served by the test loader.
predictions = get_text_predictions(model=model, loader=test_dataloader)

100%|██████████| 1250/1250 [01:35<00:00, 13.07it/s]


In [26]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the predictions against the test labels.
print(classification_report(y_true=df_test['label'], y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98      5000
           1       0.96      1.00      0.98      5000

    accuracy                           0.98     10000
   macro avg       0.98      0.98      0.98     10000
weighted avg       0.98      0.98      0.98     10000

