In [1]:
!pip install transformers sentence-transformers

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [5]:
def get_essay_data(verbose=False, words_count=500):
    PATH = "drive/MyDrive/Fall_2023_Project/ghostbuster/"
    PATH = ""
    ESSAY_DATA_PATH = PATH + "essay/data/"
    required_data = ["gpt", "human", "claude"]
    SKIP = ["logprobs"]
    docs, names, labels = [], [], []

    for split in required_data:
        for filename in os.listdir(ESSAY_DATA_PATH + split):
            filepath = os.path.join(ESSAY_DATA_PATH, split, filename)

            if filename in SKIP:
                continue
                
            with open(filepath, encoding="utf8") as f:
                doc = f.read()

            doc = doc.split()[:words_count]
            doc = " ".join(doc)

            filepath = filepath.replace(PATH, "")

            label = 0
            if split == "human":
                label = 1

            if verbose:
                print(filepath)
                
            names.append(filepath)
            docs.append(doc)
            labels.append(label)

    return {
      "names": names,
      "docs": docs,
      "labels": labels
  }


In [6]:
import pandas as pd

res = get_essay_data()

df = pd.DataFrame({
    "txt": res["docs"],
    "filename": res["names"],
    "label": res["labels"]
})
df.shape

(4332, 3)

In [7]:
df.head()

Unnamed: 0,txt,filename,label
0,Introduction Having children is a crucial aspe...,essay/data/gpt\0.txt,0
1,"During the Victorian era in England, women wer...",essay/data/gpt\1.txt,0
2,Enzymes are essential biological catalysts tha...,essay/data/gpt\10.txt,0
3,Memorials represent a crucial part of cultural...,essay/data/gpt\100.txt,0
4,English language has three functional categori...,essay/data/gpt\1000.txt,0


In [8]:
df.label.value_counts()

0    2888
1    1444
Name: label, dtype: int64

In [9]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors


0       [101, 4955, 2383, 2336, 2003, 1037, 10232, 781...
1       [101, 2076, 1996, 6652, 3690, 1999, 2563, 1010...
2       [101, 16285, 2024, 6827, 6897, 16771, 2015, 20...
3       [101, 22899, 5050, 1037, 10232, 2112, 1997, 34...
4       [101, 2394, 2653, 2038, 2093, 8360, 7236, 2008...
                              ...                        
4327    [101, 20253, 1996, 25022, 13181, 2368, 1039, 2...
4328    [101, 2653, 1998, 2554, 2024, 6171, 6970, 2107...
4329    [101, 2653, 2003, 1996, 3375, 2291, 1997, 4807...
4330    [101, 8106, 1998, 8107, 2000, 5702, 5907, 2098...
4331    [101, 5418, 8035, 5218, 2000, 1996, 2817, 1997...
Name: txt, Length: 4332, dtype: object

In [11]:
tokenized = df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [12]:
tokenized.shape, len(tokenized)

((4332,), 4332)

In [13]:
max_len = max([len(i) for i in tokenized.values])
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [14]:
np.array(padded).shape

(4332, 876)

In [15]:
padded

array([[  101,  4955,  2383, ...,     0,     0,     0],
       [  101,  2076,  1996, ...,     0,     0,     0],
       [  101, 16285,  2024, ...,     0,     0,     0],
       ...,
       [  101,  2653,  2003, ...,     0,     0,     0],
       [  101,  8106,  1998, ...,     0,     0,     0],
       [  101,  5418,  8035, ...,     0,     0,     0]])

In [16]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(4332, 876)

In [17]:
attention_mask[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [20]:
input_ids.shape, attention_mask.shape

(torch.Size([4332, 876]), torch.Size([4332, 876]))

In [22]:
input_ids = torch.tensor(padded)
# attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features