In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_essay_data(verbose=False, words_count=500):
    PATH = "drive/MyDrive/Fall_2023_Project/ghostbuster/"
    PATH = ""
    ESSAY_DATA_PATH = PATH + "essay/data/"
    required_data = ["gpt", "human", "claude"]
    SKIP = ["logprobs"]
    docs, names, labels = [], [], []

    for split in required_data:
        for filename in os.listdir(ESSAY_DATA_PATH + split):
            filepath = os.path.join(ESSAY_DATA_PATH, split, filename)

            if filename in SKIP:
                continue
                
            with open(filepath, encoding="utf8") as f:
                doc = f.read()

            doc = doc.split()[:words_count]
            doc = " ".join(doc)

            filepath = filepath.replace(PATH, "")

            label = 0
            if split == "human":
                label = 1

            if verbose:
                print(filepath)
                
            names.append(filepath)
            docs.append(doc)
            labels.append(label)

    return {
      "names": names,
      "docs": docs,
      "labels": labels
  }


In [3]:
import pandas as pd

res = get_essay_data()

df = pd.DataFrame({
    "txt": res["docs"],
    "filename": res["names"],
    "label": res["labels"]
})
df.shape

(4001, 3)

In [4]:
df.head()

Unnamed: 0,txt,filename,label
0,Introduction Having children is a crucial aspe...,essay/data/gpt\0.txt,0
1,"During the Victorian era in England, women wer...",essay/data/gpt\1.txt,0
2,Enzymes are essential biological catalysts tha...,essay/data/gpt\10.txt,0
3,Memorials represent a crucial part of cultural...,essay/data/gpt\100.txt,0
4,English language has three functional categori...,essay/data/gpt\1000.txt,0


In [5]:
df.label.value_counts()

0    2557
1    1444
Name: label, dtype: int64

In [6]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
batch_1 = df[:4000]

In [8]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [9]:
tokenized = batch_1['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
tokenized.shape, len(tokenized)

((4000,), 4000)

In [11]:
max_len = max([len(i) for i in tokenized.values])
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [12]:
np.array(padded).shape

(4000, 512)

In [13]:
padded

array([[  101,  4955,  2383, ..., 10908,  1010,   102],
       [  101,  2076,  1996, ...,  1010,  2005,   102],
       [  101, 16285,  2024, ...,  5377,  1006,   102],
       ...,
       [  101, 10469,  3399, ...,  7339,  1012,   102],
       [  101,  1996,  9099, ..., 11578,  1012,   102],
       [  101,  5292, 14545, ...,  1997,  2128,   102]])

In [14]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(4000, 512)

In [15]:
attention_mask[:5]

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [16]:
from datetime import datetime


start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
end - start

datetime.timedelta(seconds=1839, microseconds=781123)

In [17]:
1839/60

30.65

In [18]:
features = last_hidden_states[0][:,0,:].numpy()

In [19]:
features

array([[-0.21086547, -0.00548651, -0.44475728, ..., -0.40019983,
         0.18505082,  0.46580416],
       [-0.3340324 ,  0.12804615, -0.66982174, ..., -0.20458105,
         0.19250229,  0.60319567],
       [-0.4694301 , -0.14065003, -0.20904969, ..., -0.23931995,
         0.26219547,  0.5047539 ],
       ...,
       [-0.6415462 , -0.17181583, -0.6923587 , ..., -0.19202165,
         0.45712525,  0.52490366],
       [-0.37604716, -0.03995685, -0.22739239, ..., -0.37152255,
         0.46510708,  0.33437642],
       [-0.32325706, -0.08844053, -0.14828393, ..., -0.10263932,
         0.1688457 ,  0.55730915]], dtype=float32)

In [20]:
len(features)

4000

In [21]:
tmp_df = pd.DataFrame({
    'sentence_embeddings': features.tolist(),
    'label': batch_1.label
})

In [22]:
(tmp_df.sentence_embeddings.iloc[0] == features[0]).sum()

768

In [23]:
tmp_df.head(5)

Unnamed: 0,sentence_embeddings,label
0,"[-0.21086546778678894, -0.005486507900059223, ...",0
1,"[-0.3340323865413666, 0.1280461549758911, -0.6...",0
2,"[-0.4694300889968872, -0.14065003395080566, -0...",0
3,"[-0.19011789560317993, 0.16007745265960693, -0...",0
4,"[-0.5151359438896179, -0.09672432392835617, -0...",0


In [24]:
tmp_df.label.value_counts()

0    2556
1    1444
Name: label, dtype: int64

In [25]:
import dill as pickle

pickle.dump(tmp_df, open('features/essay_sentence_embeddings_batch1', 'wb'))