In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [12]:
def get_prompts_data(verbose=False, words_count=500):
    PATH = "drive/MyDrive/Fall_2023_Project/ghostbuster/"
    PATH = ""
    PROMPTS_DATA_PATH = PATH + "writing_prompts/data/"
    required_data = ["gpt", "human", "claude"]
    SKIP = ["logprobs", "headlines", "prompts"]
    SPLITS = ["test", "train"]
    docs, names, labels = [], [], []

    for split in required_data:
        for filename in os.listdir(PROMPTS_DATA_PATH + split):
            filepath = os.path.join(PROMPTS_DATA_PATH, split, filename)

            if filename in SKIP:
                continue

            with open(filepath, encoding="utf8") as f:
                doc = f.read()

            doc = doc.split()[:words_count]
            doc = " ".join(doc)

            filepath = filepath.replace(PATH, "")

            label = 0
            if split == "human":
                label = 1

            if verbose:
                print(filepath)

            names.append(filepath)
            docs.append(doc)
            labels.append(label)

    return {
      "names": names,
      "docs": docs,
      "labels": labels
  }


In [13]:
import pandas as pd

res = get_prompts_data()

df = pd.DataFrame({
    "txt": res["docs"],
    "filename": res["names"],
    "label": res["labels"]
})
df.shape

(3000, 3)

In [14]:
df.head()

Unnamed: 0,txt,filename,label
0,"For years, I had been obsessed with finding th...",writing_prompts/data/gpt\0.txt,0
1,"For centuries, humans had gazed up at the glow...",writing_prompts/data/gpt\1.txt,0
2,The air was charged with excitement as fans al...,writing_prompts/data/gpt\10.txt,0
3,Tonight was unlike any other night. The air wa...,writing_prompts/data/gpt\100.txt,0
4,"As a guardian angel, Michael had seen his fair...",writing_prompts/data/gpt\101.txt,0


In [15]:
df.label.value_counts()

0    2000
1    1000
Name: label, dtype: int64

In [16]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
batch_1 = df[:4000]
batch_2 = df[4000:8000]
batch_3 = df[8000:12000]
batch_4 = df[12000:]

In [17]:
batch_1 = df

In [18]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [19]:
batch_1.label.value_counts()

0    2000
1    1000
Name: label, dtype: int64

In [20]:
tokenized = batch_1['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [21]:
tokenized.shape, len(tokenized)

((3000,), 3000)

In [22]:
max_len = max([len(i) for i in tokenized.values])
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [23]:
np.array(padded).shape

(3000, 512)

In [24]:
padded

array([[ 101, 2005, 2086, ..., 5378, 1012,  102],
       [ 101, 2005, 4693, ...,    0,    0,    0],
       [ 101, 1996, 2250, ..., 7274, 1005,  102],
       ...,
       [ 101, 1996, 7656, ...,    0,    0,    0],
       [ 101, 1996, 2214, ..., 1037, 3457,  102],
       [ 101, 1996, 2155, ..., 1996, 2422,  102]])

In [25]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(3000, 512)

In [26]:
attention_mask[:5]

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [27]:
from datetime import datetime


start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
end - start

datetime.timedelta(seconds=510, microseconds=786166)

In [28]:
1590/60, 492/60

(26.5, 8.2)

In [29]:
features = last_hidden_states[0][:,0,:].numpy()

In [30]:
features

array([[ 1.45924613e-01,  3.92686874e-02, -3.55475843e-01, ...,
         4.92195673e-02,  5.02845883e-01,  3.90672237e-01],
       [-2.97042634e-02,  2.62294021e-02, -2.98153341e-01, ...,
        -7.38073140e-05,  5.74310422e-01,  3.16143513e-01],
       [ 4.24606279e-02, -3.84472609e-01, -1.11869141e-01, ...,
         1.37263373e-01,  3.79891455e-01,  3.10414523e-01],
       ...,
       [-6.65223673e-02, -3.39237303e-02, -2.75122225e-01, ...,
         1.36944115e-01,  2.71751523e-01,  3.22713256e-01],
       [ 1.96694747e-01,  1.66752517e-01, -1.17960304e-01, ...,
         1.35763466e-01,  5.54251254e-01,  3.34941000e-01],
       [-1.57564715e-01,  8.16034302e-02, -1.68788582e-01, ...,
         2.51024105e-02,  3.50282490e-01,  2.17348412e-01]], dtype=float32)

In [31]:
len(features)

3000

In [32]:
len(batch_1.label)

3000

In [33]:
tmp_df = pd.DataFrame({
    'sentence_embeddings': features.tolist(),
    'label': batch_1.label
})

In [34]:
features[0]

array([ 1.45924613e-01,  3.92686874e-02, -3.55475843e-01, -1.14469573e-01,
       -1.95143536e-01, -2.15600237e-01,  7.92468339e-02,  1.03393011e-01,
        8.99168849e-02, -6.25602067e-01,  1.79457352e-01,  7.08032697e-02,
       -2.90978253e-01,  4.30253148e-01,  1.51028439e-01,  1.19464286e-01,
        6.29654527e-02,  6.09513149e-02,  1.09756276e-01,  4.20775831e-01,
        9.47500989e-02, -3.99672166e-02,  2.55861342e-01,  4.61340040e-01,
       -1.37217864e-01, -1.65214408e-02, -9.95714962e-02, -8.92331898e-02,
        1.13552295e-01,  9.12581608e-02,  2.03354610e-03, -9.61766094e-02,
       -3.73738021e-01, -5.03334165e-01,  2.82928236e-02, -2.32496023e-01,
       -1.86174676e-01, -2.31153965e-01, -2.08308220e-01,  1.24316216e-01,
       -1.68477207e-01,  2.61019200e-01, -1.33059755e-01, -1.00349642e-01,
       -1.99086398e-01, -1.87606022e-01, -3.65469861e+00,  7.74322450e-03,
       -1.41000628e-01, -2.71969810e-02,  4.30198789e-01, -9.36675891e-02,
        7.89867528e-03,  

In [35]:
tmp_df.sentence_embeddings.iloc[0]

[0.1459246128797531,
 0.03926868736743927,
 -0.35547584295272827,
 -0.11446957290172577,
 -0.19514353573322296,
 -0.21560023725032806,
 0.07924683392047882,
 0.10339301079511642,
 0.0899168848991394,
 -0.6256020665168762,
 0.17945735156536102,
 0.07080326974391937,
 -0.29097825288772583,
 0.43025314807891846,
 0.15102843940258026,
 0.11946428567171097,
 0.06296545267105103,
 0.060951314866542816,
 0.10975627601146698,
 0.420775830745697,
 0.09475009888410568,
 -0.039967216551303864,
 0.2558613419532776,
 0.4613400399684906,
 -0.13721786439418793,
 -0.01652144081890583,
 -0.09957149624824524,
 -0.08923318982124329,
 0.11355229467153549,
 0.09125816076993942,
 0.0020335461013019085,
 -0.09617660939693451,
 -0.37373802065849304,
 -0.5033341646194458,
 0.02829282358288765,
 -0.23249602317810059,
 -0.186174675822258,
 -0.2311539649963379,
 -0.20830821990966797,
 0.12431621551513672,
 -0.16847720742225647,
 0.26101920008659363,
 -0.13305975496768951,
 -0.10034964233636856,
 -0.19908639788627

In [36]:
(tmp_df.sentence_embeddings.iloc[0] == features[0]).sum()

768

In [40]:
assert((tmp_df.sentence_embeddings.iloc[0] == features[0]).sum())

In [41]:
tmp_df.head(5)

Unnamed: 0,sentence_embeddings,label
0,"[0.1459246128797531, 0.03926868736743927, -0.3...",0
1,"[-0.02970426343381405, 0.02622940205037594, -0...",0
2,"[0.04246062785387039, -0.3844726085662842, -0....",0
3,"[-0.11057035624980927, -0.17779888212680817, 0...",0
4,"[-0.2309124618768692, 0.02910764329135418, -0....",0


In [42]:
tmp_df.label.value_counts()

0    2000
1    1000
Name: label, dtype: int64

In [43]:
batch_1.label.value_counts()

0    2000
1    1000
Name: label, dtype: int64

In [45]:
import dill as pickle

In [None]:
import os

if not os.path.exists('features/'):
    os.mkdir('features/')

In [46]:
pickle.dump(tmp_df, open('features/prompts_sentence_embeddings', 'wb'))

In [34]:
12

12

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features

In [None]:
len(features)

In [None]:
pickle.dump(features, open('features/reuters_sentence_embeddings_batch1', 'wb'))

In [None]:
ar = pickle.load(open('features/reuters_sentence_embeddings_batch1', 'rb'))
len(ar)

In [None]:
ar.head()

In [None]:
(features[0] == ar.sentence_embeddings[0]).sum()

In [None]:
set([len(i) for i in ar])

In [None]:
ar1 = pickle.load(open('features/essay_sentence_embeddings_batch2', 'rb'))

In [None]:
ar.shape, ar1.shape

In [None]:
pd.concat([ar, ar1]).label.value_counts()