In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_prompts_data(verbose=False, words_count=500, base_path=""):
    """Load the writing-prompts documents and label them human vs. not human.

    Reads every entry found in ``<base_path>writing_prompts/data/<source>/``
    for the sources ``gpt``, ``human`` and ``claude`` (in that order), except
    entries named ``logprobs``, ``headlines`` or ``prompts``. Each document is
    cut to its first ``words_count`` whitespace-separated words and re-joined
    with single spaces.

    Args:
        verbose: When true, print the (base_path-relative) path of every file
            as it is loaded.
        words_count: Maximum number of whitespace-separated words kept per
            document.
        base_path: Optional prefix prepended verbatim to the data directory,
            e.g. a mounted-drive folder ending in ``"/"``. The default ``""``
            resolves the data directory relative to the working directory.
            The prefix is stripped from the returned names.

    Returns:
        dict with three parallel lists:
            ``"names"``  - file paths with ``base_path`` removed,
            ``"docs"``   - truncated document texts,
            ``"labels"`` - ``1`` for the ``human`` source, ``0`` otherwise.

    Raises:
        OSError: If a source directory is missing or an entry cannot be read.
    """
    PROMPTS_DATA_PATH = base_path + "writing_prompts/data/"
    required_data = ["gpt", "human", "claude"]
    SKIP = ["logprobs", "headlines", "prompts"]
    docs, names, labels = [], [], []

    for source in required_data:
        # The label depends only on the source directory, so compute it once.
        label = 1 if source == "human" else 0

        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and anything derived from it) reproducible across machines.
        for filename in sorted(os.listdir(PROMPTS_DATA_PATH + source)):
            if filename in SKIP:
                continue

            filepath = os.path.join(PROMPTS_DATA_PATH, source, filename)

            with open(filepath, encoding="utf8") as f:
                doc = f.read()

            # Keep only the first `words_count` words, normalising whitespace.
            doc = " ".join(doc.split()[:words_count])

            # Strip only the leading prefix; str.replace would also remove any
            # later occurrence of the same text inside the path.
            if base_path and filepath.startswith(base_path):
                filepath = filepath[len(base_path):]

            if verbose:
                print(filepath)

            names.append(filepath)
            docs.append(doc)
            labels.append(label)

    return {
        "names": names,
        "docs": docs,
        "labels": labels,
    }


In [3]:
import pandas as pd

# Load every document, capped at 250 words each.
res = get_prompts_data(words_count=250)

# Turn the loader's parallel lists into a frame, renaming its keys to the
# column names used by the rest of the notebook (txt, filename, label).
df = (
    pd.DataFrame(res)
    .rename(columns={"docs": "txt", "names": "filename", "labels": "label"})
    [["txt", "filename", "label"]]
)
df.shape

(3000, 3)

In [4]:
df.head()

Unnamed: 0,txt,filename,label
0,"For years, I had been obsessed with finding th...",writing_prompts/data/gpt\0.txt,0
1,"For centuries, humans had gazed up at the glow...",writing_prompts/data/gpt\1.txt,0
2,The air was charged with excitement as fans al...,writing_prompts/data/gpt\10.txt,0
3,Tonight was unlike any other night. The air wa...,writing_prompts/data/gpt\100.txt,0
4,"As a guardian angel, Michael had seen his fair...",writing_prompts/data/gpt\101.txt,0


In [5]:
df.label.value_counts()

0    2000
1    1000
Name: label, dtype: int64

In [6]:
# DistilBERT: checkpoint name plus the matching model / tokenizer classes.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

## To use BERT instead of DistilBERT, uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the pretrained tokenizer first, then the model, from the
# selected checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Cut df into consecutive row ranges of 4000; the last range is open-ended.
# (None stands for an omitted slice bound.)
batch_1, batch_2, batch_3, batch_4 = (
    df[start:stop]
    for start, stop in ((None, 4000), (4000, 8000), (8000, 12000), (12000, None))
)

In [8]:
# Rebind batch_1 to the whole DataFrame (no copy is made), replacing the
# df[:4000] slice assigned in the previous cell.
batch_1 = df

In [9]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [10]:
batch_1.label.value_counts()

0    2000
1    1000
Name: label, dtype: int64

In [11]:
# Encode every document to a list of token ids, including the tokenizer's
# special tokens. `truncation=True` makes the 512-token cap explicit: passing
# `max_length` alone only truncates via a deprecated fallback and emits a
# warning for it.
tokenized = batch_1['txt'].apply(
    lambda x: tokenizer.encode(
        x, add_special_tokens=True, max_length=512, truncation=True
    )
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
tokenized.shape, len(tokenized)

((3000,), 3000)

In [13]:
# The longest token sequence fixes the common row width; every shorter
# sequence is right-padded with id 0 so the rows stack into one 2-D array.
max_len = max(map(len, tokenized.values))
padded = np.array(
    [ids + [0] * (max_len - len(ids)) for ids in tokenized.values]
)

In [14]:
np.array(padded).shape

(3000, 512)

In [15]:
padded

array([[ 101, 2005, 2086, ...,    0,    0,    0],
       [ 101, 2005, 4693, ...,    0,    0,    0],
       [ 101, 1996, 2250, ...,    0,    0,    0],
       ...,
       [ 101, 1996, 7656, ...,    0,    0,    0],
       [ 101, 1996, 2214, ...,    0,    0,    0],
       [ 101, 1996, 2155, ...,    0,    0,    0]])

In [16]:
# Integer mask with the same shape as `padded`: 1 where the token id is
# non-zero, 0 where it is zero (the value used for padding above).
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(3000, 512)

In [17]:
attention_mask[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [18]:
# Run the padded token ids through the model once and time the whole step
# (tensor construction + forward pass) with wall-clock timestamps.
from datetime import datetime


start = datetime.now()
# torch.tensor copies the NumPy arrays into new tensors.
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

# No gradients are needed for feature extraction, so disable autograd tracking.
# NOTE(review): the entire `padded` array is passed to the model in a single
# call; the recorded run below took roughly 540 s. Consider feeding it in
# mini-batches if time or memory becomes a problem.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
# Displayed as the cell output: elapsed time as a timedelta.
end - start

datetime.timedelta(seconds=539, microseconds=662275)

In [28]:
1590/60, 492/60

(26.5, 8.2)

In [19]:
# From the first element of the model output, keep only sequence position 0
# for every row and convert the result to a NumPy array.
# Presumably this is the [CLS] token embedding, since the inputs were encoded
# with add_special_tokens=True — TODO confirm.
features = last_hidden_states[0][:,0,:].numpy()

In [20]:
features

array([[ 0.11586548, -0.10085335, -0.4271329 , ..., -0.01041502,
         0.43675762,  0.46826863],
       [-0.03454538,  0.05461045, -0.3500889 , ...,  0.03545896,
         0.45765573,  0.36588472],
       [ 0.03203139, -0.36519843, -0.10803607, ...,  0.06615342,
         0.34676105,  0.27858904],
       ...,
       [-0.04181369, -0.02850407, -0.2638895 , ...,  0.16216603,
         0.28862384,  0.33571723],
       [ 0.23565632,  0.19447333, -0.159066  , ...,  0.10861845,
         0.61022586,  0.30205944],
       [-0.11254532,  0.10267524, -0.1335986 , ...,  0.01200475,
         0.29555306,  0.23108517]], dtype=float32)

In [21]:
len(features)

3000

In [22]:
len(batch_1.label)

3000

In [23]:
# Pair each row's embedding (stored as a plain Python list) with its label.
# The label Series supplies the index of the resulting frame.
tmp_df = pd.DataFrame(
    dict(
        sentence_embeddings=features.tolist(),
        label=batch_1["label"],
    )
)

In [24]:
features[0]

array([ 1.15865484e-01, -1.00853346e-01, -4.27132905e-01, -8.79911631e-02,
       -2.67250359e-01, -1.92305222e-01,  2.01317728e-01,  1.42236173e-01,
        1.10973157e-01, -5.86412668e-01,  2.99415380e-01,  2.69023087e-02,
       -2.20094427e-01,  5.46052992e-01,  2.16907784e-01,  1.29838750e-01,
        2.16632456e-01,  1.00481413e-01,  1.00310810e-01,  3.99534285e-01,
       -2.98189186e-02,  5.34334183e-02,  9.43742543e-02,  4.86155242e-01,
       -2.11910844e-01, -1.33301884e-01, -1.08110413e-01, -1.38607889e-01,
        2.11086303e-01,  1.28616765e-01, -8.95413756e-02, -8.12625065e-02,
       -3.82508993e-01, -3.87919515e-01,  1.00999743e-01, -2.98453271e-01,
       -9.80200171e-02, -1.82729408e-01, -1.56083480e-01,  1.18555441e-01,
       -2.11575985e-01,  1.73408687e-01, -3.12379271e-01, -1.22431088e-02,
       -1.20817780e-01, -9.15805101e-02, -3.47406483e+00,  1.13190114e-01,
       -2.10556820e-01, -1.88209862e-02,  3.39024901e-01, -1.94087997e-01,
        2.56889254e-01,  

In [25]:
tmp_df.sentence_embeddings.iloc[0]

[0.11586548388004303,
 -0.1008533462882042,
 -0.42713290452957153,
 -0.08799116313457489,
 -0.2672503590583801,
 -0.19230522215366364,
 0.20131772756576538,
 0.14223617315292358,
 0.11097315698862076,
 -0.5864126682281494,
 0.29941537976264954,
 0.02690230868756771,
 -0.22009442746639252,
 0.5460529923439026,
 0.2169077843427658,
 0.1298387497663498,
 0.21663245558738708,
 0.10048141330480576,
 0.1003108099102974,
 0.39953428506851196,
 -0.02981891855597496,
 0.05343341827392578,
 0.09437425434589386,
 0.486155241727829,
 -0.21191084384918213,
 -0.13330188393592834,
 -0.10811041295528412,
 -0.13860788941383362,
 0.21108630299568176,
 0.12861676514148712,
 -0.08954137563705444,
 -0.08126250654459,
 -0.3825089931488037,
 -0.38791951537132263,
 0.10099974274635315,
 -0.29845327138900757,
 -0.09802001714706421,
 -0.18272940814495087,
 -0.15608347952365875,
 0.11855544149875641,
 -0.21157598495483398,
 0.17340868711471558,
 -0.31237927079200745,
 -0.012243108823895454,
 -0.12081778049468994

In [26]:
(tmp_df.sentence_embeddings.iloc[0] == features[0]).sum()

768

In [27]:
# Sanity check: the first stored embedding must equal the first feature row in
# *every* component. `.all()` is required here — `.sum()` would already be
# truthy if a single component happened to match.
assert (tmp_df.sentence_embeddings.iloc[0] == features[0]).all(), \
    "first stored embedding differs from features[0]"

In [28]:
tmp_df.head(5)

Unnamed: 0,sentence_embeddings,label
0,"[0.11586548388004303, -0.1008533462882042, -0....",0
1,"[-0.034545376896858215, 0.05461045354604721, -...",0
2,"[0.03203139454126358, -0.36519843339920044, -0...",0
3,"[-0.08962065726518631, -0.19808100163936615, 0...",0
4,"[-0.19658051431179047, 0.05862117558717728, -0...",0


In [29]:
tmp_df.label.value_counts()

0    2000
1    1000
Name: label, dtype: int64

In [30]:
batch_1.label.value_counts()

0    2000
1    1000
Name: label, dtype: int64

In [31]:
import dill as pickle

In [None]:
import os

# Create the output directory if it is missing. exist_ok=True makes this a
# single call that is safe to re-run, with no check-then-create race.
os.makedirs('features/', exist_ok=True)

In [32]:
# The notebook only ever creates 'features/', so make sure the directory that
# is actually written to exists before opening the file.
os.makedirs('features_250', exist_ok=True)

# The context manager guarantees the file is flushed and closed even if
# serialisation fails part-way through.
with open('features_250/prompts_sentence_embeddings', 'wb') as fh:
    pickle.dump(tmp_df, fh)

In [34]:
12

12

In [None]:
# Same extraction as the earlier `features = ...` cell: sequence position 0 of
# the first model output element, as a NumPy array.
# NOTE(review): this cell has no execution count and repeats an earlier
# statement — it looks like a leftover; confirm whether it is still needed.
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features

In [None]:
len(features)

In [None]:
# Persist the feature array. Using a context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('features/reuters_sentence_embeddings_batch1', 'wb') as fh:
    pickle.dump(features, fh)

In [None]:
# Reload the stored embeddings; the context manager closes the file promptly.
# WARNING: unpickling executes arbitrary code embedded in the file — only load
# files that this project produced itself.
with open('features/reuters_sentence_embeddings_batch1', 'rb') as fh:
    ar = pickle.load(fh)
len(ar)

In [None]:
ar.head()

In [None]:
(features[0] == ar.sentence_embeddings[0]).sum()

In [None]:
set([len(i) for i in ar])

In [None]:
# Reload the second stored batch; the context manager closes the file promptly.
# WARNING: unpickling executes arbitrary code embedded in the file — only load
# files that this project produced itself.
with open('features/essay_sentence_embeddings_batch2', 'rb') as fh:
    ar1 = pickle.load(fh)

In [None]:
ar.shape, ar1.shape

In [None]:
pd.concat([ar, ar1]).label.value_counts()