In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
for i in range(100):
    path = f"writing_prompts/data/human/{i}.txt"
    with open(path, encoding='utf8') as f:
        txt = f.read()
    
    if len(txt.split()) > 1500:
        print('found')
        break

In [None]:
len(txt.split())

In [None]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
tokenizer.encode(txt, add_special_tokens=True, max_length=2200)

In [None]:
from transformers import BertTokenizer
tz = BertTokenizer.from_pretrained("bert-base-cased")
tz.convert_tokens_to_ids(["characteristically"])

In [None]:
sent = "He remains characteristically confident and optimistic."
tz.tokenize(sent)

In [None]:
tz.convert_tokens_to_ids(tz.tokenize(sent))

In [None]:
sent = txt
len(tz.tokenize(sent))

In [None]:
len(tz.convert_tokens_to_ids(tz.tokenize(sent)))

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_prompts_data(verbose=False, words_count=500):
    PATH = "drive/MyDrive/Fall_2023_Project/ghostbuster/"
    PATH = ""
    PROMPTS_DATA_PATH = PATH + "writing_prompts/data/"
    required_data = ["gpt", "human", "claude"]
    SKIP = ["logprobs", "headlines", "prompts"]
    SPLITS = ["test", "train"]
    docs, names, labels = [], [], []

    for split in required_data:
        for filename in os.listdir(PROMPTS_DATA_PATH + split):
            filepath = os.path.join(PROMPTS_DATA_PATH, split, filename)

            if filename in SKIP:
                continue

            with open(filepath, encoding="utf8") as f:
                doc = f.read()

            doc = doc.split()[:words_count]
            doc = " ".join(doc)

            filepath = filepath.replace(PATH, "")

            label = 0
            if split == "human":
                label = 1

            if verbose:
                print(filepath)

            names.append(filepath)
            docs.append(doc)
            labels.append(label)

    return {
      "names": names,
      "docs": docs,
      "labels": labels
  }


In [3]:
import pandas as pd

res = get_prompts_data()

df = pd.DataFrame({
    "txt": res["docs"],
    "filename": res["names"],
    "label": res["labels"]
})
df.shape

(3000, 3)

In [4]:
df.head()

Unnamed: 0,txt,filename,label
0,"For years, I had been obsessed with finding th...",writing_prompts/data/gpt\0.txt,0
1,"For centuries, humans had gazed up at the glow...",writing_prompts/data/gpt\1.txt,0
2,The air was charged with excitement as fans al...,writing_prompts/data/gpt\10.txt,0
3,Tonight was unlike any other night. The air wa...,writing_prompts/data/gpt\100.txt,0
4,"As a guardian angel, Michael had seen his fair...",writing_prompts/data/gpt\101.txt,0


In [5]:
df.label.value_counts()

0    2000
1    1000
Name: label, dtype: int64

In [6]:
from transformers import BertTokenizer
tz = BertTokenizer.from_pretrained("bert-base-cased")
tz.convert_tokens_to_ids(["characteristically"])

[100]

In [8]:
tokenized = df.txt.apply(lambda x: tz.convert_tokens_to_ids(tz.tokenize(x)))

In [9]:
max_len = max([len(i) for i in tokenized.values])
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [10]:
np.array(padded).shape

(3000, 903)

In [11]:
padded

array([[ 1370,  1201,   117, ...,     0,     0,     0],
       [ 1370,  3944,   117, ...,     0,     0,     0],
       [ 1109,  1586,  1108, ...,     0,     0,     0],
       ...,
       [ 1109, 10801,   172, ...,     0,     0,     0],
       [ 1109,  1385, 24664, ...,     0,     0,     0],
       [ 1109,  3921, 14009, ...,     0,     0,     0]])

In [12]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(3000, 903)

In [13]:
attention_mask[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [14]:
ppb.BertModel

transformers.models.bert.modeling_bert.BertModel

In [15]:
model_class = ppb.BertModel
model = model_class.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
from datetime import datetime


start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
end - start1

RuntimeError: The expanded size of the tensor (903) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [3000, 903].  Tensor sizes: [1, 512]

# SBERT

In [17]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [22]:
sbert_tokenized = model.encode(df.txt.values.tolist())

In [23]:
sbert_tokenized.shape, len(sbert_tokenized)

((3000, 384), 3000)

In [29]:
len(list([len(i) for i in sbert_tokenized]))

3000

In [30]:
attention_mask = [0] * len(sbert_tokenized)

In [31]:
from datetime import datetime


start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
end - start

TypeError: Sequential.forward() got an unexpected keyword argument 'attention_mask'