In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_reuter_data(verbose=False, words_count=500, base_path=""):
    """Read the Reuters articles from disk and return texts, names and labels.

    The function walks ``<base_path>reuter/data/<source>/<split>/<author>/``
    for every source in ``gpt``, ``human``, ``claude`` and every split in
    ``test``, ``train`` and reads each article file found there as UTF-8.

    Directory listings are sorted, so the order of the returned rows is the
    same on every machine. ``os.listdir`` alone returns entries in arbitrary
    order, which would make any later positional slicing of the result
    irreproducible.

    Args:
        verbose: When true, print the (prefix-stripped) path of every file read.
        words_count: Maximum number of whitespace-separated words kept per
            document; the kept words are re-joined with single spaces.
        base_path: Optional prefix prepended to ``"reuter/data/"`` (for example
            a mounted drive folder ending in a path separator). It is removed
            again from the returned names. Defaults to the working directory.

    Returns:
        A dict with three parallel lists:
        ``"names"`` (file paths), ``"docs"`` (truncated texts) and
        ``"labels"`` (1 for files under ``human``, 0 for ``gpt`` and ``claude``).

    Raises:
        OSError: If one of the expected ``<source>/<split>`` directories is
            missing or a file cannot be read.
    """
    reuter_data_path = base_path + "reuter/data/"
    sources = ["gpt", "human", "claude"]
    skip = ["logprobs", "headlines"]
    splits = ["test", "train"]
    docs, names, labels = [], [], []

    for source in sources:
        # Only human-written text is the positive class.
        label = 1 if source == "human" else 0

        for test_train in splits:
            authors_path = os.path.join(reuter_data_path, source, test_train)

            for author in sorted(os.listdir(authors_path)):
                articles_path = os.path.join(authors_path, author)
                # Ignore stray files sitting next to the author folders.
                if not os.path.isdir(articles_path):
                    continue

                for article in sorted(os.listdir(articles_path)):
                    if article in skip:
                        continue

                    filepath = os.path.join(articles_path, article)
                    # Anything that is not a regular file cannot be opened as text.
                    if not os.path.isfile(filepath):
                        continue

                    with open(filepath, encoding="utf8") as f:
                        doc = f.read()

                    doc = " ".join(doc.split()[:words_count])

                    # Strip only the leading prefix, never later occurrences.
                    if base_path and filepath.startswith(base_path):
                        filepath = filepath[len(base_path):]

                    if verbose:
                        print(filepath)

                    names.append(filepath)
                    docs.append(doc)
                    labels.append(label)

    return {
        "names": names,
        "docs": docs,
        "labels": labels,
    }


In [3]:
import pandas as pd

# Load the corpus with every document cut to its first 250 words.
res = get_reuter_data(words_count=250)

# Tabulate the result: one row per article, columns renamed from the loader's keys.
df = pd.DataFrame(
    {
        column: res[key]
        for column, key in (("txt", "docs"), ("filename", "names"), ("label", "labels"))
    }
)
df.shape

(15000, 3)

In [4]:
df.head()

Unnamed: 0,txt,filename,label
0,A new rule introduced by the U.S. Securities a...,reuter/data/gpt\test\AaronPressman\0.txt,0
1,Washington D.C. - Several members of Congress ...,reuter/data/gpt\test\AaronPressman\1.txt,0
2,Privacy advocates are warning that the Clinton...,reuter/data/gpt\test\AaronPressman\10.txt,0
3,A recent survey conducted by Bankrate has foun...,reuter/data/gpt\test\AaronPressman\11.txt,0
4,A recent report from banking watchdog consumer...,reuter/data/gpt\test\AaronPressman\12.txt,0


In [5]:
df.label.value_counts()

0    10000
1     5000
Name: label, dtype: int64

In [6]:
# Encoder selection. The model class, tokenizer class and checkpoint name are
# held in three variables so a different architecture can be swapped in by
# editing this cell only.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## To use BERT instead of DistilBERT, replace the three assignments above with:
#model_class = ppb.BertModel
#tokenizer_class = ppb.BertTokenizer
#pretrained_weights = 'bert-base-uncased'

# Instantiate the tokenizer and the model from the chosen pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Cut the frame into consecutive positional chunks of 4000 rows; the final
# chunk takes whatever rows remain after position 12000.
batch_1, batch_2, batch_3, batch_4 = (
    df.iloc[start:stop]
    for start, stop in ((0, 4000), (4000, 8000), (8000, 12000), (12000, None))
)

In [8]:
# The cells below always read `batch_1`, so the slice to embed in this run is
# chosen by rebinding that name. Here it points at the third slice
# (rows 8000-11999), which is later saved as "..._batch3".
# NOTE(review): change the right-hand side (and the output filename) to embed
# another slice; re-running the slicing cell above restores the original batch_1.
batch_1 = batch_3

In [9]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [10]:
batch_1.label.value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [11]:
# Encode every document to a list of token ids, including the special tokens.
# `truncation=True` makes the cut-off at `max_length` explicit; without it the
# tokenizer emits a warning and falls back to the same 'longest_first' strategy.
tokenized = batch_1['txt'].apply(
    lambda text: tokenizer.encode(
        text, add_special_tokens=True, truncation=True, max_length=512
    )
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
tokenized.shape, len(tokenized)

((4000,), 4000)

In [13]:
# Length of the longest encoded document in this batch.
max_len = max(map(len, tokenized.values))
# Right-pad every id list with zeros up to max_len so the rows form a rectangular array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [14]:
np.array(padded).shape

(4000, 424)

In [15]:
padded

array([[  101,  1996,  4955, ...,     0,     0,     0],
       [  101,  6151, 15141, ...,     0,     0,     0],
       [  101,  2900,  2097, ...,     0,     0,     0],
       ...,
       [  101, 17534,  2162, ...,     0,     0,     0],
       [  101, 17534,  2162, ...,     0,     0,     0],
       [  101, 17534,  2162, ...,     0,     0,     0]])

In [16]:
# 1 where the padded array holds a non-zero token id, 0 where it holds the zero padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(4000, 424)

In [17]:
attention_mask[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [18]:
df[:8000].label.value_counts()

0    5000
1    3000
Name: label, dtype: int64

In [20]:
from datetime import datetime

# Number of rows sent through the encoder per forward pass. Feeding the whole
# padded array in one call makes the model process every row at once, which
# needs far more memory than the final output itself.
INFERENCE_BATCH_SIZE = 64

start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

hidden_chunks = []
with torch.no_grad():  # inference only, no gradients needed
    for offset in range(0, input_ids.shape[0], INFERENCE_BATCH_SIZE):
        rows = slice(offset, offset + INFERENCE_BATCH_SIZE)
        outputs = model(input_ids[rows], attention_mask=attention_masks[rows])
        # Element 0 of the model output is the last hidden state of this mini-batch.
        hidden_chunks.append(outputs[0])

# Stored as a 1-tuple so `last_hidden_states[0]` is still the full
# (rows, tokens, hidden) tensor that the following cells index into.
last_hidden_states = (torch.cat(hidden_chunks, dim=0),)
end = datetime.now()
end - start

datetime.timedelta(seconds=505, microseconds=279064)

In [35]:
# Scratch arithmetic: two values divided by 60.
# NOTE(review): presumably run times in seconds converted to minutes - confirm or remove.
1590/60, 492/60

(26.5, 8.2)

In [21]:
# From the model output's first element, keep only the vector at token
# position 0 of every sequence and convert it to a NumPy array (one row per document).
# Position 0 is the leading special token added by `add_special_tokens=True`.
features = last_hidden_states[0][:,0,:].numpy()

In [22]:
features

array([[-0.12917136, -0.11269377, -0.04231776, ..., -0.09894063,
         0.28057423,  0.33251143],
       [-0.1269558 , -0.16340186, -0.21135683, ..., -0.00310142,
         0.40551946,  0.3187156 ],
       [-0.19176807, -0.19871856, -0.04027424, ...,  0.05795612,
         0.40362665,  0.22643904],
       ...,
       [-0.12109681, -0.37332872, -0.2734214 , ...,  0.06685681,
         0.5708191 ,  0.29822874],
       [-0.18614723, -0.25707203, -0.29321826, ...,  0.12740672,
         0.67297125,  0.332245  ],
       [-0.20006324, -0.37181276, -0.23296021, ...,  0.04861784,
         0.59235305,  0.2501811 ]], dtype=float32)

In [23]:
len(features)

4000

In [24]:
len(batch_1.label)

4000

In [25]:
# Pair each embedding (stored as a plain Python list) with the label of the
# document it came from. The embeddings are given the batch's own index, so the
# resulting frame keeps the row labels of `batch_1`.
embedding_column = pd.Series(features.tolist(), index=batch_1.index)
tmp_df = pd.DataFrame({
    'sentence_embeddings': embedding_column,
    'label': batch_1.label
})

In [26]:
features[0]

array([-1.29171357e-01, -1.12693772e-01, -4.23177555e-02,  8.87351669e-03,
       -3.32358070e-02, -3.45496163e-02, -8.41776058e-02,  2.28903502e-01,
       -2.34450534e-01,  7.41950050e-02, -1.89976737e-01, -2.30114721e-02,
        7.54706841e-03,  1.18591085e-01, -4.46787745e-01, -1.46592647e-01,
        3.14688310e-02,  3.55174124e-01,  3.73837471e-01,  9.14756432e-02,
       -3.29283774e-01, -1.85392778e-02,  5.30220747e-01,  2.48107582e-01,
        6.64820224e-02, -3.41593534e-01,  2.99163312e-01,  1.39944851e-01,
       -2.32096344e-01, -1.31085426e-01,  1.98571548e-01,  6.27169967e-01,
       -1.43751115e-01, -1.17478274e-01,  2.68375456e-01, -1.25609696e-01,
        1.42157078e-03, -2.51505494e-01, -2.79558152e-01, -1.61018729e-01,
       -1.14210248e-01,  3.97139579e-01, -1.47434762e-02,  3.50017577e-01,
        7.31205493e-02, -6.16819002e-02, -3.27762604e+00, -1.09047532e-01,
       -2.70301044e-01, -3.15119982e-01,  3.05378020e-01, -1.16172731e-02,
        6.01601526e-02,  

In [27]:
tmp_df.sentence_embeddings.iloc[0]

[-0.12917135655879974,
 -0.11269377171993256,
 -0.04231775552034378,
 0.00887351669371128,
 -0.033235806971788406,
 -0.034549616277217865,
 -0.08417760580778122,
 0.22890350222587585,
 -0.2344505339860916,
 0.0741950049996376,
 -0.1899767369031906,
 -0.023011472076177597,
 0.0075470684096217155,
 0.11859108507633209,
 -0.4467877447605133,
 -0.1465926468372345,
 0.03146883100271225,
 0.35517412424087524,
 0.3738374710083008,
 0.09147564321756363,
 -0.32928377389907837,
 -0.018539277836680412,
 0.5302207469940186,
 0.24810758233070374,
 0.06648202240467072,
 -0.34159353375434875,
 0.2991633117198944,
 0.13994485139846802,
 -0.2320963442325592,
 -0.13108542561531067,
 0.19857154786586761,
 0.6271699666976929,
 -0.1437511146068573,
 -0.11747827380895615,
 0.2683754563331604,
 -0.12560969591140747,
 0.0014215707778930664,
 -0.2515054941177368,
 -0.2795581519603729,
 -0.1610187292098999,
 -0.11421024799346924,
 0.3971395790576935,
 -0.014743476174771786,
 0.35001757740974426,
 0.073120549321

In [28]:
(tmp_df.sentence_embeddings.iloc[0] == features[0]).sum()

768

In [29]:
tmp_df.head(5)

Unnamed: 0,sentence_embeddings,label
8000,"[-0.12917135655879974, -0.11269377171993256, -...",1
8001,"[-0.1269558072090149, -0.16340185701847076, -0...",1
8002,"[-0.19176806509494781, -0.19871856272220612, -...",1
8003,"[-0.33469370007514954, -0.1825793832540512, -0...",1
8004,"[-0.235978364944458, -0.2063952535390854, -0.0...",1


In [30]:
tmp_df.label.value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [31]:
batch_1.label.value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [32]:
import dill as pickle

In [None]:
import os

# Create the output directory if it is missing. `exist_ok=True` replaces the
# separate exists-check, so re-running the cell is harmless.
os.makedirs('features/', exist_ok=True)

In [33]:
# Make sure the target directory exists (no other cell creates 'features_250'),
# and write through a context manager so the file is flushed and closed.
os.makedirs('features_250', exist_ok=True)
with open('features_250/reuters_sentence_embeddings_batch3', 'wb') as fh:
    pickle.dump(tmp_df, fh)

In [None]:
# Same extraction as earlier in the notebook: the vector at token position 0 of
# every sequence, taken from the model output's first element, as a NumPy array.
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features

In [None]:
len(features)

In [None]:
# Write through a context manager so the file is flushed and closed.
# NOTE(review): `features` is the NumPy array produced by `.numpy()`, but the
# cells that read this file back use DataFrame attributes (`.head()`,
# `.sentence_embeddings`) - confirm which object is meant to be saved here.
with open('features/reuters_sentence_embeddings_batch1', 'wb') as fh:
    pickle.dump(features, fh)

In [None]:
# Read the saved object back; the context manager closes the file handle.
with open('features/reuters_sentence_embeddings_batch1', 'rb') as fh:
    ar = pickle.load(fh)
len(ar)

In [None]:
ar.head()

In [None]:
# Count the components of the first embedding that survived the save/load
# round trip unchanged. `.iloc[0]` selects the first row by position; a plain
# `[0]` is a label lookup and fails when the saved index does not start at 0.
(features[0] == ar.sentence_embeddings.iloc[0]).sum()

In [None]:
# Distinct lengths of the items obtained by iterating over `ar`.
{len(item) for item in ar}

In [None]:
# Load the second saved file; the context manager closes the file handle.
with open('features/essay_sentence_embeddings_batch2', 'rb') as fh:
    ar1 = pickle.load(fh)

In [None]:
ar.shape, ar1.shape

In [None]:
pd.concat([ar, ar1]).label.value_counts()