In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_reuter_data(verbose=False, words_count=500):
    PATH = "drive/MyDrive/Fall_2023_Project/ghostbuster/"
    PATH = ""
    REUTER_DATA_PATH = PATH + "reuter/data/"
    required_data = ["gpt", "human", "claude"]
    SKIP = ["logprobs", "headlines"]
    SPLITS = ["test", "train"]
    docs, names, labels = [], [], []

    for split in required_data:
        for test_train in SPLITS:
            authors_path = os.path.join(REUTER_DATA_PATH, split, test_train)
            for author in os.listdir(authors_path):
                articles_path = os.path.join(authors_path, author)
                for article in os.listdir(articles_path):

                    if article in SKIP:
                        continue
                        
                    filepath = os.path.join(articles_path, article)
                    with open(filepath, encoding="utf8") as f:
                        doc = f.read()

                    doc = doc.split()[:words_count]
                    doc = " ".join(doc)

                    filepath = filepath.replace(PATH, "")

                    label = 0
                    if split == "human":
                        label = 1

                    if verbose:
                        print(filepath)

                    names.append(filepath)
                    docs.append(doc)
                    labels.append(label)

    return {
      "names": names,
      "docs": docs,
      "labels": labels
  }


In [3]:
import pandas as pd

res = get_reuter_data()

df = pd.DataFrame({
    "txt": res["docs"],
    "filename": res["names"],
    "label": res["labels"]
})
df.shape

(15000, 3)

In [4]:
df.head()

Unnamed: 0,txt,filename,label
0,A new rule introduced by the U.S. Securities a...,reuter/data/gpt\test\AaronPressman\0.txt,0
1,Washington D.C. - Several members of Congress ...,reuter/data/gpt\test\AaronPressman\1.txt,0
2,Privacy advocates are warning that the Clinton...,reuter/data/gpt\test\AaronPressman\10.txt,0
3,A recent survey conducted by Bankrate has foun...,reuter/data/gpt\test\AaronPressman\11.txt,0
4,A recent report from banking watchdog consumer...,reuter/data/gpt\test\AaronPressman\12.txt,0


In [5]:
df.label.value_counts()

0    10000
1     5000
Name: label, dtype: int64

In [6]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
batch_1 = df[:4000]
batch_2 = df[4000:8000]
batch_3 = df[8000:12000]
batch_4 = df[12000:]

In [8]:
batch_1 = batch_4

In [9]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [10]:
tokenized = batch_1['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [11]:
tokenized.shape, len(tokenized)

((3000,), 3000)

In [12]:
max_len = max([len(i) for i in tokenized.values])
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [13]:
np.array(padded).shape

(3000, 512)

In [14]:
padded

array([[  101, 21618,  4455, ...,     0,     0,     0],
       [  101,  2694,  8599, ...,  1523,  2593,   102],
       [  101,  2047,  8843, ...,     0,     0,     0],
       ...,
       [  101,  2859, 11014, ...,     0,     0,     0],
       [  101,  2859,  2000, ...,     0,     0,     0],
       [  101,  2859, 10592, ...,     0,     0,     0]])

In [15]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(3000, 512)

In [16]:
attention_mask[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0]])

In [17]:
from datetime import datetime


start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
end - start

datetime.timedelta(seconds=492, microseconds=467068)

In [35]:
1590/60, 492/60

(26.5, 8.2)

In [18]:
features = last_hidden_states[0][:,0,:].numpy()

In [19]:
features

array([[-0.07680774, -0.11137304, -0.02526254, ...,  0.22369212,
         0.6088286 ,  0.02941341],
       [-0.10809352, -0.1983093 , -0.08356862, ..., -0.0975038 ,
         0.5172362 ,  0.28442636],
       [-0.23286977, -0.03163859, -0.22474207, ..., -0.04332055,
         0.4975668 ,  0.32170528],
       ...,
       [-0.17496288, -0.19077797, -0.10261291, ..., -0.01908697,
         0.49992877,  0.05342557],
       [-0.31396565, -0.10640437, -0.09141605, ..., -0.0670652 ,
         0.36347446,  0.153258  ],
       [-0.44224557, -0.22254884, -0.05738679, ..., -0.00994536,
         0.4138738 ,  0.09399023]], dtype=float32)

In [20]:
len(features)

3000

In [21]:
len(batch_1.label)

3000

In [22]:
tmp_df = pd.DataFrame({
    'sentence_embeddings': features.tolist(),
    'label': batch_1.label
})

In [23]:
features[0]

array([-7.68077448e-02, -1.11373037e-01, -2.52625421e-02, -1.07050449e-01,
        2.25064039e-01, -5.68270743e-01,  4.24211174e-02, -2.06460863e-01,
        9.47104767e-02, -1.12299144e-01, -1.43664628e-01,  1.58457473e-01,
       -2.82743305e-01,  3.92346799e-01, -3.12081486e-01,  2.91654289e-01,
        1.13023557e-01,  2.94497490e-01,  4.12539303e-01,  1.94901168e-01,
       -6.08205236e-02, -2.25162685e-01,  4.64756072e-01,  2.89317608e-01,
        8.60237181e-02, -3.93789858e-01, -2.85288636e-02, -2.56377041e-01,
       -2.02426404e-01,  2.23384619e-01,  2.61221439e-01,  2.52333641e-01,
       -2.80577004e-01, -3.19554687e-01,  4.38649766e-02,  1.37225449e-01,
        4.66826260e-01, -2.45873988e-01, -2.44252503e-01,  9.05167013e-02,
       -2.83977866e-01,  2.58393019e-01, -1.35332793e-01,  1.12530395e-01,
        2.63129920e-02,  1.49944976e-01, -2.99564672e+00, -9.75274891e-02,
       -1.65287942e-01, -2.78794706e-01,  5.12546301e-02, -5.57250455e-02,
       -2.79690266e-01,  

In [25]:
tmp_df.sentence_embeddings.iloc[0]

[-0.07680774480104446,
 -0.11137303709983826,
 -0.025262542068958282,
 -0.10705044865608215,
 0.22506403923034668,
 -0.568270742893219,
 0.042421117424964905,
 -0.2064608633518219,
 0.09471047669649124,
 -0.11229914426803589,
 -0.1436646282672882,
 0.15845747292041779,
 -0.28274330496788025,
 0.3923467993736267,
 -0.3120814859867096,
 0.2916542887687683,
 0.1130235567688942,
 0.2944974899291992,
 0.4125393033027649,
 0.19490116834640503,
 -0.06082052364945412,
 -0.22516268491744995,
 0.4647560715675354,
 0.28931760787963867,
 0.0860237181186676,
 -0.3937898576259613,
 -0.028528863564133644,
 -0.25637704133987427,
 -0.20242640376091003,
 0.22338461875915527,
 0.26122143864631653,
 0.2523336410522461,
 -0.28057700395584106,
 -0.3195546865463257,
 0.04386497661471367,
 0.1372254490852356,
 0.46682626008987427,
 -0.24587398767471313,
 -0.2442525029182434,
 0.09051670134067535,
 -0.2839778661727905,
 0.2583930194377899,
 -0.13533279299736023,
 0.11253039538860321,
 0.026312991976737976,
 0.

In [26]:
(tmp_df.sentence_embeddings.iloc[0] == features[0]).sum()

768

In [27]:
tmp_df.head(5)

Unnamed: 0,sentence_embeddings,label
12000,"[-0.07680774480104446, -0.11137303709983826, -...",0
12001,"[-0.1080935150384903, -0.1983093023300171, -0....",0
12002,"[-0.23286977410316467, -0.03163858875632286, -...",0
12003,"[-0.11566679179668427, -0.1288173943758011, -0...",0
12004,"[0.02025720477104187, 0.20163628458976746, -0....",0


In [30]:
tmp_df.label.value_counts()

0    3000
Name: label, dtype: int64

In [32]:
batch_1.label.value_counts()

0    3000
Name: label, dtype: int64

In [33]:
import dill as pickle

In [None]:
import os

if not os.path.exists('features/'):
    os.mkdir('features/')

In [34]:
pickle.dump(tmp_df, open('features/reuters_sentence_embeddings_batch4', 'wb'))

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features

In [None]:
len(features)

In [None]:
pickle.dump(features, open('features/reuters_sentence_embeddings_batch1', 'wb'))

In [None]:
ar = pickle.load(open('features/reuters_sentence_embeddings_batch1', 'rb'))
len(ar)

In [None]:
ar.head()

In [None]:
(features[0] == ar.sentence_embeddings[0]).sum()

In [None]:
set([len(i) for i in ar])

In [None]:
ar1 = pickle.load(open('features/essay_sentence_embeddings_batch2', 'rb'))

In [None]:
ar.shape, ar1.shape

In [None]:
pd.concat([ar, ar1]).label.value_counts()