In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_reuter_data(verbose=False, words_count=500, base_path=""):
    """Load the Reuters human / GPT / Claude articles from disk.

    Walks ``<base_path>reuter/data/<source>/<split>/<author>/<article>`` for
    every source in ("gpt", "human", "claude") and every split in
    ("test", "train"), reading each article as UTF-8 text.

    Args:
        verbose: When true, print the stored name of every article read.
        words_count: Keep only the first ``words_count`` whitespace-separated
            tokens of each article; the kept tokens are re-joined with single
            spaces.
        base_path: Prefix prepended verbatim to ``"reuter/data/"``; when
            non-empty it must end with a path separator. Defaults to ``""``,
            i.e. paths are resolved against the current working directory.

    Returns:
        dict with three parallel lists:
            "names":  article path with ``base_path`` stripped from the front,
            "docs":   truncated article text,
            "labels": 1 for articles under "human", 0 for every other source.

    Raises:
        OSError: if an expected ``<source>/<split>`` directory does not exist
            or an article cannot be opened.
    """
    data_root = base_path + "reuter/data/"
    sources = ["gpt", "human", "claude"]
    skip = ["logprobs", "headlines"]
    splits = ["test", "train"]
    docs, names, labels = [], [], []

    for source in sources:
        label = 1 if source == "human" else 0
        for split in splits:
            authors_path = os.path.join(data_root, source, split)
            # os.listdir returns entries in arbitrary order; sort so the row
            # order (and any positional slicing done by callers) is reproducible.
            for author in sorted(os.listdir(authors_path)):
                articles_path = os.path.join(authors_path, author)
                if not os.path.isdir(articles_path):
                    # Stray files next to the author folders are not authors.
                    continue
                for article in sorted(os.listdir(articles_path)):
                    if article in skip:
                        continue

                    filepath = os.path.join(articles_path, article)
                    if not os.path.isfile(filepath):
                        # Only regular files can be read as articles.
                        continue

                    with open(filepath, encoding="utf8") as f:
                        doc = " ".join(f.read().split()[:words_count])

                    # Strip only the leading base_path; str.replace would also
                    # remove any later occurrence inside the path.
                    name = filepath
                    if base_path and name.startswith(base_path):
                        name = name[len(base_path):]

                    if verbose:
                        print(name)

                    names.append(name)
                    docs.append(doc)
                    labels.append(label)

    return {
        "names": names,
        "docs": docs,
        "labels": labels,
    }


In [6]:
import pandas as pd

# Load every article, truncated to its first 250 whitespace-separated words.
res = get_reuter_data(words_count=250)

# One row per article: rename the loader's keys to the column names used by the
# rest of the notebook and put them in txt / filename / label order.
df = pd.DataFrame(res).rename(
    columns={"docs": "txt", "names": "filename", "labels": "label"}
)[["txt", "filename", "label"]]
df.shape

(15000, 3)

In [9]:
df.head()

Unnamed: 0,txt,filename,label
0,A new rule introduced by the U.S. Securities a...,reuter/data/gpt\test\AaronPressman\0.txt,0
1,Washington D.C. - Several members of Congress ...,reuter/data/gpt\test\AaronPressman\1.txt,0
2,Privacy advocates are warning that the Clinton...,reuter/data/gpt\test\AaronPressman\10.txt,0
3,A recent survey conducted by Bankrate has foun...,reuter/data/gpt\test\AaronPressman\11.txt,0
4,A recent report from banking watchdog consumer...,reuter/data/gpt\test\AaronPressman\12.txt,0


In [10]:
df.label.value_counts()

0    10000
1     5000
Name: label, dtype: int64

In [11]:
# DistilBERT model class, tokenizer class and the checkpoint name to load.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## To use BERT instead of DistilBERT, uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the pretrained tokenizer and model from the chosen checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Split the frame into four consecutive positional chunks:
# rows 0-3999, 4000-7999, 8000-11999, and 12000 to the end.
batch_1 = df.iloc[:4000]
batch_2 = df.iloc[4000:8000]
batch_3 = df.iloc[8000:12000]
batch_4 = df.iloc[12000:]

In [13]:
# NOTE(review): rebinds batch_1 to the fourth slice, so every later cell that
# reads batch_1 actually processes batch_4 (rows 12000 onward of df).
batch_1 = batch_4

In [14]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [15]:
batch_1.label.value_counts()

0    3000
Name: label, dtype: int64

In [16]:
# Encode each text to a list of token ids (with the tokenizer's special tokens
# added), truncating explicitly to at most 512 ids. Passing max_length without
# truncation=True only emits a warning and falls back to the same
# 'longest_first' strategy, so the encoded output is unchanged.
tokenized = batch_1['txt'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
tokenized.shape, len(tokenized)

((3000,), 3000)

In [18]:
# Right-pad every token-id list with 0 up to the length of the longest one so
# that the lists stack into a rectangular 2-D array.
max_len = max(map(len, tokenized.values))
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [19]:
np.array(padded).shape

(3000, 434)

In [20]:
padded

array([[  101, 21618,  4455, ...,     0,     0,     0],
       [  101,  2694,  8599, ...,     0,     0,     0],
       [  101,  2047,  8843, ...,     0,     0,     0],
       ...,
       [  101,  2859, 11014, ...,     0,     0,     0],
       [  101,  2859,  2000, ...,     0,     0,     0],
       [  101,  2859, 10592, ...,     0,     0,     0]])

In [21]:
# 1 wherever padded holds a non-zero token id, 0 wherever it holds the 0 that
# was used as padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(3000, 434)

In [22]:
attention_mask[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [23]:
df[8000:12000].label.value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [24]:
from datetime import datetime

# Time the tensor conversion together with the forward pass.
start = datetime.now()
input_ids, attention_masks = torch.tensor(padded), torch.tensor(attention_mask)

# Inference only, so autograd is switched off while the model runs.
# NOTE(review): the whole padded array goes through the model in a single
# forward call; consider mini-batching if memory or runtime becomes a problem.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
end - start

datetime.timedelta(seconds=397, microseconds=379597)

In [20]:
1590/60, 492/60

(26.5, 8.2)

In [25]:
# Take element 0 of the model output and keep only index 0 along its second
# axis for every row, then convert to a NumPy array (one vector per input text).
# Presumably index 0 is the special token the tokenizer prepends ([CLS]) — TODO confirm.
features = last_hidden_states[0][:,0,:].numpy()

In [26]:
features

array([[-0.10500193, -0.0461014 , -0.10815503, ...,  0.24913375,
         0.6345957 ,  0.08215468],
       [-0.08686368, -0.16944194, -0.03418288, ..., -0.12612788,
         0.5082677 ,  0.34015152],
       [-0.28139746, -0.10788709, -0.07206146, ..., -0.08298188,
         0.4183575 ,  0.41502616],
       ...,
       [-0.15314826, -0.19530433, -0.05826163, ..., -0.04951119,
         0.51054305,  0.00987758],
       [-0.34330133, -0.13603324, -0.17362063, ..., -0.06201617,
         0.3677088 ,  0.1106162 ],
       [-0.36165982, -0.2837707 , -0.05318333, ..., -0.00123961,
         0.42407584,  0.12968625]], dtype=float32)

In [27]:
len(features)

3000

In [28]:
len(batch_1.label)

3000

In [29]:
# Pair each row's embedding (stored as a plain Python list) with its label,
# keeping batch_1's original row index on both columns.
tmp_df = pd.DataFrame(
    {
        'sentence_embeddings': pd.Series(features.tolist(), index=batch_1.index),
        'label': batch_1['label'],
    }
)

In [30]:
features[0]

array([-1.05001926e-01, -4.61013988e-02, -1.08155027e-01, -1.69214353e-01,
        2.11969033e-01, -5.27162075e-01,  5.75942844e-02, -1.71958745e-01,
        9.83860418e-02, -1.00349598e-01, -1.32177860e-01,  1.32924393e-01,
       -3.25778991e-01,  3.44004899e-01, -3.03696305e-01,  2.03656986e-01,
        1.13418847e-01,  2.38356769e-01,  3.92995000e-01,  1.65899932e-01,
       -4.43479717e-02, -2.42656529e-01,  4.35870588e-01,  2.85592437e-01,
        6.81568086e-02, -3.24937552e-01,  9.89660621e-05, -1.69884741e-01,
       -2.21565574e-01,  2.00237527e-01,  3.07494730e-01,  2.38705724e-01,
       -2.86718726e-01, -3.57185513e-01,  2.96662264e-02,  1.20551825e-01,
        4.14195746e-01, -3.15150082e-01, -2.55433172e-01,  9.50118601e-02,
       -2.06328213e-01,  2.31057495e-01, -1.65699899e-01,  6.32035583e-02,
        1.38941333e-02,  1.41087934e-01, -2.82743096e+00, -7.57237226e-02,
       -2.41071373e-01, -2.51477361e-01,  6.82872832e-02, -3.17212939e-02,
       -3.38436037e-01,  

In [31]:
tmp_df.sentence_embeddings.iloc[0]

[-0.10500192642211914,
 -0.0461013987660408,
 -0.1081550270318985,
 -0.16921435296535492,
 0.2119690328836441,
 -0.5271620750427246,
 0.057594284415245056,
 -0.17195874452590942,
 0.09838604182004929,
 -0.10034959763288498,
 -0.13217785954475403,
 0.1329243928194046,
 -0.325778990983963,
 0.34400489926338196,
 -0.30369630455970764,
 0.20365698635578156,
 0.11341884732246399,
 0.23835676908493042,
 0.3929949998855591,
 0.16589993238449097,
 -0.04434797167778015,
 -0.24265652894973755,
 0.43587058782577515,
 0.2855924367904663,
 0.06815680861473083,
 -0.3249375522136688,
 9.896606206893921e-05,
 -0.16988474130630493,
 -0.22156557440757751,
 0.20023752748966217,
 0.30749472975730896,
 0.23870572447776794,
 -0.2867187261581421,
 -0.3571855127811432,
 0.029666226357221603,
 0.12055182456970215,
 0.4141957461833954,
 -0.31515008211135864,
 -0.25543317198753357,
 0.09501186013221741,
 -0.20632821321487427,
 0.23105749487876892,
 -0.16569989919662476,
 0.06320355832576752,
 0.01389413326978683

In [32]:
(tmp_df.sentence_embeddings.iloc[0] == features[0]).sum()

768

In [33]:
tmp_df.head(5)

Unnamed: 0,sentence_embeddings,label
12000,"[-0.10500192642211914, -0.0461013987660408, -0...",0
12001,"[-0.0868636816740036, -0.16944193840026855, -0...",0
12002,"[-0.2813974618911743, -0.10788708925247192, -0...",0
12003,"[0.08351825177669525, -0.09060043096542358, -0...",0
12004,"[0.07574129849672318, 0.1699775755405426, -0.3...",0


In [34]:
tmp_df.label.value_counts()

0    3000
Name: label, dtype: int64

In [35]:
batch_1.label.value_counts()

0    3000
Name: label, dtype: int64

In [36]:
import dill as pickle

In [None]:
import os

# Create every output directory the cells below write to. Embeddings are saved
# under both 'features/' and 'features_250/', so both must exist on a fresh run.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
for out_dir in ('features/', 'features_250/'):
    os.makedirs(out_dir, exist_ok=True)

In [37]:
# Persist the embeddings frame. The target directory is created if missing, and
# the context manager guarantees the file handle is flushed and closed (the
# bare open(...) previously handed to dump was never closed explicitly).
out_path = 'features_250/reuters_sentence_embeddings_batch4'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, 'wb') as f:
    pickle.dump(tmp_df, f)

In [34]:
12

12

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features

In [None]:
len(features)

In [None]:
# Write the feature array to disk; the context manager closes the file handle
# even if serialization fails part-way through.
with open('features/reuters_sentence_embeddings_batch1', 'wb') as f:
    pickle.dump(features, f)

In [None]:
# NOTE: unpickling can execute arbitrary code — only load files produced by
# this notebook. The context manager closes the file handle after loading.
with open('features/reuters_sentence_embeddings_batch1', 'rb') as f:
    ar = pickle.load(f)
len(ar)

In [None]:
ar.head()

In [None]:
(features[0] == ar.sentence_embeddings[0]).sum()

In [None]:
set([len(i) for i in ar])

In [None]:
# NOTE: unpickling can execute arbitrary code — only load trusted files.
# The context manager closes the file handle after loading.
with open('features/essay_sentence_embeddings_batch2', 'rb') as f:
    ar1 = pickle.load(f)

In [None]:
ar.shape, ar1.shape

In [None]:
pd.concat([ar, ar1]).label.value_counts()