In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_reuter_data(verbose=False, words_count=500):
    PATH = "drive/MyDrive/Fall_2023_Project/ghostbuster/"
    PATH = ""
    REUTER_DATA_PATH = PATH + "reuter/data/"
    required_data = ["gpt", "human", "claude"]
    SKIP = ["logprobs", "headlines"]
    SPLITS = ["test", "train"]
    docs, names, labels = [], [], []

    for split in required_data:
        for test_train in SPLITS:
            authors_path = os.path.join(REUTER_DATA_PATH, split, test_train)
            for author in os.listdir(authors_path):
                articles_path = os.path.join(authors_path, author)
                for article in os.listdir(articles_path):

                    if article in SKIP:
                        continue
                        
                    filepath = os.path.join(articles_path, article)
                    with open(filepath, encoding="utf8") as f:
                        doc = f.read()

                    doc = doc.split()[:words_count]
                    doc = " ".join(doc)

                    filepath = filepath.replace(PATH, "")

                    label = 0
                    if split == "human":
                        label = 1

                    if verbose:
                        print(filepath)

                    names.append(filepath)
                    docs.append(doc)
                    labels.append(label)

    return {
      "names": names,
      "docs": docs,
      "labels": labels
  }


In [3]:
import pandas as pd

res = get_reuter_data(words_count=250)

df = pd.DataFrame({
    "txt": res["docs"],
    "filename": res["names"],
    "label": res["labels"]
})
df.shape

(15000, 3)

In [4]:
df.head()

Unnamed: 0,txt,filename,label
0,A new rule introduced by the U.S. Securities a...,reuter/data/gpt\test\AaronPressman\0.txt,0
1,Washington D.C. - Several members of Congress ...,reuter/data/gpt\test\AaronPressman\1.txt,0
2,Privacy advocates are warning that the Clinton...,reuter/data/gpt\test\AaronPressman\10.txt,0
3,A recent survey conducted by Bankrate has foun...,reuter/data/gpt\test\AaronPressman\11.txt,0
4,A recent report from banking watchdog consumer...,reuter/data/gpt\test\AaronPressman\12.txt,0


In [5]:
df.label.value_counts()

0    10000
1     5000
Name: label, dtype: int64

In [6]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
batch_1 = df[:4000]

In [8]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [9]:
tokenized = batch_1['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
tokenized.shape, len(tokenized)

((4000,), 4000)

In [11]:
max_len = max([len(i) for i in tokenized.values])
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [12]:
np.array(padded).shape

(4000, 400)

In [13]:
padded

array([[  101,  1037,  2047, ...,     0,     0,     0],
       [  101,  2899,  1040, ...,     0,     0,     0],
       [  101,  9394, 13010, ...,     0,     0,     0],
       ...,
       [  101,  5126,  1997, ...,     0,     0,     0],
       [  101,  2605, 18126, ...,     0,     0,     0],
       [  101,  4830,  7974, ...,     0,     0,     0]])

In [14]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(4000, 400)

In [15]:
attention_mask[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [16]:
from datetime import datetime


start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
end - start

datetime.timedelta(seconds=455, microseconds=475159)

In [17]:
455/60

7.583333333333333

In [18]:
features = last_hidden_states[0][:,0,:].numpy()

In [19]:
features

array([[-0.2989698 , -0.17210774, -0.34800154, ...,  0.05440101,
         0.17356539,  0.5624006 ],
       [-0.38940328, -0.23688586, -0.6746397 , ...,  0.01850572,
         0.19393313,  0.48146963],
       [-0.23464464, -0.15419051, -0.60618347, ..., -0.06316832,
         0.11649636,  0.5524039 ],
       ...,
       [-0.0688751 , -0.10304363,  0.01301177, ...,  0.00180029,
         0.56056476,  0.42009455],
       [ 0.02641976, -0.13579503, -0.08534467, ..., -0.07194791,
         0.5769197 ,  0.28063306],
       [-0.20639378,  0.02566491, -0.11941466, ...,  0.01792277,
         0.52604556,  0.24400865]], dtype=float32)

In [20]:
len(features)

4000

In [21]:
len(batch_1.label)

4000

In [22]:
tmp_df = pd.DataFrame({
    'sentence_embeddings': features.tolist(),
    'label': batch_1.label
})

In [23]:
features[0]

array([-2.98969805e-01, -1.72107741e-01, -3.48001540e-01, -7.25538433e-02,
        1.94386721e-01, -3.66080076e-01, -1.01008110e-01, -1.09089494e-01,
       -4.89578284e-02, -4.57509942e-02, -6.66083619e-02,  3.58987749e-02,
       -2.07203537e-01,  1.84446409e-01, -2.97819108e-01, -8.95755924e-03,
        1.94272012e-01,  2.56010175e-01,  4.82510507e-01, -7.56954476e-02,
       -3.47846717e-01, -1.30263954e-01,  3.49175513e-01,  2.29608327e-01,
        1.94765404e-01, -1.26623049e-01, -1.66604578e-01, -1.84588850e-01,
       -1.52588785e-02,  2.60025054e-01,  5.34716725e-01,  4.26551044e-01,
       -4.79605913e-01, -3.18405896e-01,  3.67043495e-01,  7.57046044e-02,
        8.01874846e-02, -4.87957537e-01, -3.68792057e-01,  1.67011023e-01,
       -6.73784018e-02,  4.28408921e-01, -9.96624827e-02,  1.28082752e-01,
       -1.70326605e-02,  1.63248748e-01, -3.34729505e+00, -5.88218197e-02,
       -6.53639138e-02, -2.83565879e-01,  1.46864563e-01, -6.13121092e-02,
       -2.09987432e-01,  

In [24]:
tmp_df.sentence_embeddings[0]

[-0.2989698052406311,
 -0.1721077412366867,
 -0.34800153970718384,
 -0.0725538432598114,
 0.19438672065734863,
 -0.36608007550239563,
 -0.1010081097483635,
 -0.10908949375152588,
 -0.04895782843232155,
 -0.045750994235277176,
 -0.06660836189985275,
 0.03589877486228943,
 -0.20720353722572327,
 0.18444640934467316,
 -0.2978191077709198,
 -0.008957559242844582,
 0.1942720115184784,
 0.25601017475128174,
 0.482510507106781,
 -0.0756954476237297,
 -0.34784671664237976,
 -0.13026395440101624,
 0.34917551279067993,
 0.22960832715034485,
 0.19476540386676788,
 -0.12662304937839508,
 -0.16660457849502563,
 -0.18458884954452515,
 -0.015258878469467163,
 0.2600250542163849,
 0.5347167253494263,
 0.42655104398727417,
 -0.47960591316223145,
 -0.3184058964252472,
 0.36704349517822266,
 0.07570460438728333,
 0.08018748462200165,
 -0.48795753717422485,
 -0.3687920570373535,
 0.16701102256774902,
 -0.06737840175628662,
 0.4284089207649231,
 -0.09966248273849487,
 0.1280827522277832,
 -0.01703266054391

In [25]:
(tmp_df.sentence_embeddings[0] == features[0]).sum()

768

In [27]:
tmp_df.shape

(4000, 2)

In [28]:
import dill as pickle

In [None]:
import os

if not os.path.exists('features_250/'):
    os.mkdir('features_250/')

In [36]:
pickle.dump(tmp_df, open('features_250/reuters_sentence_embeddings_batch1', 'wb'))

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features

In [None]:
len(features)

In [None]:
pickle.dump(features, open('features_250/reuters_sentence_embeddings_batch1', 'wb'))

In [29]:
ar = pickle.load(open('features/reuters_sentence_embeddings_batch1', 'rb'))
len(ar)

4000

In [30]:
ar.head()

Unnamed: 0,sentence_embeddings,label
0,"[-0.3023219108581543, -0.16762316226959229, -0...",0
1,"[-0.3721330463886261, -0.3275570571422577, -0....",0
2,"[-0.3380170166492462, -0.11618966609239578, -0...",0
3,"[-0.25932276248931885, -0.1234612688422203, -0...",0
4,"[-0.37114599347114563, -0.14960415661334991, -...",0


In [31]:
(features[0] == ar.sentence_embeddings[0]).sum()

0

In [32]:
set([len(i) for i in ar])

{5, 19}

In [33]:
ar1 = pickle.load(open('features_250/essay_sentence_embeddings_batch2', 'rb'))

In [34]:
ar.shape, ar1.shape

((4000, 2), (332, 2))

In [35]:
pd.concat([ar, ar1]).label.value_counts()

0    4332
Name: label, dtype: int64