In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_reuter_data(verbose=False, words_count=500):
    PATH = "drive/MyDrive/Fall_2023_Project/ghostbuster/"
    PATH = ""
    REUTER_DATA_PATH = PATH + "reuter/data/"
    required_data = ["gpt", "human", "claude"]
    SKIP = ["logprobs", "headlines"]
    SPLITS = ["test", "train"]
    docs, names, labels = [], [], []

    for split in required_data:
        for test_train in SPLITS:
            authors_path = os.path.join(REUTER_DATA_PATH, split, test_train)
            for author in os.listdir(authors_path):
                articles_path = os.path.join(authors_path, author)
                for article in os.listdir(articles_path):

                    if article in SKIP:
                        continue
                        
                    filepath = os.path.join(articles_path, article)
                    with open(filepath, encoding="utf8") as f:
                        doc = f.read()

                    doc = doc.split()[:words_count]
                    doc = " ".join(doc)

                    filepath = filepath.replace(PATH, "")

                    label = 0
                    if split == "human":
                        label = 1

                    if verbose:
                        print(filepath)

                    names.append(filepath)
                    docs.append(doc)
                    labels.append(label)

    return {
      "names": names,
      "docs": docs,
      "labels": labels
  }


In [3]:
import pandas as pd

res = get_reuter_data()

df = pd.DataFrame({
    "txt": res["docs"],
    "filename": res["names"],
    "label": res["labels"]
})
df.shape

(15000, 3)

In [4]:
df.head()

Unnamed: 0,txt,filename,label
0,A new rule introduced by the U.S. Securities a...,reuter/data/gpt\test\AaronPressman\0.txt,0
1,Washington D.C. - Several members of Congress ...,reuter/data/gpt\test\AaronPressman\1.txt,0
2,Privacy advocates are warning that the Clinton...,reuter/data/gpt\test\AaronPressman\10.txt,0
3,A recent survey conducted by Bankrate has foun...,reuter/data/gpt\test\AaronPressman\11.txt,0
4,A recent report from banking watchdog consumer...,reuter/data/gpt\test\AaronPressman\12.txt,0


In [5]:
df.label.value_counts()

0    10000
1     5000
Name: label, dtype: int64

In [6]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
batch_1 = df[:4000]
batch_2 = df[4000:8000]
batch_3 = df[8000:12000]
batch_4 = df[12000:]

In [9]:
batch_1 = batch_2

In [9]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [10]:
batch_1.label.value_counts()

1    3000
0    1000
Name: label, dtype: int64

In [11]:
tokenized = batch_1['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
tokenized.shape, len(tokenized)

((4000,), 4000)

In [13]:
max_len = max([len(i) for i in tokenized.values])
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [14]:
np.array(padded).shape

(4000, 512)

In [15]:
padded

array([[  101,  2120,  8203, ...,  2421, 16371,   102],
       [  101,  2120,  8203, ...,     0,     0,     0],
       [  101, 22245,  3886, ..., 19939,  3310,   102],
       ...,
       [  101,  2974, 15768, ..., 10108,  1010,   102],
       [  101,  6207,  3274, ...,  1005,  1055,   102],
       [  101, 12535, 13058, ..., 16664,  6512,   102]])

In [16]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(4000, 512)

In [17]:
attention_mask[:5]

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [18]:
df[:8000].label.value_counts()

0    5000
1    3000
Name: label, dtype: int64

In [19]:
from datetime import datetime


start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
end - start

datetime.timedelta(seconds=1672, microseconds=310655)

In [35]:
1590/60, 492/60

(26.5, 8.2)

In [20]:
features = last_hidden_states[0][:,0,:].numpy()

In [21]:
features

array([[-0.0693151 , -0.12781562,  0.0828902 , ...,  0.04343815,
         0.42203382,  0.3659998 ],
       [-0.04662348, -0.00747705,  0.23425627, ...,  0.14383985,
         0.43076766,  0.35551485],
       [ 0.05893181,  0.04184513, -0.31971806, ..., -0.04637596,
         0.44957623,  0.1477008 ],
       ...,
       [-0.2388515 , -0.22968979,  0.01584487, ..., -0.08623928,
         0.56943476,  0.34156415],
       [-0.41345277, -0.05492041, -0.17189497, ..., -0.06285364,
         0.60592586,  0.37155247],
       [-0.31953323, -0.18802035, -0.12650247, ..., -0.01819845,
         0.67887324,  0.41901055]], dtype=float32)

In [22]:
len(features)

4000

In [23]:
len(batch_1.label)

4000

In [24]:
tmp_df = pd.DataFrame({
    'sentence_embeddings': features.tolist(),
    'label': batch_1.label
})

In [25]:
features[0]

array([-6.93150982e-02, -1.27815619e-01,  8.28901976e-02, -2.67907083e-01,
        1.42412558e-01,  4.64900136e-02,  3.07008568e-02,  1.50577143e-01,
        9.61278155e-02, -7.22398385e-02, -2.24975422e-01, -3.96960638e-02,
        3.73028934e-01,  3.36471140e-01,  1.70836136e-01,  1.03410687e-02,
        1.56194091e-01,  4.47357863e-01,  3.67190778e-01, -1.15187593e-01,
       -4.83881891e-01, -4.47477847e-01,  5.24978995e-01,  4.18364517e-02,
        4.06065881e-01, -1.31874263e-01,  2.13312119e-01, -2.11114109e-01,
        1.29409164e-01, -6.33016601e-03,  3.70583355e-01,  1.73523560e-01,
       -3.50347549e-01, -3.50906402e-01,  3.54573816e-01,  1.68166421e-02,
       -5.67246415e-02, -2.24730164e-01, -4.23528969e-01,  1.23075858e-01,
       -4.63651828e-02,  2.49858320e-01,  9.93030369e-02,  3.77486289e-01,
       -2.15913281e-01, -1.62289172e-01, -3.64367461e+00, -2.67901838e-01,
       -2.06074677e-02, -7.54585341e-02,  3.53979945e-01, -1.20049089e-01,
       -2.92682022e-01,  

In [26]:
tmp_df.sentence_embeddings.iloc[0]

[-0.0693150982260704,
 -0.1278156191110611,
 0.08289019763469696,
 -0.2679070830345154,
 0.14241255819797516,
 0.04649001359939575,
 0.03070085681974888,
 0.1505771428346634,
 0.09612781554460526,
 -0.07223983854055405,
 -0.22497542202472687,
 -0.039696063846349716,
 0.3730289340019226,
 0.3364711403846741,
 0.17083613574504852,
 0.010341068729758263,
 0.15619409084320068,
 0.4473578631877899,
 0.36719077825546265,
 -0.11518759280443192,
 -0.4838818907737732,
 -0.4474778473377228,
 0.5249789953231812,
 0.0418364517390728,
 0.4060658812522888,
 -0.13187426328659058,
 0.21331211924552917,
 -0.21111410856246948,
 0.12940916419029236,
 -0.006330166012048721,
 0.37058335542678833,
 0.17352356016635895,
 -0.3503475487232208,
 -0.3509064018726349,
 0.3545738160610199,
 0.0168166421353817,
 -0.05672464147210121,
 -0.22473016381263733,
 -0.4235289692878723,
 0.12307585775852203,
 -0.04636518284678459,
 0.2498583197593689,
 0.09930303692817688,
 0.37748628854751587,
 -0.21591328084468842,
 -0.16

In [27]:
(tmp_df.sentence_embeddings.iloc[0] == features[0]).sum()

768

In [28]:
tmp_df.head(5)

Unnamed: 0,sentence_embeddings,label
4000,"[-0.0693150982260704, -0.1278156191110611, 0.0...",0
4001,"[-0.04662347584962845, -0.007477045990526676, ...",0
4002,"[0.058931805193424225, 0.04184512794017792, -0...",0
4003,"[-0.02420114167034626, 0.06432699412107468, -0...",0
4004,"[-0.019325844943523407, -0.18398965895175934, ...",0


In [29]:
tmp_df.label.value_counts()

1    3000
0    1000
Name: label, dtype: int64

In [30]:
batch_1.label.value_counts()

1    3000
0    1000
Name: label, dtype: int64

In [31]:
import dill as pickle

In [None]:
import os

if not os.path.exists('features/'):
    os.mkdir('features/')

In [32]:
pickle.dump(tmp_df, open('features/reuters_sentence_embeddings_batch2', 'wb'))

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features

In [None]:
len(features)

In [None]:
pickle.dump(features, open('features/reuters_sentence_embeddings_batch1', 'wb'))

In [None]:
ar = pickle.load(open('features/reuters_sentence_embeddings_batch1', 'rb'))
len(ar)

In [None]:
ar.head()

In [None]:
(features[0] == ar.sentence_embeddings[0]).sum()

In [None]:
set([len(i) for i in ar])

In [None]:
ar1 = pickle.load(open('features/essay_sentence_embeddings_batch2', 'rb'))

In [None]:
ar.shape, ar1.shape

In [None]:
pd.concat([ar, ar1]).label.value_counts()