In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_reuter_data(verbose=False, words_count=500, base_path=""):
    """Load the Reuters articles of every source into parallel lists.

    Walks ``<base_path>/reuter/data/<source>/<split>/<author>/`` for the
    sources ``gpt``, ``human`` and ``claude`` and the splits ``test`` and
    ``train``, and reads every article file found in each author directory.
    Entries named ``logprobs`` or ``headlines`` are skipped.

    Args:
        verbose: When true, print the (base-path-relative) path of each
            article as it is read.
        words_count: Maximum number of whitespace-separated words kept per
            article; longer texts are truncated to this many words.
        base_path: Directory that contains the ``reuter/data/`` tree. The
            default ``""`` resolves it relative to the current working
            directory.

    Returns:
        A dict with three equally long lists:
        ``"names"`` (article paths relative to ``base_path``),
        ``"docs"`` (truncated texts with whitespace collapsed to single
        spaces) and ``"labels"`` (1 for ``human`` articles, 0 otherwise).

    Raises:
        OSError: If an expected ``<source>/<split>`` directory is missing.
    """
    data_root = os.path.join(base_path, "reuter/data/")
    sources = ["gpt", "human", "claude"]
    skip = {"logprobs", "headlines"}
    splits = ["test", "train"]
    docs, names, labels = [], [], []

    for source in sources:
        label = 1 if source == "human" else 0
        for split in splits:
            authors_path = os.path.join(data_root, source, split)
            # os.listdir order is arbitrary; sort so that row positions are
            # reproducible (callers slice the result by position).
            for author in sorted(os.listdir(authors_path)):
                articles_path = os.path.join(authors_path, author)
                if not os.path.isdir(articles_path):
                    # Stray files next to the author folders are not authors.
                    continue
                for article in sorted(os.listdir(articles_path)):
                    if article in skip:
                        continue

                    filepath = os.path.join(articles_path, article)
                    if not os.path.isfile(filepath):
                        # Any other sub-directory cannot be read as an article.
                        continue

                    with open(filepath, encoding="utf8") as f:
                        text = f.read()

                    # Keep the first `words_count` words, joined by single spaces.
                    doc = " ".join(text.split()[:words_count])

                    # Report paths relative to the base directory.
                    name = os.path.relpath(filepath, base_path) if base_path else filepath

                    if verbose:
                        print(name)

                    names.append(name)
                    docs.append(doc)
                    labels.append(label)

    return {
        "names": names,
        "docs": docs,
        "labels": labels,
    }


In [3]:
import pandas as pd

# Load the corpus, keeping only the first 250 words of every article.
res = get_reuter_data(words_count=250)

# Tabulate the loader output: rename its keys to the column names used by
# the later cells and put the columns in the order txt, filename, label.
df = (
    pd.DataFrame(res)
    .rename(columns={"docs": "txt", "names": "filename", "labels": "label"})
    [["txt", "filename", "label"]]
)
df.shape

(15000, 3)

In [4]:
df.head()

Unnamed: 0,txt,filename,label
0,A new rule introduced by the U.S. Securities a...,reuter/data/gpt\test\AaronPressman\0.txt,0
1,Washington D.C. - Several members of Congress ...,reuter/data/gpt\test\AaronPressman\1.txt,0
2,Privacy advocates are warning that the Clinton...,reuter/data/gpt\test\AaronPressman\10.txt,0
3,A recent survey conducted by Bankrate has foun...,reuter/data/gpt\test\AaronPressman\11.txt,0
4,A recent report from banking watchdog consumer...,reuter/data/gpt\test\AaronPressman\12.txt,0


In [5]:
df.label.value_counts()

0    10000
1     5000
Name: label, dtype: int64

In [6]:
# Model selection — DistilBERT by default.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## To use BERT instead of DistilBERT, uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the tokenizer and the model from the selected pretrained weights.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Cut the frame into consecutive positional slices of 4000 rows each;
# the last slice takes whatever remains after row 12000.
batch_1, batch_2, batch_3, batch_4 = (
    df.iloc[:4000],
    df.iloc[4000:8000],
    df.iloc[8000:12000],
    df.iloc[12000:],
)

In [8]:
# Rebinds batch_1 to the second slice (rows 4000-7999 of df) so that the
# cells below, which all read `batch_1`, process that slice instead.
# NOTE(review): this makes the notebook order-dependent — after this cell the
# original first slice is no longer reachable under the name batch_1.
batch_1 = batch_2

In [9]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [10]:
# Encode each text to token ids with the special tokens added.
# truncation=True makes the 512-token cap explicit; without it the tokenizer
# warns and falls back to the same 'longest_first' truncation anyway.
tokenized = batch_1['txt'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [11]:
tokenized.shape, len(tokenized)

((4000,), 4000)

In [12]:
# Length of the longest token-id list in this batch.
max_len = max(map(len, tokenized.values))
# Right-pad every id list with 0 up to max_len so the rows form a rectangular array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [13]:
np.array(padded).shape

(4000, 434)

In [14]:
padded

array([[  101,  2120,  8203, ...,     0,     0,     0],
       [  101,  2120,  8203, ...,     0,     0,     0],
       [  101, 22245,  3886, ...,     0,     0,     0],
       ...,
       [  101,  2974, 15768, ...,     0,     0,     0],
       [  101,  6207,  3274, ...,     0,     0,     0],
       [  101, 12535, 13058, ...,     0,     0,     0]])

In [15]:
# 1 wherever the padded id is non-zero, 0 on the zero-padded positions.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(4000, 434)

In [16]:
attention_mask[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [17]:
from datetime import datetime


# Wall-clock timing covers both the tensor conversion and the forward pass.
start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

# no_grad: this is inference only, so gradient tracking is switched off.
with torch.no_grad():
    # The whole padded batch is passed through the model in a single call.
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
# The elapsed time is the cell's displayed value.
end - start

datetime.timedelta(seconds=613, microseconds=229177)

In [18]:
# Scratch arithmetic: divides 1590 and 492 by 60.
# NOTE(review): presumably converts two earlier run durations from seconds to
# minutes — confirm, or remove this cell.
1590/60, 492/60

(26.5, 8.2)

In [19]:
# Take element 0 of the model output and keep, for every row, only the vector
# at sequence position 0, converted to a NumPy array.
# NOTE(review): position 0 holds id 101 in the padded ids shown above —
# presumably the [CLS] token added by add_special_tokens=True; confirm.
features = last_hidden_states[0][:,0,:].numpy()

In [20]:
features

array([[ 0.03211504, -0.06255466,  0.06741614, ...,  0.0210871 ,
         0.41911998,  0.35779613],
       [-0.04662368, -0.00747711,  0.23425615, ...,  0.14383982,
         0.43076757,  0.3555147 ],
       [ 0.06204259,  0.06201564, -0.30686516, ..., -0.11237683,
         0.3737894 ,  0.13809752],
       ...,
       [-0.2243382 , -0.2323296 ,  0.02997141, ..., -0.09007698,
         0.55315304,  0.33067623],
       [-0.34614563,  0.03404425, -0.10175318, ..., -0.06368355,
         0.5269702 ,  0.34994292],
       [-0.31056613, -0.24582033, -0.13122159, ..., -0.04836513,
         0.6493024 ,  0.3968798 ]], dtype=float32)

In [21]:
len(features)

4000

In [22]:
len(batch_1.label)

4000

In [23]:
# Pair each embedding (stored as a plain Python list) with its label.
# The embeddings are given batch_1's index so both columns share the
# original row labels of the batch.
tmp_df = pd.DataFrame({
    'sentence_embeddings': pd.Series(features.tolist(), index=batch_1.index),
    'label': batch_1['label'],
})

In [24]:
features[0]

array([ 3.21150422e-02, -6.25546649e-02,  6.74161389e-02, -3.14687371e-01,
        1.40174061e-01,  3.84835228e-02,  2.23053060e-02,  1.42491847e-01,
        1.14638180e-01, -1.19243018e-01, -2.45558485e-01,  3.82302105e-02,
        3.98791224e-01,  2.94665158e-01,  1.72925964e-01,  2.70161498e-02,
        1.14758722e-01,  4.78691846e-01,  3.17352235e-01, -1.27124339e-01,
       -4.66718554e-01, -5.10136843e-01,  4.77542400e-01,  1.08532131e-01,
        4.03909981e-01, -1.00003242e-01,  2.10358769e-01, -2.14212865e-01,
        1.94768071e-01, -1.93026215e-02,  3.85695130e-01,  1.90279067e-01,
       -3.63078207e-01, -3.71888548e-01,  3.17761779e-01,  3.69730704e-02,
       -5.32016903e-03, -1.66930452e-01, -3.49512994e-01,  1.44748434e-01,
       -2.70836167e-02,  2.41013497e-01,  9.04739201e-02,  2.92111754e-01,
       -2.02381283e-01, -1.56140476e-01, -3.49340606e+00, -2.78534949e-01,
        3.75201330e-02, -5.91834188e-02,  3.84409845e-01, -1.12139180e-01,
       -3.21866095e-01,  

In [25]:
tmp_df.sentence_embeddings.iloc[0]

[0.032115042209625244,
 -0.06255466490983963,
 0.06741613894701004,
 -0.3146873712539673,
 0.14017406105995178,
 0.038483522832393646,
 0.022305306047201157,
 0.14249184727668762,
 0.11463817954063416,
 -0.11924301832914352,
 -0.2455584853887558,
 0.03823021054267883,
 0.39879122376441956,
 0.2946651577949524,
 0.17292596399784088,
 0.02701614983379841,
 0.11475872248411179,
 0.47869184613227844,
 0.3173522353172302,
 -0.1271243393421173,
 -0.46671855449676514,
 -0.5101368427276611,
 0.4775424003601074,
 0.1085321307182312,
 0.40390998125076294,
 -0.10000324249267578,
 0.21035876870155334,
 -0.21421286463737488,
 0.19476807117462158,
 -0.019302621483802795,
 0.38569512963294983,
 0.1902790665626526,
 -0.36307820677757263,
 -0.37188854813575745,
 0.31776177883148193,
 0.03697307035326958,
 -0.005320169031620026,
 -0.1669304519891739,
 -0.3495129942893982,
 0.14474843442440033,
 -0.027083616703748703,
 0.24101349711418152,
 0.09047392010688782,
 0.29211175441741943,
 -0.20238128304481506

In [26]:
# Sanity check: counts the positions at which the first stored embedding list
# equals the first row of `features`. A full match equals the vector length
# (the saved output shows 768).
(tmp_df.sentence_embeddings.iloc[0] == features[0]).sum()

768

In [27]:
tmp_df.head(5)

Unnamed: 0,sentence_embeddings,label
4000,"[0.032115042209625244, -0.06255466490983963, 0...",0
4001,"[-0.046623680740594864, -0.007477111183106899,...",0
4002,"[0.06204259395599365, 0.06201563775539398, -0....",0
4003,"[-0.02420113794505596, 0.06432697921991348, -0...",0
4004,"[-0.019325844943523407, -0.18398965895175934, ...",0


In [28]:
tmp_df.label.value_counts()

1    3000
0    1000
Name: label, dtype: int64

In [29]:
batch_1.label.value_counts()

1    3000
0    1000
Name: label, dtype: int64

In [30]:
import dill as pickle

In [None]:
import os

# Create the output directory if needed; exist_ok avoids the separate
# existence check (and its race) and makes the cell safe to re-run.
os.makedirs('features/', exist_ok=True)

In [31]:
# The target directory is not created anywhere else in the notebook, so make
# sure it exists, and use a context manager so the file handle is closed
# (and the data flushed) as soon as the dump finishes.
os.makedirs('features_250', exist_ok=True)
with open('features_250/reuters_sentence_embeddings_batch2', 'wb') as out_file:
    pickle.dump(tmp_df, out_file)

In [None]:
# Same extraction as the earlier `features` cell: element 0 of the model
# output, sequence position 0 of every row, as a NumPy array.
# NOTE(review): this unexecuted cell duplicates the earlier one — likely a
# leftover that can be removed; confirm.
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features

In [None]:
len(features)

In [None]:
# Write the raw feature array; the context manager closes the file handle
# deterministically instead of leaving it to garbage collection.
with open('features/reuters_sentence_embeddings_batch1', 'wb') as out_file:
    pickle.dump(features, out_file)

In [None]:
# Read the stored batch back; the context manager closes the file handle.
# Unpickling executes arbitrary code — only load files produced by this notebook.
# NOTE(review): later cells call DataFrame methods on `ar`, while the dump cell
# for this path writes the NumPy `features` array — confirm which object is stored.
with open('features/reuters_sentence_embeddings_batch1', 'rb') as in_file:
    ar = pickle.load(in_file)
len(ar)

In [None]:
ar.head()

In [None]:
(features[0] == ar.sentence_embeddings[0]).sum()

In [None]:
set([len(i) for i in ar])

In [None]:
# Load the second stored batch; the context manager closes the file handle.
# Unpickling executes arbitrary code — only load trusted files.
with open('features/essay_sentence_embeddings_batch2', 'rb') as in_file:
    ar1 = pickle.load(in_file)

In [None]:
ar.shape, ar1.shape

In [None]:
pd.concat([ar, ar1]).label.value_counts()