In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_reuter_data(verbose=False, words_count=500, base_path=""):
    """Load the Reuters articles and label them as human- or model-written.

    Walks ``<base_path>reuter/data/<source>/<split>/<author>/<article>`` for
    every source in ("gpt", "human", "claude") and every split in
    ("test", "train"), reading each article as UTF-8 text. Entries named
    "logprobs" or "headlines" inside an author directory are skipped.

    Args:
        verbose: When True, print the path of every article as it is read
            (with ``base_path`` stripped).
        words_count: Maximum number of whitespace-separated words kept from
            each article; the kept words are re-joined with single spaces.
        base_path: Prefix placed in front of ``reuter/data/``. Defaults to ""
            (paths relative to the working directory). It is concatenated
            as-is, so include the trailing separator, e.g.
            "drive/MyDrive/Fall_2023_Project/ghostbuster/".

    Returns:
        dict with three parallel lists:
            "names":  article paths with ``base_path`` removed from the front,
            "docs":   the truncated article texts,
            "labels": 1 for articles under "human", 0 for "gpt" and "claude".

    Raises:
        OSError: if an expected directory cannot be listed or a file cannot
            be opened.
    """
    reuter_data_path = base_path + "reuter/data/"
    required_data = ["gpt", "human", "claude"]
    skip = ["logprobs", "headlines"]
    splits = ["test", "train"]
    docs, names, labels = [], [], []

    for split in required_data:
        # Only the "human" source is the positive class.
        label = 1 if split == "human" else 0

        for test_train in splits:
            authors_path = os.path.join(reuter_data_path, split, test_train)
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # row order stable so positional slicing of the result is
            # reproducible across machines and runs.
            for author in sorted(os.listdir(authors_path)):
                articles_path = os.path.join(authors_path, author)
                for article in sorted(os.listdir(articles_path)):
                    if article in skip:
                        continue

                    filepath = os.path.join(articles_path, article)
                    with open(filepath, encoding="utf8") as f:
                        doc = f.read()

                    # Keep at most `words_count` words.
                    doc = " ".join(doc.split()[:words_count])

                    # filepath was built from base_path, so it always starts
                    # with it; drop that prefix for the reported name.
                    name = filepath[len(base_path):]

                    if verbose:
                        print(name)

                    names.append(name)
                    docs.append(doc)
                    labels.append(label)

    return {
        "names": names,
        "docs": docs,
        "labels": labels,
    }


In [3]:
import pandas as pd

# Read the corpus, then lay it out as one row per article with the
# column names used by the rest of the notebook.
res = get_reuter_data()

df = pd.DataFrame(res).rename(
    columns={"docs": "txt", "names": "filename", "labels": "label"}
)[["txt", "filename", "label"]]
df.shape

(15000, 3)

In [4]:
df.head()

Unnamed: 0,txt,filename,label
0,A new rule introduced by the U.S. Securities a...,reuter/data/gpt\test\AaronPressman\0.txt,0
1,Washington D.C. - Several members of Congress ...,reuter/data/gpt\test\AaronPressman\1.txt,0
2,Privacy advocates are warning that the Clinton...,reuter/data/gpt\test\AaronPressman\10.txt,0
3,A recent survey conducted by Bankrate has foun...,reuter/data/gpt\test\AaronPressman\11.txt,0
4,A recent report from banking watchdog consumer...,reuter/data/gpt\test\AaronPressman\12.txt,0


In [5]:
df.label.value_counts()

0    10000
1     5000
Name: label, dtype: int64

In [6]:
# Checkpoint and classes to load. DistilBERT is the active choice.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

## To use BERT instead of DistilBERT, swap in the three lines below:
#pretrained_weights = 'bert-base-uncased'
#model_class = ppb.BertModel
#tokenizer_class = ppb.BertTokenizer

# Instantiate the tokenizer and the model from the pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Cut the frame into four consecutive row ranges by position; the last
# range takes whatever rows remain after 12000.
batch_1 = df.iloc[:4000]
batch_2 = df.iloc[4000:8000]
batch_3 = df.iloc[8000:12000]
batch_4 = df.iloc[12000:]

In [8]:
# Rebind `batch_1` to the third slice. The later cells that read `batch_1`
# (tokenization, label counts, tmp_df) therefore operate on batch_3's rows,
# and the original rows 0-3999 are no longer reachable under this name.
batch_1 = batch_3

In [9]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [10]:
batch_1.label.value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [11]:
# Encode every article to a list of token ids, with the tokenizer's special
# tokens added. truncation=True makes the cut at max_length explicit; without
# it the tokenizer only truncates as a fallback and emits a warning per call.
tokenized = batch_1['txt'].apply(
    lambda x: tokenizer.encode(
        x, add_special_tokens=True, max_length=512, truncation=True
    )
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
tokenized.shape, len(tokenized)

((4000,), 4000)

In [13]:
# Right-pad every id list with zeros up to the length of the longest one,
# so the rows can be stacked into a single rectangular array.
token_lengths = [len(ids) for ids in tokenized.values]
max_len = max(token_lengths)
padded = np.array([
    ids + [0] * (max_len - n_ids)
    for ids, n_ids in zip(tokenized.values, token_lengths)
])

In [14]:
np.array(padded).shape

(4000, 512)

In [15]:
padded

array([[  101,  1996,  4955, ...,  1996,  3988,   102],
       [  101,  6151, 15141, ...,  1015,  1010,   102],
       [  101,  2900,  2097, ...,  2056,  1012,   102],
       ...,
       [  101, 17534,  2162, ...,  3643,  2000,   102],
       [  101, 17534,  2162, ...,  2391,  2003,   102],
       [  101, 17534,  2162, ...,     0,     0,     0]])

In [16]:
# 1 wherever the padded array holds a non-zero id, 0 on the zero padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(4000, 512)

In [17]:
attention_mask[:5]

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [33]:
df[8000:12000].label.value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [19]:
from datetime import datetime

# Rows sent through the model per forward pass. Feeding every row in a single
# call makes the model hold the activations for the whole set at once;
# mini-batches bound that peak to FORWARD_BATCH_SIZE rows.
FORWARD_BATCH_SIZE = 32

start = datetime.now()
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

hidden_chunks = []
with torch.no_grad():  # inference only, no autograd bookkeeping
    for first_row in range(0, input_ids.shape[0], FORWARD_BATCH_SIZE):
        rows = slice(first_row, first_row + FORWARD_BATCH_SIZE)
        chunk_output = model(input_ids[rows], attention_mask=attention_masks[rows])
        # Element 0 of the model output is what the following cells read
        # through `last_hidden_states[0]`.
        hidden_chunks.append(chunk_output[0])

# Stored as a 1-tuple so `last_hidden_states[0]` keeps returning the hidden
# states for all rows, in the original row order.
last_hidden_states = (torch.cat(hidden_chunks, dim=0),)
end = datetime.now()
end - start

datetime.timedelta(seconds=1450, microseconds=53727)

In [20]:
1590/60, 492/60

(26.5, 8.2)

In [21]:
# From element 0 of the model output, keep only sequence position 0 of every
# row and convert it to a NumPy array (one vector per input row).
# Position 0 is presumably the [CLS] token (id 101 in `padded`) — confirm.
features = last_hidden_states[0][:,0,:].numpy()

In [22]:
features

array([[-0.08560184, -0.0310876 , -0.12356327, ...,  0.01909603,
         0.28565162,  0.41029936],
       [-0.16037434, -0.15253724, -0.24905007, ...,  0.03209493,
         0.42289144,  0.3775603 ],
       [-0.20680141, -0.18777274, -0.16987365, ...,  0.02910433,
         0.40314105,  0.2644435 ],
       ...,
       [-0.13748454, -0.28544745, -0.3046983 , ..., -0.03286542,
         0.6030063 ,  0.3423655 ],
       [-0.14238724, -0.19999918, -0.31568187, ...,  0.04521384,
         0.71235317,  0.32038876],
       [-0.1278193 , -0.30846247, -0.2870111 , ...,  0.05664979,
         0.6538957 ,  0.28885946]], dtype=float32)

In [23]:
len(features)

4000

In [24]:
len(batch_1.label)

4000

In [25]:
# Pair each row's embedding (stored as a plain Python list) with its label.
# `batch_1.label` is a Series, so tmp_df takes over batch_1's row index
# rather than starting again from 0.
tmp_df = pd.DataFrame({
    'sentence_embeddings': features.tolist(),
    'label': batch_1.label
})

In [26]:
features[0]

array([-8.56018364e-02, -3.10875997e-02, -1.23563275e-01, -1.64764561e-03,
       -1.10752076e-01, -2.03529522e-02, -4.21053916e-02,  1.50466204e-01,
       -2.66213417e-01,  1.57059312e-01, -3.13614815e-01, -1.22202292e-01,
       -9.91599867e-04,  1.49744004e-01, -5.18968821e-01, -6.91307262e-02,
        8.52116197e-03,  3.04158568e-01,  3.37418318e-01,  1.49457991e-01,
       -2.55234629e-01,  3.65912952e-02,  5.43671131e-01,  3.02406043e-01,
        1.00276008e-01, -4.15795743e-01,  3.66259038e-01,  1.96699798e-01,
       -2.49277025e-01, -1.38078883e-01,  1.68826848e-01,  5.70123851e-01,
       -1.21203668e-01, -1.16327599e-01,  2.31364489e-01, -1.87927261e-01,
        8.02928209e-02, -1.57011420e-01, -2.99619585e-01, -1.40291244e-01,
       -1.56974524e-01,  4.10617977e-01, -2.26694606e-02,  3.20510298e-01,
        1.83360264e-01, -8.48685205e-02, -3.25884628e+00, -1.48479730e-01,
       -2.92785704e-01, -2.89614499e-01,  4.09617603e-01,  8.40832144e-02,
        1.32868558e-01,  

In [27]:
tmp_df.sentence_embeddings.iloc[0]

[-0.08560183644294739,
 -0.03108759969472885,
 -0.12356327474117279,
 -0.0016476456075906754,
 -0.11075207591056824,
 -0.020352952182292938,
 -0.04210539162158966,
 0.1504662036895752,
 -0.26621341705322266,
 0.15705931186676025,
 -0.3136148154735565,
 -0.12220229208469391,
 -0.0009915998671203852,
 0.14974400401115417,
 -0.5189688205718994,
 -0.06913072615861893,
 0.00852116197347641,
 0.3041585683822632,
 0.3374183177947998,
 0.14945799112319946,
 -0.25523462891578674,
 0.0365912951529026,
 0.5436711311340332,
 0.30240604281425476,
 0.10027600824832916,
 -0.4157957434654236,
 0.36625903844833374,
 0.19669979810714722,
 -0.2492770254611969,
 -0.13807888329029083,
 0.1688268482685089,
 0.5701238512992859,
 -0.12120366841554642,
 -0.11632759869098663,
 0.23136448860168457,
 -0.1879272609949112,
 0.08029282093048096,
 -0.15701141953468323,
 -0.29961958527565,
 -0.1402912437915802,
 -0.15697452425956726,
 0.41061797738075256,
 -0.022669460624456406,
 0.32051029801368713,
 0.18336026370525

In [28]:
(tmp_df.sentence_embeddings.iloc[0] == features[0]).sum()

768

In [29]:
tmp_df.head(5)

Unnamed: 0,sentence_embeddings,label
8000,"[-0.08560183644294739, -0.03108759969472885, -...",1
8001,"[-0.16037434339523315, -0.1525372415781021, -0...",1
8002,"[-0.2068014144897461, -0.187772735953331, -0.1...",1
8003,"[-0.41091272234916687, -0.18082129955291748, -...",1
8004,"[-0.22654487192630768, -0.18598417937755585, -...",1


In [37]:
tmp_df.label.value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [38]:
batch_1.label.value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [39]:
import dill as pickle

In [None]:
import os

# Make sure the output directory for the pickled features exists.
# exist_ok=True makes this safe to re-run and avoids the check-then-create gap.
os.makedirs('features/', exist_ok=True)

In [40]:
# Persist the embeddings/labels frame. The with-block guarantees the file is
# flushed and closed even if pickling raises.
with open('features/reuters_sentence_embeddings_batch3', 'wb') as out_file:
    pickle.dump(tmp_df, out_file)

In [34]:
12

12

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features

In [None]:
len(features)

In [None]:
# Persist the raw feature array. The with-block guarantees the file is flushed
# and closed even if pickling raises.
# NOTE(review): the cells below call `.head()` and `.sentence_embeddings` on
# the object loaded from this path, which a bare array does not provide —
# confirm whether `tmp_df` was meant to be saved here instead.
with open('features/reuters_sentence_embeddings_batch1', 'wb') as out_file:
    pickle.dump(features, out_file)

In [None]:
# Read the saved batch back. Unpickling can execute code embedded in the file,
# so only load files produced by this project.
with open('features/reuters_sentence_embeddings_batch1', 'rb') as in_file:
    ar = pickle.load(in_file)
len(ar)

In [None]:
ar.head()

In [None]:
(features[0] == ar.sentence_embeddings[0]).sum()

In [None]:
set([len(i) for i in ar])

In [None]:
# Read the saved essay batch. Unpickling can execute code embedded in the
# file, so only load files produced by this project.
with open('features/essay_sentence_embeddings_batch2', 'rb') as in_file:
    ar1 = pickle.load(in_file)

In [None]:
ar.shape, ar1.shape

In [None]:
pd.concat([ar, ar1]).label.value_counts()