In [None]:
!pip install transformers sentence-transformers

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_reuter_data(verbose=False, words_count=500, base_path=""):
    """Load the Reuters articles of every source into parallel lists.

    Walks ``<base_path>reuter/data/<source>/<split>/<author>/`` for the
    sources ``gpt``, ``human`` and ``claude`` and the splits ``test`` and
    ``train`` (in that order) and reads every regular file found in each
    author directory, except entries named ``logprobs`` or ``headlines``.

    Args:
        verbose: When true, print the name of every file as it is read.
        words_count: Keep only the first ``words_count`` whitespace-separated
            tokens of each article; they are re-joined with single spaces.
        base_path: Optional prefix placed in front of ``reuter/data/`` (for
            example a mounted-drive folder ending in a path separator). It is
            stripped again from the returned names. The default, an empty
            string, resolves ``reuter/data/`` against the working directory.

    Returns:
        A dict with three equally long lists:
            ``names``  - path of each article, without ``base_path``;
            ``docs``   - truncated article text;
            ``labels`` - 1 for ``human`` articles, 0 for ``gpt``/``claude``.

    Raises:
        OSError: If one of the expected ``<source>/<split>`` directories is
            missing or a file cannot be read.
    """
    reuter_data_path = base_path + "reuter/data/"
    sources = ["gpt", "human", "claude"]
    skipped_entries = {"logprobs", "headlines"}
    splits = ["test", "train"]
    docs, names, labels = [], [], []

    for source in sources:
        # Binary target: human-written vs. machine-generated.
        label = 1 if source == "human" else 0
        for split in splits:
            authors_path = os.path.join(reuter_data_path, source, split)
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # row order (and therefore any positional slicing) reproducible.
            for author in sorted(os.listdir(authors_path)):
                articles_path = os.path.join(authors_path, author)
                if not os.path.isdir(articles_path):
                    # Stray files next to the author folders are not articles.
                    continue
                for article in sorted(os.listdir(articles_path)):
                    if article in skipped_entries:
                        continue

                    filepath = os.path.join(articles_path, article)
                    if not os.path.isfile(filepath):
                        # Any other sub-directory cannot be opened as text.
                        continue

                    with open(filepath, encoding="utf8") as f:
                        doc = " ".join(f.read().split()[:words_count])

                    # Every path starts with base_path, so drop exactly that prefix.
                    name = filepath[len(base_path):] if base_path else filepath

                    if verbose:
                        print(name)

                    names.append(name)
                    docs.append(doc)
                    labels.append(label)

    return {
        "names": names,
        "docs": docs,
        "labels": labels,
    }


In [3]:
import pandas as pd

# Read the corpus once, then line the three parallel lists up as columns.
res = get_reuter_data()

df = pd.DataFrame(
    dict(txt=res["docs"], filename=res["names"], label=res["labels"])
)
df.shape

(15000, 3)

In [4]:
# Preview the first five loaded articles.
df.head(5)

Unnamed: 0,txt,filename,label
0,A new rule introduced by the U.S. Securities a...,reuter/data/gpt\test\AaronPressman\0.txt,0
1,Washington D.C. - Several members of Congress ...,reuter/data/gpt\test\AaronPressman\1.txt,0
2,Privacy advocates are warning that the Clinton...,reuter/data/gpt\test\AaronPressman\10.txt,0
3,A recent survey conducted by Bankrate has foun...,reuter/data/gpt\test\AaronPressman\11.txt,0
4,A recent report from banking watchdog consumer...,reuter/data/gpt\test\AaronPressman\12.txt,0


In [5]:
# Class balance: label 1 is human-written, label 0 covers both generated sources.
df["label"].value_counts()

0    10000
1     5000
Name: label, dtype: int64

In [6]:
# Backbone selection (DistilBERT): model class, tokenizer class and checkpoint name.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## To use BERT instead of DistilBERT, uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the tokenizer and the model from the chosen pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# First 4000 rows, in load order.
# NOTE(review): rows are not shuffled, and the label counts recorded at the end
# of this notebook show only label 0 for this batch — confirm that is intended.
batch_1 = df.iloc[:4000]

In [8]:
#df['txt'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [9]:
# Encode every article to token ids, including the tokenizer's special tokens.
# `truncation=True` states explicitly what `max_length=512` previously triggered
# only implicitly (with a warning): sequences are cut to at most 512 ids.
tokenized = batch_1['txt'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
# Sanity check: one encoded sequence per row of the batch.
tokenized.shape, len(tokenized)

((4000,), 4000)

In [11]:
# Right-pad every id list with zeros up to the longest sequence in the batch so
# the result is a rectangular array.
# NOTE(review): assumes 0 is the tokenizer's padding id — confirm against
# `tokenizer.pad_token_id` if the checkpoint is changed.
max_len = max(map(len, tokenized.values))
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [12]:
# `padded` is already an ndarray, so its shape can be read directly.
padded.shape

(4000, 512)

In [13]:
# Show the padded id matrix (NumPy abbreviates the repr); trailing zeros are padding.
padded

array([[  101,  1037,  2047, ...,  1011,  5198,   102],
       [  101,  2899,  1040, ...,     0,     0,     0],
       [  101,  9394, 13010, ..., 14037,  2007,   102],
       ...,
       [  101,  5126,  1997, ...,  1010,  2004,   102],
       [  101,  2605, 18126, ...,     0,     0,     0],
       [  101,  4830,  7974, ...,  1996,  2142,   102]])

In [14]:
# 1 where the id is non-zero (a real token), 0 where it is zero (padding).
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(4000, 512)

In [15]:
# Peek at the first five mask rows: 1 marks a non-zero token id, 0 marks zero padding.
attention_mask[:5]

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]])

In [16]:
from datetime import datetime


# Wall-clock timing of the forward pass; the last expression displays the elapsed timedelta.
start = datetime.now()
# Convert the padded ids and their mask to tensors for the model.
input_ids = torch.tensor(padded)
attention_masks = torch.tensor(attention_mask)

# Inference only: no_grad() disables gradient tracking for the forward pass.
# The whole batch is passed to the model in a single call; the recorded run of
# this cell took about 1779 seconds.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)
end = datetime.now()
end - start

datetime.timedelta(seconds=1779, microseconds=560639)

In [None]:
# NOTE(review): scratch arithmetic — presumably converting a 1590-second run to
# minutes. The timing recorded for the forward pass in this notebook is 1779 s,
# so this figure looks stale.
1590/60

In [17]:
# Take the first element of the model output and keep, for every sequence, the
# vector at position 0 (the leading special token added by the tokenizer),
# converted to a NumPy array: one embedding row per article.
features = last_hidden_states[0][:,0,:].numpy()

In [18]:
# Show the extracted embedding matrix (NumPy abbreviates the repr).
features

array([[-0.3023219 , -0.16762316, -0.3572529 , ...,  0.04004054,
         0.25119025,  0.53884244],
       [-0.37213305, -0.32755706, -0.63955176, ...,  0.0159316 ,
         0.16204335,  0.4484262 ],
       [-0.33801702, -0.11618967, -0.55534786, ..., -0.10749807,
         0.08649145,  0.54120106],
       ...,
       [-0.09950317, -0.11830468,  0.01537717, ..., -0.02761077,
         0.52238315,  0.4566367 ],
       [ 0.04583343, -0.08752509, -0.116282  , ..., -0.08031911,
         0.594171  ,  0.2442167 ],
       [-0.1463314 ,  0.04806804, -0.10895059, ...,  0.02753123,
         0.5267255 ,  0.23940928]], dtype=float32)

In [19]:
# Number of embedding rows; should equal the batch size.
len(features)

4000

In [41]:
# Number of labels in the batch; must match the number of embedding rows.
len(batch_1.label)

4000

In [56]:
# Pair each embedding (stored as a plain Python list per row) with its label.
tmp_df = pd.DataFrame(
    dict(sentence_embeddings=features.tolist(), label=batch_1['label'])
)

In [49]:
# Embedding of the first article, straight from the NumPy array.
features[0]

array([-3.02321911e-01, -1.67623162e-01, -3.57252896e-01, -7.64605552e-02,
        1.86211422e-01, -3.42676133e-01, -7.59676471e-02, -1.51831403e-01,
       -1.07466139e-01, -1.02297135e-01, -6.02367856e-02,  6.79232851e-02,
       -1.88802317e-01,  2.32279450e-01, -2.77527303e-01,  6.11509159e-02,
        1.73177883e-01,  2.86922067e-01,  4.52198386e-01,  4.72785644e-02,
       -2.41215587e-01, -1.02100462e-01,  3.37319553e-01,  1.99585974e-01,
        1.28177240e-01, -1.50272265e-01, -1.18302390e-01, -2.29578644e-01,
       -5.19373789e-02,  2.50505120e-01,  5.28746605e-01,  4.71413493e-01,
       -4.56330836e-01, -3.37919116e-01,  3.66546392e-01,  6.94881380e-02,
        9.53440070e-02, -4.58454549e-01, -2.76674479e-01,  2.80852526e-01,
       -3.89307626e-02,  3.79111290e-01, -5.29053435e-02,  7.64538720e-02,
        2.35615671e-03,  1.59636140e-01, -3.40845084e+00, -1.17595434e-01,
       -9.66218710e-02, -2.57731408e-01,  2.78488755e-01, -1.36756524e-02,
       -1.96675181e-01,  

In [48]:
# The same first embedding as stored in the DataFrame (a Python list of floats).
tmp_df.sentence_embeddings[0]

[-0.3023219108581543,
 -0.16762316226959229,
 -0.35725289583206177,
 -0.07646055519580841,
 0.18621142208576202,
 -0.3426761329174042,
 -0.07596764713525772,
 -0.15183140337467194,
 -0.10746613889932632,
 -0.10229713469743729,
 -0.06023678556084633,
 0.06792328506708145,
 -0.1888023167848587,
 0.2322794497013092,
 -0.2775273025035858,
 0.061150915920734406,
 0.17317788302898407,
 0.28692206740379333,
 0.4521983861923218,
 0.04727856442332268,
 -0.24121558666229248,
 -0.10210046172142029,
 0.337319552898407,
 0.19958597421646118,
 0.1281772404909134,
 -0.15027226507663727,
 -0.11830238997936249,
 -0.2295786440372467,
 -0.05193737894296646,
 0.25050511956214905,
 0.5287466049194336,
 0.4714134931564331,
 -0.4563308358192444,
 -0.33791911602020264,
 0.3665463924407959,
 0.06948813796043396,
 0.09534400701522827,
 -0.45845454931259155,
 -0.2766744792461395,
 0.2808525264263153,
 -0.03893076255917549,
 0.37911128997802734,
 -0.05290534347295761,
 0.07645387202501297,
 0.0023561567068099976,

In [57]:
# Count the positions at which the list stored in the DataFrame equals the
# original array row; a count equal to the embedding width means `.tolist()`
# preserved every value.
(tmp_df.sentence_embeddings[0] == features[0]).sum()

768

In [59]:
# Preview the first five embedding/label rows.
tmp_df.head()

Unnamed: 0,sentence_embeddings,label
0,"[-0.3023219108581543, -0.16762316226959229, -0...",0
1,"[-0.3721330463886261, -0.3275570571422577, -0....",0
2,"[-0.3380170166492462, -0.11618966609239578, -0...",0
3,"[-0.25932276248931885, -0.1234612688422203, -0...",0
4,"[-0.37114599347114563, -0.14960415661334991, -...",0


In [60]:
import dill as pickle

In [61]:
import os

# Ensure the output directory for cached features exists. `exist_ok=True`
# makes re-running the cell a no-op and avoids the check-then-create race of
# testing `os.path.exists` first.
os.makedirs('features/', exist_ok=True)

In [62]:
# Persist the embeddings/labels DataFrame. The context manager closes (and
# therefore flushes) the file deterministically instead of leaving the handle
# to the garbage collector.
with open('features/reuters_sentence_embeddings_batch1', 'wb') as fh:
    pickle.dump(tmp_df, fh)

In [None]:
# NOTE(review): unexecuted repeat of the earlier feature-extraction cell —
# looks like a leftover; it recomputes the same position-0 embeddings.
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
# NOTE(review): unexecuted repeat of the earlier display of `features`.
features

In [None]:
# NOTE(review): unexecuted repeat of the earlier row-count check.
len(features)

In [None]:
# NOTE(review): this writes the raw `features` array to the same path that the
# `tmp_df` DataFrame is pickled to earlier in this notebook. Running the cells
# top to bottom therefore replaces the DataFrame with an ndarray, and the later
# `ar.head()` would fail — confirm which payload this file is meant to hold.
# The context manager closes the file deterministically.
with open('features/reuters_sentence_embeddings_batch1', 'wb') as fh:
    pickle.dump(features, fh)

In [63]:
# Reload the cached batch to verify the round trip. The context manager closes
# the file handle as soon as loading is done.
# Unpickling executes code from the file, so only load files you wrote yourself.
with open('features/reuters_sentence_embeddings_batch1', 'rb') as fh:
    ar = pickle.load(fh)
len(ar)

4000

In [64]:
# Confirm the reloaded object still has the embedding and label columns.
ar.head()

Unnamed: 0,sentence_embeddings,label
0,"[-0.3023219108581543, -0.16762316226959229, -0...",0
1,"[-0.3721330463886261, -0.3275570571422577, -0....",0
2,"[-0.3380170166492462, -0.11618966609239578, -0...",0
3,"[-0.25932276248931885, -0.1234612688422203, -0...",0
4,"[-0.37114599347114563, -0.14960415661334991, -...",0


In [67]:
# Count the positions at which the in-memory embedding equals the reloaded one;
# a count equal to the embedding width means the pickle round trip was lossless.
(features[0] == ar.sentence_embeddings[0]).sum()

768

In [None]:
# Distinct embedding lengths over all rows; a single value means every vector
# has the same width. Iterating the DataFrame itself yields its column labels,
# so the check has to go through the embedding column.
set(len(i) for i in ar.sentence_embeddings)

In [72]:
# Load a second cached batch; the context manager closes the file handle.
# NOTE(review): no cell visible in this notebook writes this file — it is
# presumably produced by a sibling notebook; confirm it exists before running.
# Unpickling executes code from the file, so only load files you trust.
with open('features/essay_sentence_embeddings_batch2', 'rb') as fh:
    ar1 = pickle.load(fh)

In [73]:
# Compare the shapes of the two cached batches (rows, columns).
ar.shape, ar1.shape

((4000, 2), (332, 2))

In [75]:
# Label distribution across both cached batches.
# NOTE(review): the recorded output contains only label 0, so these two batches
# alone cannot be used to train a two-class classifier.
pd.concat([ar, ar1])["label"].value_counts()

0    4332
Name: label, dtype: int64