In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score
from tabulate import tabulate


categories = ['rec.autos', 'comp.graphics', 'sci.space']
newsgroup = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, remove=('headers', 'footers', 'quotes'))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def train(x_train, x_test, y_train, y_test, clf):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    return f1_score(y_test, y_pred, average='weighted')

In [13]:
bertMap = ["base-uncased", "large-uncased", "roberta"]
clfMap = ["RndmForset", "GBM", "ADA"]
table = [[" "] + clfMap]

max_length = 512
for i in range(len(bertMap)):
    if i == 0:
        model_name = 'bert-base-uncased'
        tokenizer = BertTokenizer.from_pretrained(model_name, padding='max_length', truncation=True, max_length=max_length)
        model = BertModel.from_pretrained(model_name)
    elif i == 1:
        model_name = 'bert-large-uncased'
        tokenizer = BertTokenizer.from_pretrained(model_name, padding='max_length', truncation=True, max_length=max_length)
        model = BertModel.from_pretrained(model_name)
    elif i == 2:
        model_name = 'roberta-base'
        tokenizer = RobertaTokenizer.from_pretrained(model_name, padding='max_length', truncation=True, max_length=max_length)
        model = RobertaModel.from_pretrained(model_name)

    tokenized_texts = []
    for text in newsgroup.data:  # Assuming newsgroup.data contains text data
        tokens = tokenizer.encode(text, max_length=max_length, truncation=True)
        tokenized_texts.append(tokens)

    # Pad tokenized sequences to a fixed length with 0
    max_seq_len = max(len(tokens) for tokens in tokenized_texts)
    padded_input_ids = [tokens + [0]*(max_seq_len - len(tokens)) for tokens in tokenized_texts]

    input_ids = torch.tensor(padded_input_ids)
    attention_masks = torch.ones_like(input_ids)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)

    last_hidden_states = outputs.last_hidden_state
    embeddings = last_hidden_states[:, 0, :]

    dataVec = embeddings.numpy()
    X_train, X_test, y_train, y_test = train_test_split(dataVec, newsgroup.target, test_size=0.33)
    smallArr = [bertMap[i]]
    for j in range(len(clfMap)):
        clf = 0
        if j == 0:
            clf = RandomForestClassifier()
        if j == 1:
            clf = GradientBoostingClassifier(n_estimators=125)
        if j == 2:
            clf = AdaBoostClassifier(n_estimators=200, algorithm='SAMME')
        f1 = train(X_train, X_test, y_train, y_test, clf)
        smallArr.append(f1)
    table.append(smallArr)
print(tabulate(table))

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 37119590400 bytes.