In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer

In [2]:
torch.cuda.empty_cache() 

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
device

'cuda'

In [5]:
data = pd.read_csv('Cleaned_Dataset.csv')

In [6]:
data = data[['Clean_Review', 'Sentiment']]

In [7]:
data

Unnamed: 0,Clean_Review,Sentiment
0,ambience good food quite good saturday lunch ...,1
1,ambience good pleasant even service prompt foo...,1
2,must try great food great ambience thnx servic...,1
3,soumen das arun great guy behavior sincerety g...,1
4,food goodwe order kodi drumstick basket mutton...,1
...,...,...
9935,madhumathi mahajan well start nice courteous s...,0
9936,place never disappoint u food courteous staff ...,1
9937,bad rating mainly chicken bone find veg food a...,0
9938,personally love prefer chinese food couple tim...,1


In [9]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [10]:
tokenizer

RobertaTokenizer(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [12]:
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Clean_Review
        self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [13]:
from gensim.utils import simple_preprocess

In [None]:
# data['tokens'] = [simple_preprocess(line, deacc=True) for line in data['Clean_Review']]

In [15]:
train_data = data.sample(frac=0.8,random_state=200)
test_data = data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

In [16]:
print("TRAIN Dataset: {}".format(train_data.shape))

TRAIN Dataset: (7952, 2)


In [None]:
data

Unnamed: 0,Clean_Review,Sentiment
0,ambience good food quite good saturday lunch ...,1
1,ambience good pleasant even service prompt foo...,1
2,must try great food great ambience thnx servic...,1
3,soumen das arun great guy behavior sincerety g...,1
4,food goodwe order kodi drumstick basket mutton...,1
...,...,...
9935,madhumathi mahajan well start nice courteous s...,0
9936,place never disappoint u food courteous staff ...,1
9937,bad rating mainly chicken bone find veg food a...,0
9938,personally love prefer chinese food couple tim...,1


In [17]:
training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

In [18]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [19]:
class Roberta(torch.nn.Module):
    def __init__(self):
        super(Roberta, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [20]:
model = Roberta()
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Roberta(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

In [22]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [24]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += (big_idx==targets).sum().item()

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%100==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 100 steps: {loss_step}")
            print(f"Training Accuracy per 100 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [25]:
EPOCHS = 5
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Loss per 100 steps: 1.5774364471435547
Training Accuracy per 100 steps: 37.5


101it [00:29,  3.72it/s]

Training Loss per 100 steps: 0.8575235212793445
Training Accuracy per 100 steps: 66.46039603960396


201it [00:55,  3.78it/s]

Training Loss per 100 steps: 0.6496768714257734
Training Accuracy per 100 steps: 75.06218905472637


301it [01:22,  3.75it/s]

Training Loss per 100 steps: 0.5634116776708352
Training Accuracy per 100 steps: 77.74086378737542


401it [01:49,  3.75it/s]

Training Loss per 100 steps: 0.5151215187109022
Training Accuracy per 100 steps: 79.51995012468828


501it [02:16,  3.57it/s]

Training Loss per 100 steps: 0.4881522757728062
Training Accuracy per 100 steps: 80.56387225548902


601it [02:44,  3.67it/s]

Training Loss per 100 steps: 0.4615272559758629
Training Accuracy per 100 steps: 81.65557404326123


701it [03:11,  3.69it/s]

Training Loss per 100 steps: 0.4414304772899426
Training Accuracy per 100 steps: 82.68544935805991


801it [03:39,  3.65it/s]

Training Loss per 100 steps: 0.42611789947536105
Training Accuracy per 100 steps: 83.16167290886392


901it [04:06,  3.77it/s]

Training Loss per 100 steps: 0.40950146455991676
Training Accuracy per 100 steps: 83.87902330743619


994it [04:31,  3.66it/s]


The Total Accuracy for Epoch 0: 84.39386317907444
Training Loss Epoch: 0.3979679220609022
Training Accuracy Epoch: 84.39386317907444


0it [00:00, ?it/s]

Training Loss per 100 steps: 0.3846559524536133
Training Accuracy per 100 steps: 75.0


101it [00:27,  3.71it/s]

Training Loss per 100 steps: 0.2381000254893362
Training Accuracy per 100 steps: 91.2128712871287


201it [00:54,  3.75it/s]

Training Loss per 100 steps: 0.25000391478895845
Training Accuracy per 100 steps: 90.48507462686567


301it [01:21,  3.57it/s]

Training Loss per 100 steps: 0.25189388427648235
Training Accuracy per 100 steps: 90.36544850498339


401it [01:49,  3.76it/s]

Training Loss per 100 steps: 0.26513556513164255
Training Accuracy per 100 steps: 89.77556109725685


501it [02:16,  3.76it/s]

Training Loss per 100 steps: 0.2641678085130548
Training Accuracy per 100 steps: 89.77045908183632


601it [02:43,  3.75it/s]

Training Loss per 100 steps: 0.2630846367625151
Training Accuracy per 100 steps: 89.82945091514144


701it [03:10,  3.75it/s]

Training Loss per 100 steps: 0.26415308254283354
Training Accuracy per 100 steps: 89.78245363766048


801it [03:37,  3.71it/s]

Training Loss per 100 steps: 0.26595192030304754
Training Accuracy per 100 steps: 89.57553058676655


901it [04:04,  3.73it/s]

Training Loss per 100 steps: 0.2614414193962063
Training Accuracy per 100 steps: 89.67813540510544


994it [04:29,  3.69it/s]


The Total Accuracy for Epoch 1: 89.54979879275653
Training Loss Epoch: 0.26270513609145396
Training Accuracy Epoch: 89.54979879275653


1it [00:00,  4.05it/s]

Training Loss per 100 steps: 0.13312318921089172
Training Accuracy per 100 steps: 100.0


101it [00:26,  3.76it/s]

Training Loss per 100 steps: 0.22541144341783653
Training Accuracy per 100 steps: 91.58415841584159


201it [00:53,  3.70it/s]

Training Loss per 100 steps: 0.20380686921303842
Training Accuracy per 100 steps: 92.03980099502488


301it [01:20,  3.71it/s]

Training Loss per 100 steps: 0.21703503227040816
Training Accuracy per 100 steps: 91.69435215946844


401it [01:47,  3.66it/s]

Training Loss per 100 steps: 0.2140179878179718
Training Accuracy per 100 steps: 92.08229426433915


501it [02:14,  3.72it/s]

Training Loss per 100 steps: 0.217386516008132
Training Accuracy per 100 steps: 91.89121756487026


601it [02:40,  3.75it/s]

Training Loss per 100 steps: 0.2149155448669701
Training Accuracy per 100 steps: 91.97171381031615


701it [03:07,  3.76it/s]

Training Loss per 100 steps: 0.21397727461911728
Training Accuracy per 100 steps: 91.92225392296719


801it [03:34,  3.73it/s]

Training Loss per 100 steps: 0.2160810652455215
Training Accuracy per 100 steps: 91.71348314606742


901it [04:01,  3.75it/s]

Training Loss per 100 steps: 0.21345038790065593
Training Accuracy per 100 steps: 91.70366259711432


994it [04:25,  3.74it/s]


The Total Accuracy for Epoch 2: 91.75050301810865
Training Loss Epoch: 0.2131174671551741
Training Accuracy Epoch: 91.75050301810865


1it [00:00,  3.68it/s]

Training Loss per 100 steps: 0.44928377866744995
Training Accuracy per 100 steps: 87.5


101it [00:26,  3.75it/s]

Training Loss per 100 steps: 0.19210820503090278
Training Accuracy per 100 steps: 92.57425742574257


201it [00:53,  3.75it/s]

Training Loss per 100 steps: 0.17987283540945223
Training Accuracy per 100 steps: 92.9726368159204


301it [01:20,  3.75it/s]

Training Loss per 100 steps: 0.1742670865005433
Training Accuracy per 100 steps: 93.23089700996678


401it [01:47,  3.71it/s]

Training Loss per 100 steps: 0.17458879341489805
Training Accuracy per 100 steps: 93.17331670822942


501it [02:13,  3.75it/s]

Training Loss per 100 steps: 0.1744034707364059
Training Accuracy per 100 steps: 93.18862275449102


601it [02:40,  3.73it/s]

Training Loss per 100 steps: 0.173076239075146
Training Accuracy per 100 steps: 93.2820299500832


701it [03:07,  3.66it/s]

Training Loss per 100 steps: 0.16914669185619105
Training Accuracy per 100 steps: 93.52710413694722


801it [03:34,  3.77it/s]

Training Loss per 100 steps: 0.16890111071268066
Training Accuracy per 100 steps: 93.60174781523096


901it [04:00,  3.75it/s]

Training Loss per 100 steps: 0.16993978653376
Training Accuracy per 100 steps: 93.5488346281909


994it [04:25,  3.74it/s]


The Total Accuracy for Epoch 3: 93.6996981891348
Training Loss Epoch: 0.16720774766503715
Training Accuracy Epoch: 93.6996981891348


1it [00:00,  3.82it/s]

Training Loss per 100 steps: 0.03857971355319023
Training Accuracy per 100 steps: 100.0


101it [00:26,  3.75it/s]

Training Loss per 100 steps: 0.10482247743183876
Training Accuracy per 100 steps: 96.16336633663366


201it [00:53,  3.77it/s]

Training Loss per 100 steps: 0.12489253081223783
Training Accuracy per 100 steps: 95.58457711442786


301it [01:20,  3.75it/s]

Training Loss per 100 steps: 0.14390793647709924
Training Accuracy per 100 steps: 94.72591362126246


401it [01:47,  3.75it/s]

Training Loss per 100 steps: 0.14333654280770525
Training Accuracy per 100 steps: 94.9501246882793


501it [02:13,  3.76it/s]

Training Loss per 100 steps: 0.14331463450926565
Training Accuracy per 100 steps: 94.83532934131736


601it [02:40,  3.75it/s]

Training Loss per 100 steps: 0.14127408694778065
Training Accuracy per 100 steps: 94.9251247920133


701it [03:07,  3.74it/s]

Training Loss per 100 steps: 0.14172709662535
Training Accuracy per 100 steps: 94.97146932952924


801it [03:34,  3.74it/s]

Training Loss per 100 steps: 0.1392734068981866
Training Accuracy per 100 steps: 95.0374531835206


901it [04:00,  3.73it/s]

Training Loss per 100 steps: 0.13716450386104917
Training Accuracy per 100 steps: 95.10266370699223


994it [04:25,  3.74it/s]

The Total Accuracy for Epoch 4: 95.07042253521126
Training Loss Epoch: 0.13789223515526045
Training Accuracy Epoch: 95.07042253521126





In [26]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += (big_idx==targets).sum().item()

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%100==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return

In [27]:
valid(model, testing_loader)

4it [00:00, 12.90it/s]

Validation Loss per 100 steps: 0.8506279587745667
Validation Accuracy per 100 steps: 75.0


106it [00:04, 27.08it/s]

Validation Loss per 100 steps: 0.28261003394767936
Validation Accuracy per 100 steps: 91.08910891089108


205it [00:07, 27.27it/s]

Validation Loss per 100 steps: 0.3381274522303034
Validation Accuracy per 100 steps: 88.93034825870647


304it [00:11, 27.21it/s]

Validation Loss per 100 steps: 0.319114068169549
Validation Accuracy per 100 steps: 89.53488372093024


406it [00:15, 27.19it/s]

Validation Loss per 100 steps: 0.33084481287958967
Validation Accuracy per 100 steps: 88.90274314214464


497it [00:18, 26.78it/s]

Validation Loss Epoch: 0.3584785634671456
Validation Accuracy Epoch: 88.38028169014085





In [28]:
output_model_file = 'roberta_sentiment.bin'
output_vocab_file = './'

torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

('./vocab.json', './merges.txt')

In [49]:
model.state_dict()

OrderedDict([('l1.embeddings.word_embeddings.weight',
              tensor([[ 0.1465, -0.0363,  0.0749,  ..., -0.0015,  0.0176, -0.0022],
                      [ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
                      [-0.0341, -0.0861, -0.0178,  ...,  0.1174, -0.0100, -0.0364],
                      ...,
                      [ 0.0304,  0.0504, -0.0307,  ...,  0.0377,  0.0096,  0.0084],
                      [ 0.0623, -0.0596,  0.0307,  ..., -0.0920,  0.1080, -0.0183],
                      [ 0.1259, -0.0145,  0.0332,  ...,  0.0121,  0.0342,  0.0168]],
                     device='cuda:0')),
             ('l1.embeddings.position_embeddings.weight',
              tensor([[-0.0115,  0.0204,  0.0197,  ...,  0.0050, -0.0274, -0.0439],
                      [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                      [ 0.0336, -0.0168, -0.0900,  ..., -0.0534,  0.0295,  0.0167],
                      ...,
                      [ 0.1191, -0.05

In [53]:
model_dir = "RoBERTa_weights"
model_path = f"{model_dir}/model.pth"
config_path = f"{model_dir}/config.json"
tokenizer_path = f"{model_dir}/tokenizer"

torch.save(model.state_dict(), model_path)
config = model.l1.config
config.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_path)

('RoBERTa_weights/tokenizer\\tokenizer_config.json',
 'RoBERTa_weights/tokenizer\\special_tokens_map.json',
 'RoBERTa_weights/tokenizer\\vocab.json',
 'RoBERTa_weights/tokenizer\\merges.txt',
 'RoBERTa_weights/tokenizer\\added_tokens.json')

In [50]:
model.l1.config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [55]:
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer

In [56]:
config = RobertaConfig.from_pretrained(model_dir)

In [58]:
model1 = Roberta()

model1.load_state_dict(torch.load(model_path))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [60]:
tokenizer1 = RobertaTokenizer.from_pretrained(tokenizer_path)

In [62]:
model1.eval()

Roberta(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

In [104]:
text = 'decent restaurant bad potato'

In [79]:
inputs = tokenizer(text, return_tensors="pt", truncation=True)


tokenizer3 = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)


NameError: name 'BertTokenizer' is not defined

In [92]:
tokenizer(text)

{'input_ids': [0, 424, 22736, 2391, 38233, 372, 544, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [44]:
outputs = model(**inputs)

TypeError: forward() missing 1 required positional argument: 'token_type_ids'

In [70]:
inputs

{'input_ids': tensor([[    0,   424, 22736,  2391, 38233,   372,   544,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [82]:
tokenizerr = RobertaTokenizer('RoBERTa_Weights/tokenizer/vocab.json', 'RoBERTa_Weights/tokenizer/merges.txt')

In [84]:
tokenizerr(text, return_tensors="pt", truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': tensor([[    0,   424, 22736,  2391, 38233,   372,   544,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [111]:
inputs = tokenizerr(text, return_tensors="pt", padding=True, truncation=True)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

with torch.no_grad():
    outputs = model1(input_ids, attention_mask, None)

In [112]:
torch.nn.functional.softmax(outputs, dim=-1)

tensor([[9.9653e-01, 3.3537e-03, 3.9902e-05, 3.8691e-05, 3.9284e-05]])

In [113]:
# Compute softmax probabilities
probs = torch.nn.functional.softmax(outputs, dim=-1)

# Get predicted sentiment label
predicted_label = torch.argmax(probs, dim=-1).item()

# Define sentiment labels
sentiment_labels = ["negative", "positive"]

# Map predicted label to sentiment
predicted_sentiment = sentiment_labels[predicted_label]

In [114]:
predicted_label

0

In [115]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import random

comet_ml is installed but `COMET_API_KEY` is not set.


In [125]:
# sentence_bow = vectorizer.fit_transform(data['Clean_Review'])
x_train, x_test, y_train, y_test = train_test_split(data['Clean_Review'], data['Sentiment'], test_size=0.2, random_state=42)

In [126]:
x_train

715     tried 5 high rating shawarma place hyderabad b...
809     good taste food nice ambiencestaff really frie...
3205    look north indian food must say liked alot amb...
7532    order special chicken biryani awsomeand good c...
6435                                        fast delivery
                              ...                        
5734                                      awesome service
5191                            reach delivery time thanq
5390        superb taste little delay get product deliver
860     amaze ipl offer600 4 pint  french friesnice am...
7270    absolutely delighted overall experience shop e...
Name: Clean_Review, Length: 7952, dtype: object

In [127]:
train_texts, eval_texts = list(x_train), list(x_test)

In [128]:
train_texts

['tried 5 high rating shawarma place hyderabad best  fill place simply amaze special rumali best',
 'good taste food nice ambiencestaff really friendlysandipmusic really good atmosphere energeticits good place njoy friend family',
 'look north indian food must say liked alot ambience good service good regular place go place would say know miss place try schezwan fry rice kadai paneer roti friend plan try buffet next time 45 side ❤😊',
 'order special chicken biryani awsomeand good crevice tasty good ambience 35 food 55 service 55 expense good',
 'fast delivery',
 'pathetic restuarant struggle even get table pay cover charge struggle mess everything order chilly paneer get chilly chicken mean restuarant ask manager atleast 3 4 time get every order take 45 minute get order crowd space dance people dance near table drink spell 3 4 time complain nobody even bother check last saw bill almost 3 4 item include never order horrible experience waste money time energy',
 'visited place saturday e

In [122]:
class CustomDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.texts[idx]  # Use the same text as input and target for self-supervised learning

In [131]:
def fine_tune_bart(train_texts, eval_texts):
    # Load BART tokenizer and model
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

    # Tokenize texts
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    eval_encodings = tokenizer(eval_texts, truncation=True, padding=True)

    # Create datasets
    train_dataset = CustomDataset(train_encodings)
    eval_dataset = CustomDataset(eval_encodings)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        logging_dir='./logs',
        logging_steps=100,
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained("./fine_tuned_bart")


In [132]:
fine_tune_bart(train_texts, eval_texts)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)), '(Request ID: a81cdc23-a7ef-4ce6-ad52-bcdd1e77c0a0)')