In [1]:
import pandas as pd


In [2]:
import os
import random
import numpy as np
import torch

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"]="1"
device = 'cuda' if torch.cuda.is_available() else 'cpu'


torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [3]:
df = pd.read_csv('../../data/100_sentiment_analysis_sentences.csv')

In [4]:
# replacing values
df['label'].replace(['POSITIVE', 'NEGATIVE', 'NEUTRAL'],
                        [2, 0,1], inplace=True)

In [5]:
from transformers import DistilBertForSequenceClassification,AutoTokenizer
#The AutoTokenizer.from_pretrained method takes in the name of the model to build the appropriate tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
#tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

#We can  ask the model to return all hidden states and all attention weights if we need them:output_hidden_states=True, output_attentions=True
#But in this we don't need them
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=3)
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [6]:
#Tokenization: This will return input ids:list of numbers,these numberrs are fetched from pretrained vocab
tokenized_text =tokenizer(list(df["text"]),padding=True,truncation=True,max_length=768)

In [7]:
tokenized_text.keys()

dict_keys(['input_ids', 'attention_mask'])

In [8]:
input_ids = torch.tensor(tokenized_text['input_ids'])

In [9]:
#Dataset preparation
from torch.utils.data import Dataset, TensorDataset,DataLoader
from sklearn.model_selection import train_test_split


X = input_ids
y = torch.tensor(np.array(df.label.values))

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)

train_data = TensorDataset(X_train, y_train)

train_loader = DataLoader(train_data,batch_size=16, shuffle=True)

In [10]:
y_train.shape

torch.Size([80])

In [12]:
#Model training
NUM_EPOCHS = 1
LEARNING_RATE = 0.01
optimizer =torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE) 
loss_fn = torch.nn.CrossEntropyLoss()

for i in range(NUM_EPOCHS):
  model.train()
  for X_batch,y_batch in train_loader:
    output = model(X_batch)
    loss = loss_fn(output.logits,y_batch)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

In [13]:
model.save_pretrained("../models/embedding_custom_dl")

In [14]:
saved_model = DistilBertForSequenceClassification.from_pretrained("../models/embedding_custom_dl")

In [15]:
#Inference Code
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

tokenized_text =tokenizer(["test_input"],padding=True,truncation=True,max_length=512,return_tensors='pt').to(device)
input_ids = torch.tensor(tokenized_text['input_ids']).to(device)


with torch.no_grad():
    outputs = saved_model(input_ids)

print(outputs)
print(outputs.logits)
predicted_class_id = outputs.logits.argmax().item()
print(predicted_class_id)

SequenceClassifierOutput(loss=None, logits=tensor([[-6.7669, -3.6255, 23.4835]]), hidden_states=None, attentions=None)
tensor([[-6.7669, -3.6255, 23.4835]])
2


  input_ids = torch.tensor(tokenized_text['input_ids']).to(device)
