In [1]:
import pandas as pd
import numpy as np
import torch

In [3]:
import os
# Enumerate GPUs in PCI bus order and restrict this process to the GPU with index 1.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Load the labelled sentences (path is relative to the notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer='../../data/100_sentiment_analysis_sentences.csv',
)

In [5]:
# Inspect column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    100 non-null    object
 1   label   100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [6]:
# Show the distinct label values present in the data.
df["label"].unique()

array(['POSITIVE', 'NEGATIVE', 'NEUTRAL'], dtype=object)

In [7]:
# Encode the string labels as integer class ids: NEGATIVE -> 0, NEUTRAL -> 1, POSITIVE -> 2.
# The result is assigned back to the column instead of calling `replace(..., inplace=True)`
# on `df['label']`: that chained in-place form mutates a temporary selection and does not
# update `df` under pandas Copy-on-Write.
# The explicit cast keeps the column int64 even when `replace` leaves the dtype as object.
# Re-running this cell is safe: integer labels match none of the mapping keys.
df['label'] = (
    df['label']
    .replace({'POSITIVE': 2, 'NEGATIVE': 0, 'NEUTRAL': 1})
    .astype('int64')
)

In [8]:
# Re-inspect the frame to confirm the dtype of `label` after the replacement above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    100 non-null    object
 1   label   100 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [9]:
from transformers import (
    AutoTokenizer,
    DistilBertModel,
    DistilBertTokenizer,
)

# Pretrained DistilBERT encoder; its hidden states are used as sentence features below.
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
# AutoTokenizer.from_pretrained takes the model name and builds the matching tokenizer.
# (DistilBertTokenizer.from_pretrained('distilbert-base-uncased') is the explicit alternative.)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Tokenization returns:
#   - input_ids: a list of integers looked up in the pretrained vocabulary
#   - attention_mask: used when batching to mark which tokens to attend to
#     and which padding tokens to ignore
tokenized_text = tokenizer(
    df["text"].tolist(),
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors='pt',
).to(device)

In [11]:
# List the fields produced by the tokenizer call.
tokenized_text.keys()

dict_keys(['input_ids', 'attention_mask'])

In [14]:
# The tokenized batch was moved to `device`, so the encoder must live on the same device;
# otherwise the forward pass fails with a device-mismatch error whenever CUDA is available.
model = model.to(device)
# Inference only: make sure dropout is disabled and skip building the autograd graph,
# which would otherwise hold activations for the whole batch in memory.
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_text)
# Per-token hidden states of the final encoder layer.
last_hidden_states = outputs.last_hidden_state

In [15]:
# Dimensions of the encoder output tensor.
last_hidden_states.size()

torch.Size([100, 78, 768])

In [18]:
# We only need the embedding of the classification ([CLS]) token,
# which is the first (index 0) token of every sentence.
# `.cpu()` is required before `.numpy()` when the tensor lives on a CUDA device
# and is a no-op for tensors that are already on the CPU.
X = last_hidden_states[:, 0, :].detach().cpu().numpy()

In [19]:
# Dimensions of the feature matrix handed to the classifier.
np.shape(X)

(100, 768)

In [21]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Logistic regression on top of the [CLS] embeddings.
# scikit-learn operates on NumPy arrays, so the labels are taken straight from the
# DataFrame rather than being wrapped in a torch tensor.
y = df["label"].to_numpy()
# A fixed seed makes the split, and therefore the reported accuracy, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# `multi_class='multinomial'` is omitted: it is deprecated in scikit-learn >= 1.5, and the
# default already fits a multinomial model for multiclass targets with the default solver.
log_model = LogisticRegression(max_iter=1500)
log_model.fit(X_train, y_train)
preds = log_model.predict(X_test)
print(metrics.accuracy_score(y_test, preds))

0.48


In [22]:
import pickle
# Save the fitted classifier to disk.
# The context manager guarantees the file is flushed and closed even if pickling fails;
# passing a bare `open(...)` to `pickle.dump` leaves the handle open.
with open('../models/embedding_custom_ml.pkl', 'wb') as model_file:
    pickle.dump(log_model, model_file)

In [23]:
# Reload the classifier from disk; the context manager closes the file handle afterwards.
# NOTE: `pickle.load` can execute arbitrary code, so only load files from a trusted source.
with open('../models/embedding_custom_ml.pkl', 'rb') as model_file:
    saved_model = pickle.load(model_file)

In [26]:
# Inference code: embed a new sentence with DistilBERT and classify it with the saved model.
from transformers import DistilBertModel,AutoTokenizer
# The encoder must be on the same device as the tokenized inputs, otherwise the forward
# pass fails with a device-mismatch error whenever CUDA is available.
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

tokenized_text = tokenizer(["test_input"],padding=True,truncation=True,max_length=512,return_tensors='pt').to(device)
# No gradients are needed for feature extraction.
with torch.no_grad():
    outputs = model(**tokenized_text)

# Embedding of the first ([CLS]) token; `.cpu()` is required before `.numpy()` for CUDA tensors.
test_X = outputs.last_hidden_state[:,0,:].detach().cpu().numpy()

new_prediction = saved_model.predict(test_X)
new_prediction

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


array([1], dtype=int64)