<a href="https://colab.research.google.com/github/smslca/Huggigfaces/blob/main/Text_Classification_using_HuggingFaces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers datasets umap-learn

In [None]:
from datasets import load_dataset

In [None]:
# Load the "emotion" dataset; the bare name on the last line displays the
# returned object so its splits and columns can be inspected.
emotions = load_dataset("emotion")
emotions

In [None]:
# Keep a handle on the training split for the exploration cells below.
train_ds = emotions['train']
train_ds

In [None]:
# Number of examples in the training split.
len(train_ds)

In [None]:
# Column names available on each training example.
train_ds.column_names

In [None]:
# Peek at the first training example.
train_ds[0]

In [None]:
# Show the feature schema (column types) of the training split.
print(train_ds.features)

In [None]:
import pandas as pd
# Switch the dataset's output format to pandas so that slicing a split yields a
# DataFrame; the format is reset again once the exploration is done.
emotions.set_format(type="pandas")

# Materialise the whole training split as a DataFrame and preview it.
df = emotions["train"][:]
df.head()

In [None]:
def label_int2str(row):
  """Translate an integer class id into its string name.

  The mapping comes from the `label` feature of the training split of the
  global `emotions` dataset.
  """
  label_feature = emotions['train'].features['label']
  return label_feature.int2str(row)

# Add a human-readable label column next to the integer class ids.
df['label_str'] = df['label'].apply(label_int2str)

In [None]:
# Preview the frame again, now including the `label_str` column.
df.head()

In [None]:
import matplotlib.pyplot as plt
# Bar chart of how many training examples each emotion has, least frequent first.
class_counts = df['label_str'].value_counts(ascending=True)
ax = class_counts.plot.bar()
ax.set_title('Frequency of classes')
ax.set_xlabel("emotion")
plt.show()

In [None]:
# Count space-separated tokens per text as a rough length measure.
tokens_per_text = df['text'].str.split(' ')
df['Words per tweet'] = tokens_per_text.apply(len)

# One box per emotion; outliers are hidden to keep the boxes readable.
df.boxplot(
    column="Words per tweet",
    by='label_str',
    grid=False,
    showfliers=False,
    color='black',
)
# Blank out the automatic super-title and x-axis label added by the grouped boxplot.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# Undo the earlier set_format("pandas") so later cells get the default output format.
emotions.reset_format()

In [None]:
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer and the models loaded further down.
model_check_point = 'distilbert-base-uncased'
# Load the tokenizer that matches this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_check_point)

In [None]:
# Tokenize a sample sentence and print the resulting encoding.
text = "Tokenizing text is a core task of NLP."
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Map the ids back to token strings to see how the sentence was split.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
# Join the tokens back into a single string.
tokenizer.convert_tokens_to_string(tokens)

In [None]:
# Round-trip check: convert the token strings back to vocabulary ids.
tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Print, one per line: vocabulary size, maximum sequence length, and the
# names of the inputs the model expects.
for attribute in ('vocab_size', 'model_max_length', 'model_input_names'):
  print(getattr(tokenizer, attribute))

In [None]:
def tokenize(batch):
  """Tokenize the `text` entries of a batch with the global `tokenizer`.

  Padding and truncation are both enabled, so every sequence returned for
  one batch has the same length.
  """
  texts = batch['text']
  return tokenizer(texts, truncation=True, padding=True)

In [None]:
# Sanity-check `tokenize` on the first two training examples.
tokenize(emotions['train'][:2])

In [None]:
# Tokenize every split. batch_size=None hands each split to `tokenize` as one
# batch, so padding is applied consistently across the whole split.
emotions_encoded = emotions.map(tokenize,batched=True,batch_size=None)

In [None]:
# Inspect one encoded training example.
emotions_encoded['train'][0]

In [None]:
# List the columns present after tokenization.
print(emotions_encoded['train'].column_names)

In [None]:
from transformers import AutoModel
import torch
model_check_point = 'distilbert-base-uncased'

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# Load the pretrained base model (no task head) and place it on that device.
model = AutoModel.from_pretrained(model_check_point)
model = model.to(device)

In [None]:

# Encode a short sentence as PyTorch tensors and report the shape of its ids.
text = "this is a test"
inputs = tokenizer(text,return_tensors='pt')
print(f"Input tensors shape: {inputs['input_ids'].size()}")

In [None]:
# Display the full encoding returned by the tokenizer.
inputs

In [None]:
# Move every input tensor to the same device as the model before the forward pass.
inputs = {k:v.to(device) for k,v in inputs.items()}
# Inference only: no_grad avoids tracking gradients for this call.
with torch.no_grad():
  outputs = model(**inputs)
print(outputs)

In [None]:
# Shape of the final hidden states produced for the sample sentence.
outputs.last_hidden_state.size()

In [None]:
def extract_hidden_states(batch):
  """Run the global `model` on a batch and return one vector per example.

  Only the batch entries named in `tokenizer.model_input_names` are sent to
  the model. The returned dict has a single key, "hidden_state", holding the
  last hidden state at position 0 of each sequence as a NumPy array
  (presumably the [CLS] position for this tokenizer — TODO confirm).
  """
  model_inputs = {}
  for name, tensor in batch.items():
    if name in tokenizer.model_input_names:
      model_inputs[name] = tensor.to(device)

  # Pure inference: gradients are not needed.
  with torch.no_grad():
    model_output = model(**model_inputs)

  first_position = model_output.last_hidden_state[:, 0]
  # Back to CPU/NumPy so the result can be stored in the dataset.
  return {"hidden_state": first_position.cpu().numpy()}

In [None]:
# Return these columns as torch tensors so extract_hidden_states can call .to(device) on them.
emotions_encoded.set_format("torch",columns=['input_ids','attention_mask','label'])

In [None]:
# Add a `hidden_state` column to every split, processing the data batch by batch.
emotions_hidden = emotions_encoded.map(extract_hidden_states,batched=True)

In [None]:
# Display the dataset to confirm the new column is present.
emotions_hidden

In [None]:
import numpy as np
# For each split, pull the extracted hidden states (features) and the labels
# (targets) out of the dataset as NumPy arrays.
X_train, y_train = (np.array(emotions_hidden['train'][column])
                    for column in ('hidden_state', 'label'))
X_valid, y_valid = (np.array(emotions_hidden['validation'][column])
                    for column in ('hidden_state', 'label'))
X_test, y_test = (np.array(emotions_hidden['test'][column])
                  for column in ('hidden_state', 'label'))

# Show the feature-matrix shapes of the three splits.
X_train.shape,X_valid.shape,X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the frozen hidden-state features.
# max_iter is raised to 3000, presumably to give the solver room to converge.
lr_clf = LogisticRegression(max_iter=3000).fit(X_train, y_train)

# Mean accuracy on the validation split.
lr_clf.score(X_valid, y_valid)

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline for comparison: always predict the most frequent training class.
dummy_clf = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

# Mean accuracy of the baseline on the validation split.
dummy_clf.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
def plot_confusion_matrix(y_preds,y_true,labels):
  """Plot a confusion matrix normalised over the true labels.

  Args:
    y_preds: predicted class ids.
    y_true: ground-truth class ids.
    labels: display names used for the matrix ticks.
  """
  normalized_cm = confusion_matrix(y_true, y_preds, normalize='true')
  figure, axes = plt.subplots(figsize=(6, 6))
  matrix_display = ConfusionMatrixDisplay(
      confusion_matrix=normalized_cm,
      display_labels=labels,
  )
  # Draw into the axes created above; two-decimal cell values, no colour bar.
  matrix_display.plot(ax=axes, cmap="Blues", values_format=".2f", colorbar=False)
  axes.set_title('Normalized Confusion Matrix')
  plt.show()



In [None]:
# Class names of the `label` feature, used as tick/bar labels in the plots below.
labels =  emotions['train'].features['label'].names
labels

In [None]:
# Confusion matrix of the logistic-regression classifier on the validation split.
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds,y_valid,labels)

In [None]:
from transformers import AutoModelForSequenceClassification
import torch
# Derive the number of output classes from the dataset's ClassLabel feature
# instead of hard-coding it, so the classification head always matches the labels.
num_labels = emotions['train'].features['label'].num_classes
model_check_point = 'distilbert-base-uncased'
# NOTE: this rebinds the global `model` (previously the headless feature
# extractor) to a model with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_check_point,num_labels=num_labels).to(device)


In [None]:
from sklearn.metrics import f1_score,accuracy_score
def compute_metrics(pred):
  """Compute accuracy and weighted F1 from a prediction object.

  `pred` must expose `label_ids` (true class ids) and `predictions`
  (per-class scores); the predicted class is the argmax over the last axis.
  Returns a dict with the keys "accuracy" and "f1".
  """
  true_ids = pred.label_ids
  predicted_ids = pred.predictions.argmax(-1)
  return {
      "accuracy": accuracy_score(true_ids, predicted_ids),
      "f1": f1_score(true_ids, predicted_ids, average="weighted"),
  }

In [None]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub; needed because training below is
# configured with push_to_hub=True.
notebook_login()

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
import os
# Disable Weights & Biases logging. Plain ASCII quotes are required here:
# typographic (curly) quotes are not valid Python string delimiters.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# The string "none" is the value that turns off all logging integrations;
# passing None instead falls back to the library default on transformers 4.x.
args = TrainingArguments(report_to="none")

In [None]:
import inspect

batch_size = 64
# Roughly one logging event per epoch (training examples // batch size).
logging_steps = len(emotions_encoded['train'])//batch_size
# Doubles as the local output directory and the name of the Hub repository.
model_name = f'{model_check_point}-praveen-emotions-finetuned'

# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Since the install cell
# upgrades to the latest version, pick whichever name this install accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = ('eval_strategy' if 'eval_strategy' in _training_arg_names
                      else 'evaluation_strategy')

training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs = 2,
                                  learning_rate = 2e-5,
                                  per_device_train_batch_size = batch_size,
                                  per_device_eval_batch_size = batch_size,
                                  weight_decay = 0.01,
                                  disable_tqdm = False,
                                  logging_steps = logging_steps,
                                  push_to_hub = True,
                                  log_level='error',
                                  **{_eval_strategy_key: 'epoch'}
                                  )

In [None]:
import inspect

# Newer transformers releases take the tokenizer via `processing_class` and
# deprecate the `tokenizer` keyword; use whichever name this install accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = ('processing_class' if 'processing_class' in _trainer_arg_names
                  else 'tokenizer')

trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded['train'],
                  eval_dataset=emotions_encoded['validation'],
                  **{_tokenizer_key: tokenizer})
# Fine-tune; the returned training summary is displayed as the cell output.
trainer.train()

In [None]:
# Run the fine-tuned model over the validation split.
preds_output = trainer.predict(emotions_encoded['validation'])

In [None]:
# Metrics reported for the validation predictions.
preds_output.metrics

In [None]:
# Predicted class = index of the highest score per example.
# NOTE: this rebinds `y_preds`, which earlier held the logistic-regression predictions.
y_preds = np.argmax(preds_output.predictions,axis=1)

In [None]:
# Confusion matrix of the fine-tuned model; `y_valid` comes from the feature-extraction cells.
plot_confusion_matrix(y_preds,y_valid,labels)

In [None]:
from torch.nn.functional import cross_entropy

In [None]:
def forward_pass_with_label(batch):
  """Return the per-example loss and predicted class for a batch.

  Batch entries named in `tokenizer.model_input_names` are sent to the global
  `model`; `batch['label']` supplies the targets. The loss is cross-entropy
  with reduction='none', i.e. one value per example. Both outputs are
  returned as NumPy arrays under the keys "loss" and "predicted_label".
  """
  model_inputs = {name: tensor.to(device)
                  for name, tensor in batch.items()
                  if name in tokenizer.model_input_names}
  targets = batch['label'].to(device)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    logits = model(**model_inputs).logits
    predicted = torch.argmax(logits, axis=-1)
    per_example_loss = cross_entropy(logits, targets, reduction='none')

  return {"loss": per_example_loss.cpu().numpy(),
          "predicted_label": predicted.cpu().numpy()}

In [None]:
# Return tensors again so forward_pass_with_label can move the columns to the device.
emotions_encoded.set_format("torch",columns=['input_ids','attention_mask','label'])
# Attach `loss` and `predicted_label` columns to the validation split, 16 examples per batch.
emotions_encoded['validation'] = emotions_encoded['validation'].map(forward_pass_with_label,batched=True,batch_size=16)


In [None]:
# Switch to pandas output so slicing the split yields a DataFrame.
emotions_encoded.set_format('pandas')
cols = ['text','label','predicted_label','loss']
# NOTE: despite its name, `df_test` holds the *validation* split.
# The label columns are converted with .assign, which returns a new frame, so
# no column is written into a column-subset view (avoids SettingWithCopyWarning).
df_test = emotions_encoded['validation'][:][cols].assign(
    label=lambda frame: frame['label'].apply(label_int2str),
    predicted_label=lambda frame: frame['predicted_label'].apply(label_int2str),
)


In [None]:
# The ten examples with the highest loss, i.e. the ones the model handles worst.
df_test.sort_values("loss",ascending=False).head(10)

In [None]:
# Upload the trained model to the Hugging Face Hub with this commit message.
trainer.push_to_hub(commit_message="Training Completed!!!")

In [None]:
from transformers import pipeline

In [None]:
# Hub id of the published fine-tuned model (hard-coded to one account).
model_id = 'smslca/distilbert-base-uncased-praveen-emotions-finetuned'
classifier = pipeline('text-classification',model=model_id)
tweet = "I saw a horrible movie today"
# Ask for the score of every class, not only the top one.
# NOTE(review): `return_all_scores` appears to be deprecated in favour of `top_k=None`
# in recent transformers releases, which may change the nesting of the result — confirm before changing.
preds = classifier(tweet,return_all_scores=True)


In [None]:
import pandas as pd
# One row per class with its score for the single input tweet.
preds_df = pd.DataFrame(preds[0])
# Bars are labelled with the dataset's class names; this assumes the pipeline
# returns scores in label-id order — TODO confirm.
plt.bar(labels,100*preds_df['score'])
# Interpolate the tweet text into the title; a placeholder-less f-string
# would display the literal word "tweet" instead.
plt.title(f'"{tweet}"')
plt.ylabel('class probability %')
plt.show()