<a href="https://colab.research.google.com/github/pythonuzgit/elmurodov/blob/master/Natural%20Language%20Processing%20with%20PyTorch/Sap_press_analysis_with_Hugging_Face_using_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import transformers
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
#from ignite.metrics import Accuracy, Precision, Recall, Fbeta

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from collections import defaultdict

import torch
import pandas as pd
from tqdm.notebook import trange, tqdm


In [None]:
# Load the SAP press headlines from the Colab working directory and preview the first rows.
df = pd.read_csv('/content/sap_press.csv')
df.head()

Unnamed: 0,headline,label
0,SAP Leads the Way to Industry Cloud,Story
1,Oxford Economics and SAP Survey Results Reveal...,Partnership
2,SAP Named a Leader Among Digital Experience Pl...,Award
3,SAP Recognized for the Sixth Consecutive Year ...,Award
4,OpenPeak and SAP Announce Plans to Maximize Mo...,Partnership


In [None]:
# Count missing values per column before any processing.
df.isna().sum()

headline    0
label       0
dtype: int64

In [None]:
# Class balance: number of headlines per category.
df['label'].value_counts()

Partnership      475
Award            337
Story            276
Financials       164
Solution          82
Merger/Invest     67
People            41
Name: label, dtype: int64

In [None]:
# Keep only the seven known categories. `.copy()` makes the result an independent
# frame, so the column assignments in later cells do not raise pandas'
# SettingWithCopyWarning.
df = df[df.label.isin(['Partnership', 'Award', 'Story', 'Financials', 'Solution', 'Merger/Invest',
                           'People'])].copy()

In [None]:
# Distinct category names, in order of first appearance in the frame.
possible_labels = df['label'].unique()

In [None]:
# Map each category name to an integer id (its position in `possible_labels`).
label_dict = {name: class_id for class_id, name in enumerate(possible_labels)}

In [None]:
# Show the category-name -> integer-id mapping.
label_dict

{'Award': 2,
 'Financials': 5,
 'Merger/Invest': 3,
 'Partnership': 1,
 'People': 4,
 'Solution': 6,
 'Story': 0}

In [None]:
# Replace the category names with their integer ids.
df['label'] = df['label'].map(label_dict)

In [None]:
# Preview the frame after label encoding.
df.head()

Unnamed: 0,headline,label
0,SAP Leads the Way to Industry Cloud,0
1,Oxford Economics and SAP Survey Results Reveal...,1
2,SAP Named a Leader Among Digital Experience Pl...,2
3,SAP Recognized for the Sixth Consecutive Year ...,2
4,OpenPeak and SAP Announce Plans to Maximize Mo...,1


In [None]:
def process_headline(headline, tokenizer=None):
  """Clean a single headline and return it as one space-separated string.

  Steps, in order: strip a leading "RT" marker, remove URLs, remove '#'
  characters, turn hyphens into spaces, remove HTML line-break tags, remove
  standalone numbers, tokenize, drop tokens made only of digits, and re-join
  the remaining tokens with single spaces.

  Input:
      headline: a string containing one headline
      tokenizer: optional object with a ``tokenize(str) -> list[str]`` method.
          When omitted, an NLTK ``TweetTokenizer`` (case preserved, handles
          stripped, ``reduce_len=True``) is created for the call.
  Output:
      the cleaned headline as a single string
  """
  # remove old style retweet text "RT"
  headline = re.sub(r'^RT[\s]+', '', headline)
  # remove hyperlinks: only the URL itself, so text after a link is kept
  headline = re.sub(r'https?://\S+', '', headline)
  # remove the hash sign but keep the word that follows it
  headline = re.sub(r'#', '', headline)
  # replace hyphens with spaces
  headline = re.sub('-', ' ', headline)
  # remove HTML line breaks (raw string avoids the invalid "\s" escape warning)
  headline = re.sub(r'<br\s?/>|<br>', "", headline)
  # remove standalone integers and decimals
  headline = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", '', headline)

  if tokenizer is None:
    tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True,
                               reduce_len=True)

  # tokenize and drop any token that still consists only of digits
  tokens = [token for token in tokenizer.tokenize(headline) if not token.isdigit()]

  return ' '.join(tokens)

In [None]:
# Clean every headline element-wise, then preview the result.
df['headline'] = df['headline'].map(process_headline)
df.head()

Unnamed: 0,headline,label
0,SAP Leads the Way to Industry Cloud,0
1,Oxford Economics and SAP Survey Results Reveal...,1
2,SAP Named a Leader Among Digital Experience Pl...,2
3,SAP Recognized for the Sixth Consecutive Year ...,2
4,OpenPeak and SAP Announce Plans to Maximize Mo...,1


Training and validation split

In [None]:
# Split the row index labels (not the text) 85/15, stratified by class so both
# parts keep the same label proportions; the split is fixed by random_state.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df['label'].values,
    test_size=0.15,
    random_state=42,
    stratify=df['label'].values,
)
print(X_train.shape)

(1225,)


In [None]:
# Placeholder split marker for every row; overwritten with 'train' / 'val' below.
df['data_type'] = 'not_set'

In [None]:
# Preview the frame with the new `data_type` column.
df.head()

Unnamed: 0,headline,label,data_type
0,SAP Leads the Way to Industry Cloud,0,not_set
1,Oxford Economics and SAP Survey Results Reveal...,1,not_set
2,SAP Named a Leader Among Digital Experience Pl...,2,not_set
3,SAP Recognized for the Sixth Consecutive Year ...,2,not_set
4,OpenPeak and SAP Announce Plans to Maximize Mo...,1,not_set


In [None]:
# Tag each row with the split it was assigned to by train_test_split.
for _split_name, _split_index in (('train', X_train), ('val', X_val)):
    df.loc[_split_index, 'data_type'] = _split_name

In [None]:
# Rows per (label, split) pair, to check that the stratified split kept every class in both parts.
df.groupby(['label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,headline
label,data_type,Unnamed: 2_level_1
0,train,234
0,val,42
1,train,404
1,val,71
2,train,286
2,val,51
3,train,57
3,val,10
4,train,35
4,val,6


Loading Tokenizer and Encoding 

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
# WordPiece tokenizer matching the uncased BERT checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the training headlines into padded PyTorch tensors
# (input ids + attention masks), truncating anything beyond 256 tokens.
encoded_data_train = tokenizer.batch_encode_plus(
    df.loc[df.data_type == 'train', 'headline'].values,
    truncation=True,
    max_length=256,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

In [None]:
# Tokenize the validation headlines with the same settings as the training set.
encoded_data_val = tokenizer.batch_encode_plus(
    df.loc[df.data_type == 'val', 'headline'].values,
    truncation=True,
    max_length=256,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

In [None]:
# Pull the model inputs out of the encoding and build the matching label tensor
# (same row filter, hence same order, as the encoded headlines).
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df.loc[df.data_type == 'train', 'label'].values)


In [None]:
# Same as above for the validation split.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df.loc[df.data_type == 'val', 'label'].values)

In [None]:
# Bundle (input ids, attention mask, label) per example; the training and
# evaluation code relies on this tensor order when indexing a batch.
dataset_train, dataset_val = (
    TensorDataset(input_ids_train, attention_masks_train, labels_train),
    TensorDataset(input_ids_val, attention_masks_val, labels_val),
)

In [None]:
# Number of training examples.
len(dataset_train)

1225

In [None]:
# Inspect the validation tensors: input ids, attention masks, labels (in that order).
dataset_val.tensors

(tensor([[  101, 20066,  2315,  ...,     0,     0,     0],
         [  101, 20066, 17472,  ...,     0,     0,     0],
         [  101,  9980,  1998,  ...,     0,     0,     0],
         ...,
         [  101, 20066,  9297,  ...,     0,     0,     0],
         [  101, 20066,  3640,  ...,     0,     0,     0],
         [  101, 20066, 24545,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([2, 5, 1, 1, 4, 5, 5, 3, 1, 1, 1, 0, 5, 1, 1, 0, 1, 0, 5, 1, 2, 1, 2, 1,
         1, 4, 2, 3, 1, 2, 3, 5, 5, 2, 5, 0, 2, 0, 6, 2, 2, 2, 1, 1, 2, 0, 1, 0,
         2, 1, 0, 6, 2, 4, 6, 1, 2, 1, 5, 2, 0, 1, 1, 2, 3, 1, 2, 1, 1, 2, 0, 0,
         5, 1, 6, 1, 1, 2, 3, 0, 5, 5, 0, 3, 6, 2, 1, 1, 1, 0, 0, 2, 4, 6, 1, 2,
         3, 0, 1, 6, 1, 1, 0, 2, 1, 5, 0, 1, 2, 1, 2, 1, 2, 1, 0, 0, 2, 1, 5, 1,

Setting up the pretrained BERT model

In [None]:
from transformers import BertForSequenceClassification

# Pretrained BERT encoder with a classification head sized to the number of
# categories; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=len(label_dict),
)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Creating data loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 4

# Training batches are drawn in a new random order every epoch.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation is only used for scoring, so it is read in its stored order:
# shuffling adds nothing to the metrics and would make the order of the
# returned predictions differ between evaluation runs.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=32
)

Setting up optimizer and scheduler

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour of
# it. `weight_decay=0.0` keeps the default of the transformers implementation
# (torch's own default is 0.01).
optimizer = optim.AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8,
    weight_decay = 0.0
)

In [None]:
epochs = 10

# Learning-rate schedule with no warm-up, spanning every optimizer step of the
# run (batches per epoch x epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=epochs * len(dataloader_train),
    num_warmup_steps=0,
)

Defining our Performance metrics

In [None]:
import numpy as np
from sklearn.metrics import f1_score

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    preds:  2-D array of per-class scores, one row per sample.
    labels: array of true class ids (flattened before scoring).
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')


In [None]:
def accuracy_per_class(preds, labels, label_map=None):
  """Print, for each class present in ``labels``, how many samples were predicted correctly.

  Args:
      preds: 2-D array of per-class scores, one row per sample; the predicted
          class is the arg-max along axis 1.
      labels: NumPy array of true integer class ids (flattened before use).
      label_map: optional ``{class name: class id}`` mapping used to print
          readable class names. Defaults to the notebook-level ``label_dict``.

  Returns:
      None. One ``Class`` line and one ``correct/total`` line are printed per class.
  """
  if label_map is None:
    label_map = label_dict
  id_to_name = {class_id: name for name, class_id in label_map.items()}

  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()

  for label in np.unique(labels_flat):
    is_label = labels_flat == label
    n_total = int(is_label.sum())
    n_correct = int((preds_flat[is_label] == label).sum())

    print(f'Class : {id_to_name[label]}')
    print(f'Accuracy:{n_correct}/{n_total}\n')
    

Creating training loop

In [None]:
import random

seed_val = 17

# Seed Python, NumPy and PyTorch (CPU and all GPUs) with the same value.
# NOTE(review): the model and data loaders were created in earlier cells, so
# anything randomly initialised there is not governed by this seed.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(device)

cuda


In [None]:
def evaluate(dataloader_val):
  """Run the notebook-level ``model`` over ``dataloader_val`` without gradient tracking.

  Each batch is expected to hold (input ids, attention mask, labels) in that
  order. Returns a tuple of:
    - the batch losses averaged over the number of batches,
    - the logits of all samples stacked into one NumPy array,
    - the true labels of all samples as one NumPy array.
  """
  model.eval()

  running_loss = 0
  logits_per_batch = []
  labels_per_batch = []

  for batch in tqdm(dataloader_val):
    batch = tuple(tensor.to(device) for tensor in batch)
    input_ids, attention_mask, labels = batch[0], batch[1], batch[2]

    with torch.no_grad():
      outputs = model(input_ids=input_ids,
                      attention_mask=attention_mask,
                      labels=labels)

    # With labels supplied, the first two outputs are the loss and the logits.
    running_loss += outputs[0].item()
    logits_per_batch.append(outputs[1].detach().cpu().numpy())
    labels_per_batch.append(labels.cpu().numpy())

  loss_val_avg = running_loss / len(dataloader_val)
  predictions = np.concatenate(logits_per_batch, axis=0)
  true_vals = np.concatenate(labels_per_batch, axis=0)
  return loss_val_avg, predictions, true_vals


In [None]:
# Fine-tune for `epochs` passes over the training data; after each pass, report
# the mean training loss plus the validation loss and weighted F1 score.
for epoch in tqdm(range(1, epochs+1)):

  model.train()
  loss_train_total = 0

  progress_bar = tqdm(dataloader_train,
                      desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

  for batch in progress_bar:
    model.zero_grad()
    batch = tuple(b.to(device) for b in batch)
    inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

    outputs = model(**inputs)
    loss = outputs[0]
    batch_loss = loss.item()
    loss_train_total += batch_loss
    loss.backward()

    # Cap the gradient norm at 1.0 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()

    # Show the batch loss as-is: it is already a single scalar per batch, and
    # `len(batch)` is the number of tensors in the tuple (3), not the batch size.
    progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

  # f-string so the epoch number is printed instead of the literal "{epoch}".
  tqdm.write(f'\nEpoch {epoch}')

  loss_train_avg = loss_train_total / len(dataloader_train)
  tqdm.write(f'Training loss : {loss_train_avg}')

  val_loss, predictions, true_vals = evaluate(dataloader_val)
  val_f1 = f1_score_func(predictions, true_vals)
  tqdm.write(f'Validation loss: {val_loss}')
  tqdm.write(f'F1 Score (weighted): {val_f1}')

       

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 1.1942556752832394


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.7406394396509443
F1 Score (weighted): 0.7057881146077715


Epoch 2:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.55955999647343


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5517284912722451
F1 Score (weighted): 0.782889263468592


Epoch 3:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.33483301808129307


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5067713675754411
F1 Score (weighted): 0.8860886051924293


Epoch 4:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.191433184041493


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.6952243319579533
F1 Score (weighted): 0.8547468863475914


Epoch 5:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.10717392948771605


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.7137945124081203
F1 Score (weighted): 0.86927873713588


Epoch 6:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.06194072764014965


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.7442159865583692
F1 Score (weighted): 0.8794051846501519


Epoch 7:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.049009046292445145


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.8456384880202157
F1 Score (weighted): 0.863753521520924


Epoch 8:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.029496368712060007


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.8048732834202903
F1 Score (weighted): 0.873696880773039


Epoch 9:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.02612674525174455


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.866913378238678
F1 Score (weighted): 0.8591717188294028


Epoch 10:   0%|          | 0/307 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.02090298613466638


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.8238621524402073
F1 Score (weighted): 0.8692042596420476


Evaluating our model

In [None]:
# Per-class correct/total counts on the validation set, using the predictions
# left over from the last training epoch.
accuracy_per_class(predictions, true_vals)

Class : Story
Accuracy:24/42

Class : Partnership
Accuracy:68/71

Class : Award
Accuracy:51/51

Class : Merger/Invest
Accuracy:8/10

Class : People
Accuracy:6/6

Class : Financials
Accuracy:23/25

Class : Solution
Accuracy:10/12

