# Sentiment Analysis with ParsBERT

In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Fri Jun 24 07:58:35 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install required packages

!pip install -q transformers
!pip install -q hazm
!pip install -q clean-text[gpl]
!pip install pyyaml==5.4.1

[K     |████████████████████████████████| 4.4 MB 5.3 MB/s 
[K     |████████████████████████████████| 101 kB 11.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 45.7 MB/s 
[K     |████████████████████████████████| 596 kB 65.1 MB/s 
[K     |████████████████████████████████| 316 kB 5.0 MB/s 
[K     |████████████████████████████████| 233 kB 48.6 MB/s 
[K     |████████████████████████████████| 1.4 MB 61.9 MB/s 
[?25h  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 175 kB 5.4 MB/s 
[K     |████████████████████████████████| 53 kB 2.1 MB/s 
[K     |████████████████████████████████| 235 kB 62.5 MB/s 
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyyaml==5.4.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |█

In [None]:
# Import required packages

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import hazm
from cleantext import clean

import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

import os
import re
import json
import copy
import collections

  defaults = yaml.load(f)


## Dataset

### Load the data using Pandas

In [None]:
# Download the dataset file from Google Drive by its file id (read by the next cell).
!gdown --id 1YFEJVXIGxuQ1wiWY2gL7uJJ1QEl3SDXS

In [None]:
import json

# Parse the annotated dataset into a dictionary.
# The context manager closes the file handle once parsing is done, and the
# explicit encoding keeps the Persian text independent of the platform default.
with open('dataset_annotated_impact.json', encoding='utf-8') as f:
    dataset = json.load(f)

In [None]:
def _majority_label(annotations):
    """Return the most frequent annotation.

    Ties are resolved in favour of the annotation that appears first in the
    list, so the result is reproducible across runs (iterating a ``set`` of
    strings is not, because string hashing is randomised per process).
    """
    return collections.Counter(annotations).most_common(1)[0][0]


def _labelled_examples(records):
    """Return ``(texts, labels)`` for the records that have at least one annotation."""
    annotated = [record for record in records if len(record["annotations"]) != 0]
    texts = [record["text"] for record in annotated]
    majority_labels = [_majority_label(record["annotations"]) for record in annotated]
    return texts, majority_labels


# Train
train_text, train_label = _labelled_examples(dataset["train"])
# Eval
eval_text, eval_label = _labelled_examples(dataset["eval"])
# Test
test_text, test_label = _labelled_examples(dataset["test"])

In [None]:
# Sanity check: every split should have exactly one label per text.
print(len(train_text), len(train_label))
print(len(eval_text), len(eval_label))
print(len(test_text), len(test_label))

1448 1448
75 75
74 74


In [None]:
# One two-column frame per split: the raw text ("news") and its majority label ("label").
train_data = pd.DataFrame(dict(news=train_text, label=train_label))
eval_data = pd.DataFrame(dict(news=eval_text, label=eval_label))
test_data = pd.DataFrame(dict(news=test_text, label=test_label))

In [None]:
# print data information
print('data information')
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
train_data.info()
print()

# print missing values information
print('missing values stats')
print(train_data.isnull().sum(), '\n')

# print some missing values
print('some missing values')
print(train_data[train_data['label'].isnull()].iloc[:5], '\n')

data information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1448 entries, 0 to 1447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   news    1448 non-null   object
 1   label   1448 non-null   object
dtypes: object(2)
memory usage: 22.8+ KB
None 

missing values stats
news     0
label    0
dtype: int64 

some missing values
Empty DataFrame
Columns: [news, label]
Index: [] 



In [None]:
# Split names; the following cells loop over them to address train_data / eval_data / test_data.
types = ["train", "eval", "test"]

In [None]:
# Translate the Persian label names to English identifiers.
# Iterating over the frames directly avoids building code strings for exec()
# and avoids a loop variable named `type`, which would shadow the builtin.
LABEL_TRANSLATION = {
    'جریان‌ساز': 'impact',
    'غیر جریان‌ساز': 'nonimpact',
}

for split_df in (train_data, eval_data, test_data):
    # Series.replace matches whole cell values, so the shorter label cannot
    # accidentally rewrite part of the longer one.
    split_df['label'] = split_df['label'].replace(LABEL_TRANSLATION)

In [None]:
# handle some conflicts with the dataset structure
# you can find a reliable solution, for the sake of the simplicity
# I just remove these bad combinations!
# Some texts occur more than once within a split. For simplicity only the
# first occurrence is kept; later duplicates are dropped.
deduplicated = {}

for split_name, split_df in (('train', train_data), ('eval', eval_data), ('test', test_data)):
    print(split_name.upper(), '\n')
    split_df = split_df.drop_duplicates(subset=['news'], keep='first').reset_index(drop=True)
    deduplicated[split_name] = split_df

    # same overview as before, now after removing the duplicates

    # print data information
    print('data information')
    split_df.info()  # info() prints its own report and returns None

    # print missing values information
    print('missing values stats')
    print(split_df.isnull().sum())

    # print some missing values
    print('some missing values')
    print(split_df[split_df['label'].isnull()])
    print("#"*40)

# Rebind the notebook-level names to the deduplicated frames.
train_data, eval_data, test_data = (deduplicated[name] for name in ('train', 'eval', 'test'))

TRAIN 

data information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1448 entries, 0 to 1447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   news    1448 non-null   object
 1   label   1448 non-null   object
dtypes: object(2)
memory usage: 22.8+ KB
None
missing values stats
news     0
label    0
dtype: int64
some missing values
Empty DataFrame
Columns: [news, label]
Index: []
########################################
EVAL 

data information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   news    75 non-null     object
 1   label   75 non-null     object
dtypes: object(2)
memory usage: 1.3+ KB
None
missing values stats
news     0
label    0
dtype: int64
some missing values
Empty DataFrame
Columns: [news, label]
Index: []
########################################
TEST 

data information
<class '

### Normalization / Preprocessing

In [None]:
# Word count of every text (hazm tokenizer), stored as a new column on each split.
# The frames are iterated directly: no exec() and no loop variable shadowing `type`.
for split_df in (train_data, eval_data, test_data):
    split_df['news_len_by_words'] = split_df['news'].apply(lambda t: len(hazm.word_tokenize(t)))

In [None]:
# Shortest and longest text (in words) of each split.
for split_name, split_df in (('train', train_data), ('eval', eval_data), ('test', test_data)):
    print(split_name.upper())
    min_max_len = split_df['news_len_by_words'].min(), split_df['news_len_by_words'].max()
    print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')
    print("#"*40)

TRAIN
Min: 13 	Max: 613
########################################
EVAL
Min: 23 	Max: 281
########################################
TEST
Min: 15 	Max: 232
########################################


In [None]:
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='news_len_by_words'):
    """Print and return the share of rows whose length lies in ``(greater_than, less_than]``.

    Args:
        data: DataFrame holding the length column.
        less_than: inclusive upper bound of the accepted length.
        greater_than: exclusive lower bound of the accepted length.
        col: name of the column that stores the lengths.

    Returns:
        The percentage (0-100) of rows inside the interval; ``0.0`` for an
        empty frame instead of raising ``ZeroDivisionError``.
    """
    lengths = data[col]
    total = len(lengths)

    if total:
        # vectorised comparison; NaN lengths fall outside the interval
        inside = int(((lengths > greater_than) & (lengths <= less_than)).sum())
        data_glt_rate = (inside / total) * 100
    else:
        data_glt_rate = 0.0

    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')
    return data_glt_rate

In [None]:
# Share of training texts with more than 12 and at most 290 words.
data_gl_than(train_data, 290, 12)

Texts with word length of greater than 12 and less than 290 includes 98.55% of the whole!


In [None]:
# Word-count window applied by the filtering cells below: minlim < length <= maxlim.
minlim, maxlim = 12, 290

In [None]:
# Keep only the texts whose word count lies in (minlim, maxlim].
# A boolean mask replaces the exec()-built "mark as None, then dropna" sequence
# and avoids a loop variable named `type`.
train_data, eval_data, test_data = [
    split_df[
        (split_df['news_len_by_words'] > minlim) & (split_df['news_len_by_words'] <= maxlim)
    ].reset_index(drop=True)
    for split_df in (train_data, eval_data, test_data)
]

In [None]:
# Histogram of the word counts of the training texts.
word_count_layout = dict(
    title_text='Distribution of word counts within news',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)

fig = go.Figure(data=[go.Histogram(x=train_data['news_len_by_words'])])
fig.update_layout(**word_count_layout)

fig.show()

In [None]:
# Distinct label values of the training split, in alphabetical order.
unique_labels = sorted(train_data['label'].unique())
print(f'We have #{len(unique_labels)}: {unique_labels}')

We have #2: ['impact', 'nonimpact']


In [None]:
# Bar chart: number of training texts per label.
groupby_rate = train_data.groupby('label')['label'].count()
rate_counts = groupby_rate.tolist()

fig = go.Figure(data=[
    go.Bar(
        x=sorted(groupby_rate.index),
        y=rate_counts,
        text=rate_counts,
        textposition='auto',
    )
])

rate_layout = dict(
    title_text='Distribution of rate within news',
    xaxis_title_text='Rate',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)
fig.update_layout(**rate_layout)

fig.show()

In [None]:
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed (non-greedy match)."""
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)


# Compiled once instead of on every call: emoji, pictographs, directional
# isolates and similar symbols that should not reach the tokenizer.
_WEIRD_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"  (zero-width non-joiner is deliberately NOT removed)
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)

# Created lazily on the first call and then reused: building a hazm normalizer
# for every single text is pure overhead when a whole column is cleaned.
_HAZM_NORMALIZER = None


def cleaning(text):
    """Normalize one raw news text and return the cleaned string.

    Steps, in order: strip surrounding whitespace; run ``cleantext.clean``
    (lower-casing, removal of URLs / e-mails / phone numbers / currency
    symbols, line breaks flattened); drop ``<...>`` tags via ``cleanhtml``;
    apply the hazm normalizer; remove the symbols matched by
    ``_WEIRD_PATTERN``; drop ``#`` characters; collapse whitespace runs into a
    single space.

    Args:
        text: the raw text (``str``).

    Returns:
        The cleaned text.
    """
    global _HAZM_NORMALIZER

    text = text.strip()

    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    if _HAZM_NORMALIZER is None:
        _HAZM_NORMALIZER = hazm.Normalizer()
    text = _HAZM_NORMALIZER.normalize(text)

    # removing weird patterns
    text = _WEIRD_PATTERN.sub(r'', text)

    # removing hashtags and extra spaces
    text = re.sub("#", "", text)
    # raw string: "\s" in a normal string literal is an invalid escape sequence
    text = re.sub(r"\s+", " ", text)

    return text

In [None]:
def _clean_split(split_df):
    """Add the cleaned text and its word count, then keep rows inside (minlim, maxlim]."""
    # cleaning the news texts
    cleaned = split_df['news'].apply(cleaning)

    # length of the cleaned texts based on their words
    cleaned_len = cleaned.apply(lambda t: len(hazm.word_tokenize(t)))

    enriched = split_df.assign(cleaned_news=cleaned, cleaned_news_len_by_words=cleaned_len)

    # Drop texts whose cleaned length falls outside the window. The previous
    # lambda returned `len_t` in both branches, so no row was ever marked for
    # removal and the subsequent dropna() had nothing to drop.
    inside_window = (cleaned_len > minlim) & (cleaned_len <= maxlim)
    return enriched[inside_window].reset_index(drop=True)


train_data, eval_data, test_data = [
    _clean_split(split_df) for split_df in (train_data, eval_data, test_data)
]

train_data.head()

Unnamed: 0,news,label,news_len_by_words,cleaned_news,cleaned_news_len_by_words
0,آتش سوزی کارخانه جمیل نخ گسترده است / اعزام نی...,nonimpact,100.0,آتش سوزی کارخانه جمیل نخ گسترده است / اعزام نی...,100
1,تجهیزات باکیفیت صنعتی آشپزخانه و کافی شاپ\n___...,nonimpact,115.0,تجهیزات باکیفیت صنعتی آشپزخانه و کافی شاپ ____...,107
2,برگزاری جشنواره رسانه ابوذر در دی ماه سال جاری...,impact,124.0,برگزاری جشنواره رسانه ابوذر در دی ماه سال جاری...,126
3,افزایش 80 درصدی فروش سلاح در آمریکا در ژانویه ...,impact,122.0,افزایش ۸۰ درصدی فروش سلاح در آمریکا در ژانویه ...,115
4,بازداشت دو تروریست که در بمب گذاری مسیر زائران...,nonimpact,95.0,بازداشت دو تروریست که در بمب گذاری مسیر زائران...,92


In [None]:
# Keep only the cleaned text (renamed back to "news") and the label.
# rename() returns a fresh frame, so no column names are assigned on a slice,
# and no exec() / `type` loop variable is needed.
train_data, eval_data, test_data = [
    split_df[['cleaned_news', 'label']].rename(columns={'cleaned_news': 'news'})
    for split_df in (train_data, eval_data, test_data)
]
train_data.head()

Unnamed: 0,news,label
0,آتش سوزی کارخانه جمیل نخ گسترده است / اعزام نی...,nonimpact
1,تجهیزات باکیفیت صنعتی آشپزخانه و کافی شاپ ____...,nonimpact
2,برگزاری جشنواره رسانه ابوذر در دی ماه سال جاری...,impact
3,افزایش ۸۰ درصدی فروش سلاح در آمریکا در ژانویه ...,impact
4,بازداشت دو تروریست که در بمب گذاری مسیر زائران...,nonimpact


In [None]:
# Label values present in the cleaned training split (in order of first appearance).
train_label_values = train_data["label"].unique()
print(f'We have #{len(train_label_values)} labels: {train_label_values}')

We have #2 labels: ['nonimpact' 'impact']


### Handling Unbalanced Data

In [None]:
# Bar chart: label frequencies of the (still unbalanced) training split.
groupby_label = train_data.groupby('label')['label'].count()
label_counts = groupby_label.tolist()

fig = go.Figure(data=[
    go.Bar(
        x=sorted(groupby_label.index),
        y=label_counts,
        text=label_counts,
        textposition='auto',
    )
])

train_dist_layout = dict(
    title_text='Distribution of label within news [TRAIN DATA]',
    xaxis_title_text='Label',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)
fig.update_layout(**train_dist_layout)

fig.show()

In [None]:
# Down-sample the larger class so both labels are equally represented.

# Select each class by its label value. Indexing `unique()` by position depends
# on which label happens to occur first in the frame, so the variable names
# could silently end up swapped.
nonimpact_data = train_data[train_data['label'] == 'nonimpact']
impact_data = train_data[train_data['label'] == 'impact']

# size of the smaller class
cutting_point = min(len(nonimpact_data), len(impact_data))

# Fixed seed so the balanced subset (and everything trained on it) is reproducible.
BALANCE_SEED = 42

# cutting_point never exceeds either class size, so both samples are always valid
nonimpact_data = nonimpact_data.sample(n=cutting_point, random_state=BALANCE_SEED).reset_index(drop=True)
impact_data = impact_data.sample(n=cutting_point, random_state=BALANCE_SEED).reset_index(drop=True)

# merge the two classes and shuffle the rows
new_train_data = pd.concat([nonimpact_data, impact_data])
new_train_data = new_train_data.sample(frac=1, random_state=BALANCE_SEED).reset_index(drop=True)
new_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   news    720 non-null    object
 1   label   720 non-null    object
dtypes: object(2)
memory usage: 11.4+ KB


In [None]:
# Bar chart: label frequencies after balancing the training split.
groupby_label = new_train_data.groupby('label')['label'].count()
balanced_counts = groupby_label.tolist()

fig = go.Figure(data=[
    go.Bar(
        x=sorted(groupby_label.index),
        y=balanced_counts,
        text=balanced_counts,
        textposition='auto',
    )
])

balanced_layout = dict(
    title_text='Distribution of label within news [NEW TRAIN DATA]',
    xaxis_title_text='Label',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)
fig.update_layout(**balanced_layout)

fig.show()

In [None]:
# Preview the balanced, shuffled training frame.
new_train_data.head()

Unnamed: 0,news,label
0,ادامه بلاتکلیفی ۵ میلیارد دلار از پول‌های ایرا...,nonimpact
1,تخصیص ۱۲۳ درصدی اعتبارات سفر رئیس جمهور به ۱۷ ...,impact
2,لیگ تنیس ساحلی| صعود منطقه تیم‌های آزاد انزلی ...,nonimpact
3,جرم امیرکبیر ایستادگی در برابر رانت خواران واب...,impact
4,دومین کنگره ملی شعر و داستان امر به معروف آغاز...,impact


## Train, Validation, Test

In [None]:
# Label vocabulary in alphabetical order. Taking the labels in order of first
# appearance would make the label -> id mapping depend on the random shuffle of
# the training rows, so the ids could flip between runs.
labels = sorted(new_train_data["label"].unique().tolist())

# labels.index raises ValueError for a label that is missing from the training vocabulary
new_train_data["label_id"] = new_train_data["label"].apply(lambda t: labels.index(t))
train_data = new_train_data
eval_data["label_id"] = eval_data["label"].apply(lambda t: labels.index(t))
test_data["label_id"] = test_data["label"].apply(lambda t: labels.index(t))

print(train_data.shape)
print(eval_data.shape)
print(test_data.shape)

(720, 3)
(75, 3)
(74, 3)


## PyTorch

In [None]:
from transformers import BertConfig, BertTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

### Configuration

In [None]:
# Pick the first CUDA device when one is available, otherwise fall back to the CPU.
train_on_gpu = torch.cuda.is_available()

device = torch.device("cuda:0" if train_on_gpu else "cpu")
print(f'device: {device}')

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

device: cuda:0
CUDA is available!  Training on GPU ...


In [None]:
# general config
MAX_LEN = 256  # passed as max_len (tokenizer max_length) to every data loader
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 10
# Passed to train_op as `print_every_step`: the in-loop evaluation runs when the
# global step is a multiple of this value.
# NOTE(review): the recorded run shows 45 batches per epoch, i.e. 450 steps over
# 10 epochs, so 1000 is never reached — confirm this is intended.
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0  # train_op only clips gradients when this is > 0.0

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
# File that eval_callback writes the model state_dict to.
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin'

# make sure the checkpoint directory exists
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# create a key finder based on label 2 id and id to label

# Lookup tables between label names and integer ids (position in `labels`).
label2id = {}
for position, label_name in enumerate(labels):
    label2id[label_name] = position

id2label = {label_id: label_name for label_name, label_id in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'nonimpact': 0, 'impact': 1}
id2label: {0: 'nonimpact', 1: 'impact'}


In [None]:
# setup the tokenizer and configuration

# Load the pretrained tokenizer and the model configuration; the label lookup
# tables are stored in the configuration.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
)

print(config.to_json_string())

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "nonimpact",
    "1": "impact"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "impact": 1,
    "nonimpact": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



### Input Embeddings

In [None]:
# Pick one random training example to illustrate tokenization below.
idx = np.random.randint(0, len(train_data))
sample_row = train_data.iloc[idx]
sample_comment = sample_row['news']
sample_label = sample_row['label']

print(f'Sample: \n{sample_comment}\n{sample_label}')

Sample: 
شاخص‌های نظام آزمایشگاهی ارتقاء پیدا کند ____________ دکتر پیمان صالحی در همایش ملی مجازی بزرگداشت زادروز حکیم اسماعیل جرجانی و روز ملی آزمایشگاه که به میزبانی دانشگاه لرستان برگزار شد در سخنانی اهمیت فعالیت‌های آزمایشگاهی و شرکت‌های دانش بنیان و تجاری سازی محصولات را توضیح داد. معاون پژوهشی وزیر علوم، تحقیقات و فناوری، تصریح کرد: بایستی شاخص‌های نظام آزمایشگاهی ارتقاء پیدا کند
nonimpact


In [None]:
# Tokenize the sample and map the tokens to their vocabulary ids.
tokens = tokenizer.tokenize(sample_comment)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

for caption, shown in (
    ('  News', sample_comment),
    ('   Tokens', tokenizer.convert_tokens_to_string(tokens)),
    ('Token IDs', token_ids),
):
    print(f'{caption}: {shown}')

  News: شاخص‌های نظام آزمایشگاهی ارتقاء پیدا کند ____________ دکتر پیمان صالحی در همایش ملی مجازی بزرگداشت زادروز حکیم اسماعیل جرجانی و روز ملی آزمایشگاه که به میزبانی دانشگاه لرستان برگزار شد در سخنانی اهمیت فعالیت‌های آزمایشگاهی و شرکت‌های دانش بنیان و تجاری سازی محصولات را توضیح داد. معاون پژوهشی وزیر علوم، تحقیقات و فناوری، تصریح کرد: بایستی شاخص‌های نظام آزمایشگاهی ارتقاء پیدا کند
   Tokens: شاخصهای نظام ازمایشگاهی ارتقاء پیدا کند [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] دکتر پیمان صالحی در همایش ملی مجازی بزرگداشت زادروز حکیم اسماعیل جرجانی و روز ملی ازمایشگاه که به میزبانی دانشگاه لرستان برگزار شد در سخنانی اهمیت فعالیتهای ازمایشگاهی و شرکتهای دانش بنیان و تجاری سازی محصولات را توضیح داد . معاون پژوهشی وزیر علوم ، تحقیقات و فناوری ، تصریح کرد : بایستی شاخصهای نظام ازمایشگاهی ارتقاء پیدا کند
Token IDs: [9512, 3861, 12147, 9307, 3510, 3054, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4283, 6207, 11451, 2786, 6971, 3333, 5427, 12965, 33728, 11083, 7799, 52666

In [None]:
# Encode the sample the same way the dataset class does, with a short max_length for display.
sample_encode_options = dict(
    max_length=32,
    truncation=True,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    return_token_type_ids=True,
    return_attention_mask=True,
    padding='max_length',
    return_tensors='pt',  # Return PyTorch tensors
)
encoding = tokenizer.encode_plus(sample_comment, **sample_encode_options)

print(f'Keys: {encoding.keys()}\n')
for k, encoded_value in encoding.items():
    print(f'{k}:\n{encoded_value}')

Keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

input_ids:
tensor([[    2,  9512,  3861, 12147,  9307,  3510,  3054,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,  4283,
          6207, 11451,  2786,  6971,  3333,  5427, 12965, 33728, 11083,  7799,
         52666,     4]])
token_type_ids:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])


### Dataset

In [None]:
class NewsDataset(torch.utils.data.Dataset):
    """PyTorch dataset that tokenizes news texts on access.

    Args:
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        news: indexable collection of texts.
        targets: optional ``list`` / ``np.ndarray`` of labels aligned with
            ``news``. Any other value (e.g. ``None``) yields items without a
            ``'targets'`` entry.
        label_list: optional ``list`` of label names; a target found in it is
            replaced by its position in the list.
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, tokenizer, news, targets=None, label_list=None, max_len=128):
        self.news = news
        self.targets = targets
        self.has_target = isinstance(targets, (list, np.ndarray))

        self.tokenizer = tokenizer
        self.max_len = max_len

        # Keys are stringified because __getitem__ looks targets up by their string form.
        self.label_map = {str(label): i for i, label in enumerate(label_list)} if isinstance(label_list, list) else {}

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.news)

    def __getitem__(self, item):
        """Return the encoded example at index ``item`` as a dict of tensors (plus the raw text)."""
        new = str(self.news[item])

        if self.has_target:
            raw_target = self.targets[item]
            # Label names are mapped to ids; a target that is not a known label
            # (e.g. already a numeric id) is passed through unchanged instead of
            # being turned into a string that torch.tensor() cannot convert.
            target = self.label_map.get(str(raw_target), raw_target)

        encoding = self.tokenizer.encode_plus(
            new,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

        # flatten() removes the leading batch dimension added by return_tensors='pt'
        inputs = {
            'news': new,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

        if self.has_target:
            inputs['targets'] = torch.tensor(target, dtype=torch.long)

        return inputs


def create_data_loader(x, y, tokenizer, max_len, batch_size, label_list, shuffle=False):
    """Wrap texts (and optional labels) in a ``NewsDataset`` and return a ``DataLoader``.

    Args:
        x: texts.
        y: labels aligned with ``x``, or ``None`` for unlabelled data.
        tokenizer: tokenizer handed to ``NewsDataset``.
        max_len: padded / truncated sequence length.
        batch_size: number of examples per batch.
        label_list: label names used to turn labels into ids.
        shuffle: reshuffle the examples every epoch. Defaults to ``False``,
            which keeps the previous behaviour; pass ``True`` for training data.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    dataset = NewsDataset(
        news=x,
        targets=y,
        tokenizer=tokenizer,
        max_len=max_len,
        label_list=label_list)

    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# One loader per split. The test loader is built without labels, so its batches carry no 'targets'.
label_list = labels

train_data_loader = create_data_loader(
    x=train_data['news'].to_numpy(),
    y=train_data['label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=TRAIN_BATCH_SIZE,
    label_list=label_list,
)
valid_data_loader = create_data_loader(
    x=eval_data['news'].to_numpy(),
    y=eval_data['label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=VALID_BATCH_SIZE,
    label_list=label_list,
)
test_data_loader = create_data_loader(
    x=test_data['news'].to_numpy(),
    y=None,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=TEST_BATCH_SIZE,
    label_list=label_list,
)

In [None]:
# Fetch the first training batch and show what each field looks like.
sample_data = next(iter(train_data_loader))

print(sample_data.keys())

print(sample_data['news'])

# shape of the batch tensor, followed by its first row
for field in ('input_ids', 'attention_mask', 'token_type_ids'):
    print(sample_data[field].shape)
    print(sample_data[field][0, :])

print(sample_data['targets'].shape)
print(sample_data['targets'][0])

dict_keys(['news', 'input_ids', 'attention_mask', 'token_type_ids', 'targets'])
['ادامه بلاتکلیفی ۵ میلیارد دلار از پول\u200cهای ایران در عراق ____________ در همین رابطه سید حمید حسینی – عضو هیئت مدیره اتاق بازرگانی مشترک ایران و عراق – با بیان اینکه این موضوع خبر جدیدی نیست، به ایسنا توضیح داد: از چند ماه قبل بحث استفاده از منابع ایران در بانک tbi عراق مطرح شده بود و حتی بخشی از سهمیه واکسن ایران در کواکس نیز از این طریق نهایی و واکسن به کشور وارد شد اما در عمل برای استفاده گسترده\u200cتر از این منابع مشکلاتی وجود دارد. به گفته حسینی، اخیرا صحبت هایی صورت گرفته که در صورت به نتیجه رسیدن مذاکرات برجام و لغو تحریم\u200cها امکان آزاد شدن این منابع و دسترسی ایران به این پول\u200cها فراهم شود و به نظر می\u200cرسد راه حل نهایی بیش از آنکه در مذاکرات دو جانبه ایران و عراق تعیین شود، در وین مشخص خواهد شد', 'تخصیص ۱۲۳ درصدی اعتبارات سفر رئیس جمهور به ۱۷ استان کشور ____________ به گزارش ایمنا، صولت مرتضوی با اشاره به تخصیص ۱۲۳ درصدی اعتبارات سفر رئیس جمهور به ۱۷ استان کشور، تصریح کرد: اساس کار 

In [None]:
# First batch of the test loader; it was built with targets=None, so no 'targets' key is expected.
sample_test = next(iter(test_data_loader))
print(sample_test.keys())

dict_keys(['news', 'input_ids', 'attention_mask', 'token_type_ids'])


In [None]:
class SentimentModel(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification layer."""

    def __init__(self, config):
        """Build the layers; dropout rate, hidden size and label count are read from ``config``."""
        super().__init__()

        self.bert = BertModel.from_pretrained(MODEL_NAME_OR_PATH)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier logits for a batch of encoded inputs."""
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)
        # the second element of the encoder output is fed to the classifier head
        pooled = self.dropout(encoder_outputs[1])
        return self.classifier(pooled)

In [None]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()
pt_model = None

!nvidia-smi

Sun Jun 19 11:38:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    34W /  70W |   6774MiB / 15109MiB |     31%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
pt_model = SentimentModel(config=config)
pt_model = pt_model.to(device)

# Earlier cells may have used `type` as a loop variable, which shadows the
# builtin at notebook level. Remove that name explicitly (a no-op when it does
# not exist) instead of catching every possible exception with a bare `except`.
globals().pop('type', None)
print('pt_model', type(pt_model))

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


pt_model <class '__main__.SentimentModel'>


In [None]:
# sample data output

sample_data_news = sample_data['news']
sample_data_input_ids = sample_data['input_ids']
sample_data_attention_mask = sample_data['attention_mask']
sample_data_token_type_ids = sample_data['token_type_ids']
sample_data_targets = sample_data['targets']

# available for using in GPU
sample_data_input_ids = sample_data_input_ids.to(device)
sample_data_attention_mask = sample_data_attention_mask.to(device)
sample_data_token_type_ids = sample_data_token_type_ids.to(device)
sample_data_targets = sample_data_targets.to(device)


# outputs = F.softmax(
#     pt_model(sample_data_input_ids, sample_data_attention_mask, sample_data_token_type_ids),
#     dim=1)

outputs = pt_model(sample_data_input_ids, sample_data_attention_mask, sample_data_token_type_ids)
_, preds = torch.max(outputs, dim=1)

print(outputs[:5, :])
print(preds[:5])

tensor([[-0.6578, -0.1789],
        [-0.3224, -0.4150],
        [-0.2027, -0.0670],
        [-0.5855, -0.1393],
        [-0.2952, -0.3138]], device='cuda:0', grad_fn=<SliceBackward0>)
tensor([1, 0, 1, 1, 0], device='cuda:0')


### Training

In [None]:
def simple_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` and ``y_pred`` agree.

    Both arguments may be NumPy arrays or plain sequences; they are converted
    with ``np.asarray`` so the comparison is always element-wise (comparing two
    Python lists with ``==`` yields a single bool, which has no ``mean``).
    """
    return (np.asarray(y_true) == np.asarray(y_pred)).mean()

def acc_and_f1(y_true, y_pred, average='weighted'):
    """Return ``{'acc': ..., 'f1': ...}`` for the given labels and predictions.

    ``average`` is forwarded to ``f1_score``.
    """
    return {
        "acc": simple_accuracy(y_true, y_pred),
        "f1": f1_score(y_true=y_true, y_pred=y_pred, average=average),
    }

def y_loss(y_true, y_pred, losses):
    """Stack the collected tensors into NumPy arrays and average the losses.

    Args:
        y_true: list of target tensors.
        y_pred: list of prediction tensors.
        losses: list of per-batch loss values.

    Returns:
        ``([y_true_array, y_pred_array], mean_loss)``.
    """
    def _as_numpy(tensors):
        return torch.stack(tensors).cpu().detach().numpy()

    return [_as_numpy(y_true), _as_numpy(y_pred)], np.mean(losses)


def eval_op(model, data_loader, loss_fn):
    """Run ``model`` over ``data_loader`` without gradient tracking.

    The model is put into evaluation mode and is left in that mode.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        data_loader: yields batches containing those three fields plus ``'targets'``.
        loss_fn: callable ``(outputs, targets) -> loss``.

    Returns:
        ``([y_true, y_pred], mean_loss)`` as produced by ``y_loss``.
    """
    model.eval()

    batch_losses, predictions, references = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader), desc="Evaluation... "):
            # move the batch tensors to the configured device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            targets = batch['targets'].to(device)

            # forward pass
            scores = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            # predicted class = position of the largest score in each row
            preds = torch.max(scores, dim=1)[1]

            # loss of this batch
            batch_losses.append(loss_fn(scores, targets).item())

            predictions.extend(preds)
            references.extend(targets)

    return y_loss(references, predictions, batch_losses)


def train_op(model,
             data_loader,
             loss_fn,
             optimizer,
             scheduler,
             step=0,
             print_every_step=100,
             eval=False,
             eval_cb=None,
             eval_loss_min=np.inf,
             eval_data_loader=None,
             clip=0.0):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        data_loader: yields batches containing those three fields plus ``'targets'``.
        loss_fn: callable ``(outputs, targets) -> loss``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        step: global step counter to continue from.
        print_every_step: when ``eval`` is true, evaluate every time the global
            step is a multiple of this value.
        eval: enable the periodic in-loop evaluation.
        eval_cb: optional callable ``(model, step, train_score, train_loss,
            eval_score, eval_loss, eval_loss_min) -> eval_loss_min``.
        eval_loss_min: best validation loss seen so far.
        eval_data_loader: loader used by the in-loop evaluation.
        clip: maximum gradient norm; clipping is skipped when ``<= 0.0``.

    Returns:
        ``(train_y, train_loss, step, eval_loss_min)`` where ``train_y`` is
        ``[y_true, y_pred]`` and ``train_loss`` the mean batch loss of this pass.
    """
    model.train()

    losses = []
    y_pred = []
    y_true = []

    for dl in tqdm(data_loader, total=len(data_loader), desc="Training... "):
        step += 1

        # move the batch tensors to the configured device
        input_ids = dl['input_ids'].to(device)
        attention_mask = dl['attention_mask'].to(device)
        token_type_ids = dl['token_type_ids'].to(device)
        targets = dl['targets'].to(device)

        # clear the gradients of all optimized variables
        optimizer.zero_grad()

        # compute predicted outputs by passing inputs to the model
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        # predicted class = position of the largest score in each row
        _, preds = torch.max(outputs, dim=1)

        # calculate the batch loss
        loss = loss_fn(outputs, targets)

        # accumulate all the losses
        losses.append(loss.item())

        # compute gradient of the loss with respect to model parameters
        loss.backward()

        # optional gradient-norm clipping against exploding gradients
        if clip > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

        # perform optimization step
        optimizer.step()

        # perform scheduler step
        scheduler.step()

        y_pred.extend(preds)
        y_true.extend(targets)

        if eval and step % print_every_step == 0:
            # The running training metrics are only needed for the report, so
            # they are computed here rather than after every batch (each
            # computation stacks and copies all predictions collected so far).
            train_y, train_loss = y_loss(y_true, y_pred, losses)
            train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

            eval_y, eval_loss = eval_op(model, eval_data_loader, loss_fn)
            eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

            if callable(eval_cb):
                eval_loss_min = eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min)

            # eval_op() switched the model to evaluation mode; switch back so
            # the remaining batches of this pass train with dropout enabled.
            model.train()

    train_y, train_loss = y_loss(y_true, y_pred, losses)

    return train_y, train_loss, step, eval_loss_min

In [None]:
# Optimizer and a linearly decaying learning-rate schedule (no warm-up) over all training steps.
# NOTE(review): `AdamW` is imported from transformers in this notebook; recent
# transformers releases deprecate it in favour of torch.optim.AdamW — verify
# against the installed version before upgrading.
optimizer = AdamW(pt_model.parameters(), lr=LEARNING_RATE, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()

step = 0
# `np.inf` is the spelling available in every NumPy version (`np.Inf` was removed in NumPy 2.0)
eval_loss_min = np.inf
history = collections.defaultdict(list)


def eval_callback(epoch, epochs, output_path):
    """Build the evaluation callback for one training epoch.

    Args:
        epoch: Current epoch number; only used in the log line.
        epochs: Total number of epochs; only used in the log line.
        output_path: Destination handed to ``torch.save`` for the checkpoint.

    Returns:
        A callable ``eval_cb(model, step, train_score, train_loss, eval_score,
        eval_loss, eval_loss_min)``. It prints one progress line, saves
        ``model.state_dict()`` to ``output_path`` when ``eval_loss`` is less
        than or equal to ``eval_loss_min``, and returns the (possibly updated)
        minimum validation loss.
    """
    def eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min):
        # Assemble the progress line from its segments and emit it in one print.
        segments = [
            'Epoch: {}/{}...'.format(epoch, epochs),
            'Step: {}...'.format(step),
            'Train Loss: {:.6f}...'.format(train_loss),
            'Train Acc: {:.3f}...'.format(train_score['acc']),
            'Valid Loss: {:.6f}...'.format(eval_loss),
            'Valid Acc: {:.3f}...'.format(eval_score['acc']),
        ]
        print(''.join(segments))

        # Not an improvement (this also covers NaN comparisons): keep the
        # previous best value and skip checkpointing.
        if not (eval_loss <= eval_loss_min):
            return eval_loss_min

        message = 'Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'
        print(message.format(eval_loss_min, eval_loss))

        # Persist only the weights; the new best loss is handed back to the caller.
        torch.save(model.state_dict(), output_path)
        return eval_loss

    return eval_cb


# Epoch loop: train, evaluate on the validation loader, and record the
# accuracy/loss of both splits in `history`.
for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs... "):
    # `step` and `eval_loss_min` are carried over between epochs via the
    # return value of `train_op`.
    train_y, train_loss, step, eval_loss_min = train_op(
        model=pt_model, data_loader=train_data_loader,
        loss_fn=loss_fn, optimizer=optimizer, scheduler=scheduler,
        step=step, print_every_step=EEVERY_EPOCH, eval=True,
        eval_cb=eval_callback(epoch, EPOCHS, OUTPUT_PATH),
        eval_loss_min=eval_loss_min, eval_data_loader=valid_data_loader,
        clip=CLIP)

    # Weighted scores for the training pass that just finished.
    train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

    # End-of-epoch evaluation on the validation loader.
    eval_y, eval_loss = eval_op(model=pt_model, data_loader=valid_data_loader, loss_fn=loss_fn)
    eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

    # Append this epoch's numbers to the running history.
    epoch_metrics = {
        'train_acc': train_score['acc'],
        'train_loss': train_loss,
        'val_acc': eval_score['acc'],
        'val_loss': eval_loss,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)





Epochs... :   0%|          | 0/10 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

Training... :   0%|          | 0/45 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/5 [00:00<?, ?it/s]

### Prediction

In [None]:
def predict(model, news, tokenizer, max_len=128, batch_size=32):
    """Run inference over ``news`` and return predicted classes and probabilities.

    The inputs are wrapped in an unlabeled data loader via
    ``create_data_loader`` and passed through ``model`` in evaluation mode
    with gradient tracking disabled.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword tensors. Its output is reduced over
            ``dim=1``, so it is assumed to be ``(batch, num_classes)`` scores.
        news: Input texts forwarded to ``create_data_loader``.
        tokenizer: Tokenizer forwarded to ``create_data_loader``.
        max_len: Maximum sequence length forwarded to ``create_data_loader``.
        batch_size: Batch size forwarded to ``create_data_loader``.

    Returns:
        Tuple ``(predictions, prediction_probs)`` of NumPy arrays.
        ``predictions`` holds the index of the highest score per sample and
        ``prediction_probs`` the softmax of the model output over ``dim=1``.
        When the data loader yields no batches, two empty arrays are returned
        (shapes ``(0,)`` and ``(0, 0)``).
    """
    data_loader = create_data_loader(news, None, tokenizer, max_len, batch_size, None)

    # One tensor per batch (already on the CPU) instead of one tensor per
    # sample: avoids iterating device tensors element by element in Python
    # and keeps accelerator memory free of accumulated results.
    batch_predictions = []
    batch_probabilities = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(data_loader, position=0):
            # move tensors to GPU if CUDA is available
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            # compute predicted outputs by passing inputs to the model
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            # index of the largest score along dim 1 is the predicted class
            _, preds = torch.max(outputs, dim=1)

            batch_predictions.append(preds.detach().cpu())
            batch_probabilities.append(F.softmax(outputs, dim=1).detach().cpu())

    # `torch.cat` (like `torch.stack`) rejects an empty list, so handle the
    # no-data case explicitly instead of raising.
    if not batch_predictions:
        return torch.empty(0, dtype=torch.long).numpy(), torch.empty((0, 0)).numpy()

    predictions = torch.cat(batch_predictions).numpy()
    prediction_probs = torch.cat(batch_probabilities).numpy()

    return predictions, prediction_probs

In [None]:
# Run inference on the `news` column of the test split with `pt_model`.
test_news = test_data['news'].to_numpy()
preds, probs = predict(
    model=pt_model,
    news=test_news,
    tokenizer=tokenizer,
    max_len=128,
)

# Show the shapes of the predicted classes and the class probabilities.
print(*(result.shape for result in (preds, probs)))

  0%|          | 0/3 [00:00<?, ?it/s]

(74,) (74, 2)


In [None]:
# Encode the gold labels as their positions in `label_list` so they are
# comparable with the predicted class indices.
y_test = list(map(label_list.index, test_data['label'].values))
y_pred = preds

# Weighted F1 first, then the per-class breakdown.
print(f'F1: {f1_score(y_test, y_pred, average="weighted")}')
print()
print(classification_report(y_test, y_pred, target_names=label_list))

F1: 0.6401544401544401

              precision    recall  f1-score   support

   nonimpact       0.80      0.65      0.71        54
      impact       0.37      0.55      0.44        20

    accuracy                           0.62        74
   macro avg       0.58      0.60      0.58        74
weighted avg       0.68      0.62      0.64        74

