## Installing Required Tools

In [None]:
!pip install datasets
!pip install transformers --upgrade




## Mounting drive, where model and datasets are placed


In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the dataset CSVs are read from
# "/content/drive/My Drive/NepBERTa/" further down.
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Loading the NepBERTa Model**

Since the model is not in json format itself, we need to have certain files:

1. **config.json**: This file contains the model configuration.
2. **vocab.txt**: The vocabulary file containing the mapping between tokens and their IDs.
3. **tf_model.h5**: The model's weights, saved in TensorFlow format (`.h5`).

*Note: if the checkpoint was exported from PyTorch, the weights file is `pytorch_model.bin` instead.*

We set the model path to the directory containing all three files (or use the Hub id `NepBERTa/NepBERTa`, as the next cell does) so that the model can be loaded with `AutoModelForSequenceClassification.from_pretrained(..., from_tf=True)`.

In [None]:
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Tokenizer and weights both come from the "NepBERTa/NepBERTa" checkpoint;
# from_tf=True converts the TensorFlow weights into the PyTorch model class.
tokenizer = AutoTokenizer.from_pretrained("NepBERTa/NepBERTa")
model = AutoModelForSequenceClassification.from_pretrained(
    "NepBERTa/NepBERTa",
    from_tf=True,
)
model = model.to(device)

# High-level helper that wraps the model and tokenizer loaded above.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# model_dir = "/content/drive/My Drive/NepBERTa/model/"
# model = TFAutoModelForSequenceClassification.from_pretrained(model_dir)
# tokenizer = BertTokenizer.from_pretrained(model_dir)

cuda


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
import pandas as pd

# Project folder on the mounted Drive; both CSV splits live directly inside it.
main_path = "/content/drive/My Drive/NepBERTa/"

df_train = pd.read_csv(main_path + "train.csv")
df_test = pd.read_csv(main_path + "test.csv")

In [None]:
# Display the training split (pandas renders a truncated head/tail view).
df_train

Unnamed: 0,text,label
0,चीनले दक्षिण एसियाली मुलुकहरु पाकिस्तान अफगानि...,2
1,हजुर यो कुरा देश सबै जनता ले सुननु परछ अब हुन...,1
2,अब भने कोभिड समस्या हल हुने भयो नेपाल सरकार को...,1
3,रबि जि तपाईं ले गरेको काम देखदा जो कोहिलाइ पनि...,1
4,पुरुष भनेर ठाडो शिर बनाएर हिड्नु नि गार्हो,0
...,...,...
35014,वायु प्रदूषणले संक्रमणको थप जोखिम कोभिड बाट मृ...,0
35015,गण्डकी प्रदेशका प्रहरी प्रमुख डीआइजीमा कोभिड स...,0
35016,"आउलास भोट माग्न भ्रस्टचारी , 34:31 बाट भ्रस्टच...",2
35017,हाम्रो नेपाली समाजमा /संबिधानमा यस्तो कानुन आउ...,2


In [None]:
# Display the test split (pandas renders a truncated head/tail view).
df_test


Unnamed: 0,text,label
0,अपाङ्गता भएका बालबालिकालाई कोभिड पछि वैकल्पिक ...,0
1,सूरदास के इस भजन में विश्व के बर्तमान स्थिति ज...,0
2,हन चसमा कहिले पनि देखेको,2
3,सन्दर्भ कोभिड् कोरोना,1
4,नेपाल प्रहरीको मुख्यालय प्रहरी प्रधान कार्यालय...,0
...,...,...
8750,ईद उल फितर पर्वले शान्ति र एकता कायम गर्दै कोभ...,1
8751,कोभिड बारे सूचना सामग्री समेटिएको पुस्तक उद्यो...,2
8752,हिन्दु महिलाहरूको महान चाड हरितालिका तीजको उपल...,1
8753,भान्सा कि भान्छा?,2


We can see that the training dataset has 35,019 instances and the test dataset has 8,755 instances. Each row has a `label` column that describes the sentiment of the given text.

label:
0. Negative
1. Positive
2. Neutral


In [None]:
# We have another dataset from kaggle, let's load the data and look into it.

# df_kag = pd.read_csv("/content/drive/My Drive/NepBERTa/mergeData.csv")
# df_kag


In [None]:
# Column dtypes of both splits, shown side by side as a tuple.
df_train.dtypes, df_test.dtypes

(text     object
 label     int64
 dtype: object,
 text     object
 label     int64
 dtype: object)

In [None]:
# Before mapping to integer format, check which label values occur in the
# training split and how often (this also reveals any stray label values).
df_train['label'].value_counts()


1    15333
0    13889
2     5797
Name: label, dtype: int64

In [None]:
# Same label-frequency check for the test split.
df_test['label'].value_counts()


1    3813
0    3506
2    1436
Name: label, dtype: int64

We need to remove any rows whose label is not one of the three valid classes (stray values such as `-`, `o` or `20`) and make sure the remaining labels are stored as integers.

In [None]:
# Labels that correspond to a real sentiment class
# (0 = Negative, 1 = Positive, 2 = Neutral).
VALID_LABELS = [0, 1, 2]


def keep_valid_labels(df):
    """Return a copy of ``df`` containing only rows with a valid sentiment label.

    Works whether the ``label`` column holds integers or strings: values are
    parsed as numbers first, so junk such as '-', '--' or 'o' becomes NaN and
    out-of-range values such as '20' or '11' fail the membership check.
    Comparing an integer column against those strings directly never matches,
    which silently leaves the frame unchanged.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame (original index preserved) whose ``label`` column is
        int64 and contains only values from ``VALID_LABELS``.
    """
    numeric_labels = pd.to_numeric(df['label'], errors='coerce')
    is_valid = numeric_labels.isin(VALID_LABELS)
    cleaned = df.loc[is_valid].copy()
    cleaned['label'] = numeric_labels[is_valid].astype('int64')
    return cleaned


# For Train
df_train = keep_valid_labels(df_train)

# For Test
df_test = keep_valid_labels(df_test)

In [None]:
# Re-check the training label distribution after the cleanup step.
df_train['label'].value_counts()


1    15333
0    13889
2     5797
Name: label, dtype: int64

In [None]:
# Re-check the test label distribution after the cleanup step.
df_test['label'].value_counts()


1    3813
0    3506
2    1436
Name: label, dtype: int64

In [None]:
# # since the labels are also in object format, let's change them into integers

# label_mapping = {'0':0, '1':1, '2':2}

# df_train['label'] = df_train['label'].map(label_mapping)
# df_test['label'] = df_test['label'].map(label_mapping)

# df_train.dtypes, df_test.dtypes

In [None]:
# Number of missing values per column in each split.
df_train.isna().sum(), df_test.isna().sum()

(text     2
 label    0
 dtype: int64,
 text     0
 label    0
 dtype: int64)

In [None]:
# Discard every row that has a missing value in any column, then confirm
# that no missing values remain in either split.
df_train, df_test = (frame.dropna() for frame in (df_train, df_test))
df_train.isna().sum(), df_test.isna().sum()

(text     0
 label    0
 dtype: int64,
 text     0
 label    0
 dtype: int64)

In [None]:
# First rows of the cleaned training split.
df_train.head()

Unnamed: 0,text,label
0,चीनले दक्षिण एसियाली मुलुकहरु पाकिस्तान अफगानि...,2
1,हजुर यो कुरा देश सबै जनता ले सुननु परछ अब हुन...,1
2,अब भने कोभिड समस्या हल हुने भयो नेपाल सरकार को...,1
3,रबि जि तपाईं ले गरेको काम देखदा जो कोहिलाइ पनि...,1
4,पुरुष भनेर ठाडो शिर बनाएर हिड्नु नि गार्हो,0


In [None]:
# Five randomly chosen rows of the cleaned test split (no seed is set, so the
# rows differ between runs).
df_test.sample(5)

Unnamed: 0,text,label
1734,कोभिड र राहत,1
7504,अझ नियन्त्रण बाहिर कोभिड,0
2163,कोरोना भाइरस समुदायमा कोभिड सङ्क्रमण नभएको न्य...,1
2868,लकडाउन कोभिड अनलाइन भिडियो पुम्से च्याम्पियनसि...,2
3492,नेपालमा कोभिड का संक्रमितको संख्या पुगेको छ शन...,0


In [None]:
from datasets import Dataset, DatasetDict

# preserve_index=False keeps the pandas index out of the dataset. Without it,
# a frame whose index is no longer the default range (e.g. after dropna)
# gains an extra `__index_level_0__` column, so the two splits end up with
# different features.
df_trainset = Dataset.from_pandas(df_train, preserve_index=False)
test_dataset = Dataset.from_pandas(df_test, preserve_index=False)

# Bundle both splits so they can be tokenized and processed together.
final_dataset = DatasetDict({
    'train': df_trainset,
    'test': test_dataset,
})

In [None]:
def tokenize_texts(batch):
    """Tokenize the ``text`` field of ``batch`` with the notebook's tokenizer.

    Every text is truncated and padded to exactly 128 tokens, and the encoding
    is requested as PyTorch tensors.

    Args:
        batch: mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        The tokenizer's encoding for the batch.
    """
    encode_options = dict(
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt',
    )
    return tokenizer(batch['text'], **encode_options)


In [None]:
# inputs_train = tokenize_texts(df_train['text'].tolist(), tokenizer, max_length=128)
# labels_train = df_train['label'].tolist()

# Tokenize both splits. batched=True hands tokenize_texts many rows at once;
# batch_size=None presumably passes each split as a single batch — see the
# datasets `map` documentation to confirm.
encoded_data = final_dataset.map(tokenize_texts, batched=True, batch_size=None)

Map:   0%|          | 0/35017 [00:00<?, ? examples/s]

Map:   0%|          | 0/8755 [00:00<?, ? examples/s]

In [None]:
# Inspect the splits: column names and row counts after tokenization.
encoded_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 35017
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8755
    })
})

In [None]:
def extract_hidden_states(batch):
    """Return the final-layer [CLS] embedding for every example in ``batch``.

    The previous version read ``.logits`` from the classification model and
    sliced ``[:, 0]``, which yields the logit of class 0 rather than a hidden
    state. This version runs only the underlying encoder and takes the vector
    at sequence position 0 of its last hidden state.

    Args:
        batch: mapping of column name to tensor; only the entries listed in
            ``tokenizer.model_input_names`` are forwarded to the model, and
            each must support ``.to(device)``.

    Returns:
        ``{"hidden_state": array}``, where ``array`` is a NumPy array with one
        row per example.
    """
    # Keep only the inputs the model understands and move them to its device.
    inputs = {
        name: tensor.to(device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }
    # Inference only: no gradients are needed.
    with torch.no_grad():
        # base_model is the encoder without the classification head; its
        # output exposes last_hidden_state, which the head's output does not.
        last_hidden_state = model.base_model(**inputs).last_hidden_state
    # Position 0 holds the [CLS] token for BERT-style tokenizers.
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

In [None]:
# Have the dataset return PyTorch tensors, and only for these three columns
# (note that token_type_ids is not included).
encoded_data.set_format("torch",columns=['input_ids', "attention_mask", "label"])

In [None]:
text = "असाध्यै राम्रो कार्यक्रम आयोजना गरिएको छ."

# Tokenize the text
inputs = tokenizer(text, return_tensors="pt").to(device)

# Forward pass through the model. Gradients are not needed for inspection,
# and hidden states must be requested explicitly because a sequence
# classification model only returns logits by default.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# Access the last hidden states: hidden_states[-1] is the final encoder
# layer's output (previously this variable was assigned the logits).
last_hidden_states = outputs.hidden_states[-1]

In [None]:
# Run extract_hidden_states over both splits in batches; this adds the
# "hidden_state" column that the function returns.
encoded_data = encoded_data.map(extract_hidden_states, batched=True)


Map:   0%|          | 0/35017 [00:00<?, ? examples/s]

Map:   0%|          | 0/8755 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# One output per sentiment class (labels 0, 1 and 2).
num_labels = 3

# Reload the checkpoint with a 3-way classification head and move it to the
# selected device; this replaces the model loaded earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    "NepBERTa/NepBERTa",
    num_labels=num_labels,
    from_tf=True,
)
model = model.to(device)

All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


In [None]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
# %pip install "accelerate>=0.16.0,<1" "transformers[torch]>=4.28.1,<5" "torch>=1.13.1,<2"
# !pip install transformers[torch]
# !pip install accelerate -U

In [None]:
from transformers import Trainer, TrainingArguments
batch_size = 64
# Log the training loss roughly once per epoch. logging_steps counts optimizer
# steps, not samples, so it has to be the number of batches per epoch. Using
# the raw sample count meant the threshold was never reached and the
# "Training Loss" column only ever showed "No log".
logging_steps = max(1, len(encoded_data["train"]) // batch_size)
# NOTE(review): model_name is not passed to TrainingArguments, so checkpoints
# go to output_dir below. Confirm which location is intended.
model_name = main_path+"/model"
training_args = TrainingArguments(output_dir='/content/drive/MyDrive',
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error")

In [None]:
# Wire the model, training arguments, both splits, the tokenizer and the
# metric function into a Trainer, then fine-tune. The trailing semicolon
# suppresses the returned value's repr.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data["train"],
    eval_dataset=encoded_data["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train();



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.629151,0.74072,0.730498
2,No log,0.657732,0.741291,0.716607
3,No log,0.641455,0.757396,0.751393
4,No log,0.682866,0.756596,0.752537
5,No log,0.716106,0.75831,0.754023


In [None]:
# Run the fine-tuned model over the test split; the returned object exposes
# `.metrics`, which is displayed in the following cell.
preds_output = trainer.predict(encoded_data["test"])

In [None]:
# Loss, accuracy, F1 and runtime statistics for the test split.
preds_output.metrics

{'test_loss': 0.7161063551902771,
 'test_accuracy': 0.7583095374071959,
 'test_f1': 0.7540227106876441,
 'test_runtime': 64.1069,
 'test_samples_per_second': 136.569,
 'test_steps_per_second': 2.137}

In [None]:
# Save the fine-tuned model to /content/final.
# NOTE(review): this path is outside the mounted Drive folder
# (/content/drive). Confirm the model does not need to be persisted to Drive.
trainer.save_model("/content/final")

In [None]:
# Sentence to classify with the fine-tuned model.
custom_text = "असाध्यै राम्रो कार्यक्रम आयोजना गरिएको छ"
# Build a text-classification pipeline from the directory written by
# trainer.save_model and run it on the sentence.
pipe = pipeline("text-classification", model='/content/final')
preds = pipe(custom_text)

In [None]:
# Predicted label and score; LABEL_<i> refers to class id i
# (0 = Negative, 1 = Positive, 2 = Neutral).
preds

[{'label': 'LABEL_1', 'score': 0.984423816204071}]

## Using other models


In [None]:
# import nltk
# import pandas as pd
# import numpy as np
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# from sklearn.model_selection import train_test_split
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset
# from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# stop_words = set(stopwords.words('nepali'))
# lemmatizer = WordNetLemmatizer()

In [None]:
# data = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# def preprocess_text(text):
#     tokens = word_tokenize(text)
#     tokens = [lemmatizer.lemmatize(token) for token in tokens if token.lower() not in stop_words]
#     return ' '.join(tokens)

# data['text'] = data['text'].apply(preprocess_text)

In [None]:
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
# tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'])

# X = tfidf_matrix.toarray()
# y = data['label'].values

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
# class SentimentLSTM(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, output_size):
#         super(SentimentLSTM, self).__init__()
#         self.embedding = nn.Embedding(input_size, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True, bidirectional=False)
#         self.fc = nn.Linear(hidden_size, output_size)  # hidden_size*2 for bidirectional

#     def forward(self, x):
#         x = self.embedding(x)
#         out, _ = self.lstm(x)
#         out = self.fc(out[:, -1, :])  # Take the last time step's output
#         return out

# input_size = X_train.shape[1]
# hidden_size = 128
# num_layers = 2
# output_size = 1

# model = SentimentLSTM(input_size, hidden_size, num_layers, output_size)
# criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy with logits
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# num_epochs = 10
# for epoch in range(num_epochs):
#     model.train()
#     for inputs, labels in train_loader:
#         optimizer.zero_grad()

#         # Convert inputs to LongTensor (integer data type) for the Embedding layer
#         inputs = inputs.long()

#         outputs = model(inputs)
#         loss = criterion(outputs.squeeze(), labels)
#         loss.backward()
#         optimizer.step()
#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
