# Activate GPU and Install Dependencies

In [1]:
import logging
import warnings

# Ignore every Python warning raised from here on, so cell output stays clean.
# Note that this also hides warnings from the notebook's own code.
warnings.simplefilter(action='ignore')
# Discard log records at WARNING severity and below for all loggers.
logging.disable(logging.WARNING)

In [2]:
!apt-get install -qq git-lfs

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Preprocess data

In [3]:
import pandas as pd

# Read the training and test splits from the local data directory.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# Show the row count of each split.
len(train_df), len(test_df)

(20000, 3156)

In [4]:
# Summary statistics for all columns (not only numeric ones), transposed so
# that each column of the frame becomes one row of the summary.
train_df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Input,20000.0,18756.0,Zo'r,148.0,,,,,,,
label,20000.0,,,,0.5121,0.499866,0.0,0.0,1.0,1.0,1.0


In [5]:
from transformers import AutoTokenizer

# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokz = AutoTokenizer.from_pretrained(
    "rifkat/uztext-3Gb-BPE-Roberta",
)

In [6]:
from datasets import Dataset

# Wrap both pandas frames as `datasets.Dataset` objects.
train_ds, test_ds = (Dataset.from_pandas(frame) for frame in (train_df, test_df))

In [7]:
def preprocess_function(examples):
    """Tokenize the ``Input`` text column of a batch of examples.

    Args:
        examples: mapping passed in by ``Dataset.map``; its ``"Input"`` entry
            holds the raw text(s) to tokenize.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.

    No padding is applied here. With ``padding=True`` inside a batched
    ``map`` every example is padded to the longest sequence of its map chunk
    and stored that way, so the ``DataCollatorWithPadding`` used for training
    has nothing left to pad per mini-batch. Leaving sequences unpadded lets
    the collator pad each training batch only to its own longest sequence.
    """
    return tokz(examples["Input"], truncation=True)

# Tokenize both splits identically: batched, bypassing the cache, and with the
# raw text column removed from the result.
tokenized_train, tokenized_test = (
    split.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=False,
        remove_columns='Input',
    )
    for split in (train_ds, test_ds)
)

# Only the training split is switched to torch output, restricted to the
# model inputs and the label.
columns_to_return = ['input_ids', 'label', 'attention_mask']
tokenized_train.set_format(columns=columns_to_return, type='torch')

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [8]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch using the notebook's tokenizer.
data_collator = DataCollatorWithPadding(
    tokenizer=tokz,
)

# Training the model

In [9]:
from transformers import AutoModelForSequenceClassification

# Same checkpoint as the tokenizer, loaded with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "rifkat/uztext-3Gb-BPE-Roberta",
    num_labels=2,
)

In [10]:
import numpy as np
from datasets import load_metric
 
def compute_metrics(eval_pred):
    """Compute binary-classification metrics for one evaluation pass.

    The metrics are derived directly from confusion counts with numpy rather
    than by loading four separate metric scripts on every call, which costs a
    cache/network lookup each time and relies on the deprecated
    ``load_metric`` API.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class is the
            arg-max of ``logits`` over the last axis. Labels are expected to
            be 0/1 with class 1 treated as the positive class (the model in
            this notebook is created with ``num_labels=2``).

    Returns:
        dict with float values under the keys ``"Precision"``, ``"Recall"``,
        ``"F1"`` and ``"Accuracy"``. A metric whose denominator is zero
        (e.g. precision when nothing is predicted positive) is reported
        as 0.0.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Confusion counts for the positive class (label 1).
    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))
    n_correct = int(np.sum(predictions == labels))

    def _ratio(numerator, denominator):
        # Guard against empty denominators instead of producing NaN.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(true_pos, true_pos + false_pos)
    recall = _ratio(true_pos, true_pos + false_neg)
    f1 = _ratio(2 * true_pos, 2 * true_pos + false_pos + false_neg)
    accuracy = _ratio(n_correct, len(labels))
    return {"Precision": precision, "Recall": recall, "F1": f1,"Accuracy": accuracy}

In [11]:
# Hold out 10% of the training data for evaluation. The shuffle is seeded so
# the same rows land in the hold-out set on every run; without a seed the
# reported metrics change from run to run.
dataset = tokenized_train.train_test_split(test_size=0.10, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 18000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [14]:
from transformers import TrainingArguments, Trainer

# Used as the local output directory of the training run.
repo_name = "uzroberta-sa"

training_args = TrainingArguments(
    output_dir=repo_name,
    push_to_hub=True,
    # Schedule: four epochs, evaluating at the end of each one.
    num_train_epochs=4,
    evaluation_strategy="epoch",
    # Batching and mixed precision.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    fp16=True,
    # Optimisation: cosine learning-rate schedule with a warm-up phase.
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
)

# Train on the 'train' part of the split and evaluate on its 'test' part.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
    tokenizer=tokz,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop; the trailing semicolon suppresses the cell's
# output repr.
trainer.train();

In [None]:
# Evaluate the trained model and display the resulting metrics as the cell
# output (the Trainer was given dataset['test'] as its eval_dataset).
trainer.evaluate()

# Analyzing new data with the model

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): this needs an authenticated Hub session, and no login cell is
# visible in this notebook - confirm how credentials are provided.
trainer.push_to_hub()

In [16]:
from transformers import pipeline

# Build an inference pipeline from a checkpoint hosted on the Hub.
# NOTE(review): this id differs from the `uzroberta-sa` output directory used
# for training above - presumably the same model under its published name;
# confirm.
sentiment_model = pipeline(
    model="murodbek/uzroberta-sentiment-analysis",
)

# Quick check on two short phrases.
sentiment_model(
    ["Yomon emas", "Daxshat"]
)

Downloading:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

[{'label': 'LABEL_1', 'score': 0.853135347366333},
 {'label': 'LABEL_1', 'score': 0.9993738532066345}]

In [17]:
# Classify every test sentence and store the predicted label string
# ('LABEL_0' / 'LABEL_1') in a new column.
#
# The column is assigned in one step. Writing element by element through
# `test_df['preds'][i] = ...` is chained assignment: it may write to a
# temporary copy instead of the frame (and never reaches the frame under
# pandas copy-on-write), and it also mixes strings into a column that was
# initialised with the integer -1.
test_df['preds'] = [
    result['label']
    for result in sentiment_model(test_df['Input'].tolist())
]

In [18]:
# Test-set accuracy: a prediction counts as correct when "predicted LABEL_1"
# (True/False) equals the 0/1 ground-truth label.
test_df['preds'].eq('LABEL_1').eq(test_df['label']).mean()

0.964828897338403