## Load Data

#### Load Data with Hugging Face Datasets Library

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from datasets import Dataset

# Download the IMDB review CSV straight from GitHub into a DataFrame.
data = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv')
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Fixed seed so the 70/30 split -- and every metric computed on it -- is the
# same after Restart Kernel -> Run All. Without it the rows are reshuffled on
# every run.
SPLIT_SEED = 42

dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

* The model only accepts numbers, so each example must end up with numeric `input_ids`, `attention_mask`, and `label` fields.

In [4]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
# Integer class ids for the two sentiment strings; the reverse lookup is
# derived from the forward one so the two mappings always agree.
label2id = {'negative': 0, 'positive': 1}
id2label = {class_id: name for name, class_id in label2id.items()}

# Add a numeric 'label' column computed from each row's 'sentiment' string.
dataset = dataset.map(
    lambda example: {'label': label2id[example['sentiment']]}
)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [6]:
dataset['train'][0]

{'review': 'Robert Taylor as the mad buffalo hunter Charlie Gilson is the main character in this film. At the beginning I was thinking that Charlie would end up redeeming himself like John Wayne in The Searchers or James Stewart in The Naked Spur. But as the film goes along Gilson keeps doing more atrocities until you realize there is no hope for him. Stewart Granger is Sandy McKenzie, who wants to stop hunting because he realizes that the buffaloes will soon be gone and he becomes disgusted by the act of killing. Gilson is a natural killer who makes no distinction between animals or human beings. Debra Paget as the Indian girl is a surprising character considering the self imposed censorship of that time. She lies with Gilson in total resignation even though she hates him. The last scene of a frozen Gilson, is unforgettable.',
 'sentiment': 'positive',
 'label': 1}

## Data Tokenization

In [7]:
from transformers import AutoTokenizer
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint id used for the tokenizer here and for the model loaded later.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [8]:
tokenizer(dataset['train'][0]['review'])

def tokenize(batch):
    """Tokenize the 'review' texts of one batch.

    The texts are passed to the module-level ``tokenizer`` with padding
    enabled and truncation to at most 300 tokens. The tokenizer's output is
    returned unchanged.
    """
    return tokenizer(
        batch['review'],
        padding=True,
        truncation=True,
        max_length=300,
    )

dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [9]:
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

### Building Model Evaluation Functions
https://huggingface.co/docs/transformers/v4.42.0/en/tasks/sequence_classification#evaluate

In [10]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predicted class of
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        and the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Model Building

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint for sequence classification with one output per entry
# in label2id; the id/label maps are passed along with it.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration: 3 epochs, batch size 32 for both training and
# evaluation, with an evaluation pass at the end of every epoch.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # `evaluation_strategy` is the deprecated spelling of this argument and is
    # rejected by newer transformers releases; `eval_strategy` is the current
    # name.
    eval_strategy='epoch'
)

# NOTE(review): newer transformers releases rename `tokenizer=` to
# `processing_class=` -- confirm against the installed version before changing.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [13]:
trainer.train()

  0%|          | 0/3282 [00:00<?, ?it/s]

{'loss': 0.457, 'grad_norm': 18.44226837158203, 'learning_rate': 1.695307739183425e-05, 'epoch': 0.46}
{'loss': 0.3493, 'grad_norm': 15.102002143859863, 'learning_rate': 1.3906154783668494e-05, 'epoch': 0.91}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.31516027450561523, 'eval_accuracy': 0.867, 'eval_runtime': 136.9405, 'eval_samples_per_second': 109.537, 'eval_steps_per_second': 3.425, 'epoch': 1.0}
{'loss': 0.3104, 'grad_norm': 7.930070400238037, 'learning_rate': 1.0859232175502743e-05, 'epoch': 1.37}
{'loss': 0.2933, 'grad_norm': 11.948954582214355, 'learning_rate': 7.81230956733699e-06, 'epoch': 1.83}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.2938709557056427, 'eval_accuracy': 0.8782666666666666, 'eval_runtime': 467.6259, 'eval_samples_per_second': 32.077, 'eval_steps_per_second': 1.003, 'epoch': 2.0}
{'loss': 0.2638, 'grad_norm': 13.99096393585205, 'learning_rate': 4.765386959171238e-06, 'epoch': 2.29}
{'loss': 0.2588, 'grad_norm': 8.726390838623047, 'learning_rate': 1.7184643510054846e-06, 'epoch': 2.74}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.292848140001297, 'eval_accuracy': 0.8807333333333334, 'eval_runtime': 254.0921, 'eval_samples_per_second': 59.034, 'eval_steps_per_second': 1.846, 'epoch': 3.0}
{'train_runtime': 6916.4009, 'train_samples_per_second': 15.181, 'train_steps_per_second': 0.475, 'train_loss': 0.31611031457318683, 'epoch': 3.0}


TrainOutput(global_step=3282, training_loss=0.31611031457318683, metrics={'train_runtime': 6916.4009, 'train_samples_per_second': 15.181, 'train_steps_per_second': 0.475, 'total_flos': 882184338000000.0, 'train_loss': 0.31611031457318683, 'epoch': 3.0})

In [14]:
trainer.evaluate()

  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.292848140001297,
 'eval_accuracy': 0.8807333333333334,
 'eval_runtime': 458.7213,
 'eval_samples_per_second': 32.7,
 'eval_steps_per_second': 1.022,
 'epoch': 3.0}

## Model Save and Load for Inference

In [15]:
trainer.save_model('tinybert-sentiment-analysis')

In [16]:
data = ['this movie was horrible, the plot was really boring. acting was okay',
        'the movie is really sucked. there is not plot and acting was bad',
        'what a beautiful movie. great plot. acting was good. will see it again']

In [17]:
from transformers import pipeline
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-classification pipeline from the locally saved model folder.
classifier = pipeline(
    task='text-classification',
    model='tinybert-sentiment-analysis',
    device=device,
)

# Classify the sample reviews; the result is the cell output.
classifier(data)

Device set to use cpu


[{'label': 'negative', 'score': 0.9893927574157715},
 {'label': 'negative', 'score': 0.9893345236778259},
 {'label': 'positive', 'score': 0.9902777075767517}]

## List S3 Buckets and Create an S3 Bucket

In [23]:
import boto3
from botocore.exceptions import ClientError


s3 = boto3.client('s3')

bucket_name = 'ridomldeploy'

In [24]:
response = s3.list_buckets()

In [26]:
def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in the given region.

    Args:
        bucket_name: Name of the bucket to create.
        region: AWS region for the bucket. When omitted, the region the
            boto3 client resolves from the environment is used.

    Returns:
        None. Success or the ``ClientError`` message is printed.
    """
    try:
        # Local client bound to the target region; named differently from the
        # module-level `s3` so it does not shadow it.
        client = boto3.client('s3', region_name=region)
        target_region = region or client.meta.region_name

        if target_region is None or target_region == 'us-east-1':
            # S3 rejects an explicit LocationConstraint of 'us-east-1'; the
            # bucket must be created without a configuration there.
            client.create_bucket(Bucket=bucket_name)
        else:
            client.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': target_region}
            )
        print(f"Bucket '{bucket_name}' basariyla '{target_region}' bölgesinde olusturuldu.")
    except ClientError as e:
        print(f"Hata: {e}")

In [27]:
bucket_name = "ridomldeploy"
region = "eu-central-1"

In [28]:
create_bucket(bucket_name, region)

Bucket 'ridomldeploy' basariyla 'eu-central-1' bölgesinde olusturuldu.


## Push Model to AWS S3

In [29]:
import boto3

s3 = boto3.client('s3')

bucket_name = 'ridomldeploy'

def create_bucket(bucket_name):
    """Create ``bucket_name`` unless this account already lists it.

    Uses the module-level ``s3`` client. Note that this definition replaces
    the earlier ``create_bucket(bucket_name, region)`` in the kernel.

    Args:
        bucket_name: Name of the bucket to look up and, if absent, create.

    Returns:
        None. A message describing the outcome is printed.
    """
    response = s3.list_buckets()
    existing = {bucket['Name'] for bucket in response['Buckets']}
    if bucket_name in existing:
        print("Bucket already exists in your account!!! Feel free to use it.")
        return

    # Outside us-east-1, S3 requires a LocationConstraint that matches the
    # region the client talks to; a bare create_bucket call fails there.
    region = s3.meta.region_name
    if region and region != 'us-east-1':
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region},
        )
    else:
        s3.create_bucket(Bucket=bucket_name)
    print("Bucket is created")

create_bucket(bucket_name)

Bucket already exists in your account!!! Feel free to use it.


#### Upload the model folder to the S3 bucket under the prefix `ml-models/tinybert-sentiment-analysis`

In [30]:
import os
import boto3

s3 = boto3.client('s3')
bucket_name = 'ridomldeploy'

def upload_directory(directory_path, s3_prefix, bucket=None, client=None):
    """Upload every file under ``directory_path`` to S3 below ``s3_prefix``.

    Args:
        directory_path: Local directory to walk recursively.
        s3_prefix: Key prefix; each file's path relative to
            ``directory_path`` is appended to it with '/' separators.
        bucket: Target bucket name. Defaults to the module-level
            ``bucket_name``.
        client: Object with an ``upload_file(path, bucket, key)`` method.
            Defaults to the module-level ``s3`` client.

    Raises:
        NotADirectoryError: If ``directory_path`` is not an existing
            directory. Without this check ``os.walk`` yields nothing and the
            upload silently does nothing.
    """
    if not os.path.isdir(directory_path):
        raise NotADirectoryError(f"Not a directory: {directory_path!r}")

    target_bucket = bucket_name if bucket is None else bucket
    s3_client = s3 if client is None else client

    for root, _dirs, files in os.walk(directory_path):
        for filename in files:
            # Keep the local path in native form; only the S3 key is
            # normalised to forward slashes.
            file_path = os.path.join(root, filename)
            relpath = os.path.relpath(file_path, directory_path)
            s3_key = os.path.join(s3_prefix, relpath).replace("\\", "/")

            s3_client.upload_file(file_path, target_bucket, s3_key)


upload_directory('tinybert-sentiment-analysis', 'ml-models/tinybert-sentiment-analysis')
