In [5]:
import os

import mlflow
import pandas as pd
import requests
import torch
from dotenv import load_dotenv
from mlflow.models import infer_signature
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

# from tokenizer import Thai_tokenizer
# from langchain_openai import ChatOpenAI
# from langchain_core.messages import HumanMessage

In [41]:
class Typhoon_model:
    """Thin wrapper around a Hugging Face causal language model.

    On construction the tokenizer and model named by ``model_name`` are loaded
    and wrapped in a ``text-generation`` pipeline. The class also offers
    helpers to fine-tune and evaluate the same model with ``Trainer``.

    Attributes:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        num_labels: Stored as given; not used by any method of this class.
            Kept so existing callers that pass it keep working.
        tokenizer: Tokenizer loaded from ``model_name``.
        model: Causal LM loaded from ``model_name``.
        pipeline: ``text-generation`` pipeline built from ``model``/``tokenizer``.
    """

    def __init__(self, model_name="scb10x/llama-3-typhoon-v1.5-8b-instruct", num_labels=3, device=None):
        """Load tokenizer and model, then build the generation pipeline.

        Args:
            model_name: Hugging Face checkpoint to load.
            num_labels: Retained for interface compatibility (unused).
            device: Device handed to ``pipeline``. ``None`` (default) selects
                GPU 0 when CUDA is available and the CPU (``-1``) otherwise,
                so the class can still be constructed on a machine without a GPU.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

        if device is None:
            # The original hard-coded device=0, which fails without CUDA.
            device = 0 if torch.cuda.is_available() else -1
        self.pipeline = pipeline(
            task="text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=device,
        )

    # Alternative remote-API path, currently disabled:
    # def make_request(self, message, url, api_key):
    #     client = ChatOpenAI(base_url='https://api.opentyphoon.ai/v1',
    #                         model='typhoon-instruct',
    #                         api_key=api_key)
    #     resp = client.invoke([HumanMessage(content=message)])
    #     print(resp.content)

    def _ensure_pad_token(self):
        """Give the tokenizer a pad token if it has none (falls back to EOS)."""
        if getattr(self.tokenizer, "pad_token", None) is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def _causal_lm_collator(self):
        """Build the collator that derives ``labels`` from ``input_ids``.

        Without labels the causal LM returns no loss, so ``Trainer`` can
        neither train nor report an evaluation loss.
        """
        self._ensure_pad_token()  # the collator pads batches and needs a pad token
        return DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=False)

    def tokenize_function(self, examples):
        """Tokenize the ``"text"`` column of a batch of examples.

        Args:
            examples: Mapping with a ``"text"`` entry (as passed by
                ``Dataset.map(..., batched=True)``).

        Returns:
            Whatever the tokenizer returns for
            ``padding="max_length", truncation=True``.
        """
        # Padding requires a pad token; make sure one exists before tokenizing.
        self._ensure_pad_token()
        return self.tokenizer(examples["text"], padding="max_length", truncation=True)

    def predict(self, input_text):
        """Run the generation pipeline on ``input_text`` and return its raw output.

        Args:
            input_text: Anything the ``text-generation`` pipeline accepts.

        Returns:
            The unmodified pipeline result.
        """
        # Disabled experiments kept for reference:
        # if prediction['score'] <= 0.8:
        #     prediction['label'] = 'NEUTRAL'
        # prediction = self.make_request(input_text, self.url, self.api_key)
        return self.pipeline(input_text)

    def train(self, train_dataset, test_dataset, output_dir="./results", epochs=3, batch_size=16, learning_rate=1e-5):
        """Fine-tune the model on ``train_dataset``, evaluating every epoch.

        Args:
            train_dataset: Dataset with a ``"text"`` column, exposing ``.map``.
            test_dataset: Dataset with a ``"text"`` column used for evaluation.
            output_dir: Directory handed to ``TrainingArguments``.
            epochs: Number of training epochs.
            batch_size: Per-device batch size for both training and evaluation.
            learning_rate: Optimizer learning rate.

        Returns:
            The value returned by ``Trainer.train()``.
        """
        tokenized_train = train_dataset.map(self.tokenize_function, batched=True)
        tokenized_test = test_dataset.map(self.tokenize_function, batched=True)

        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_test,
            tokenizer=self.tokenizer,
            data_collator=self._causal_lm_collator(),
        )

        return trainer.train()

    def evaluate(self, eval_dataset, output_dir="./results", batch_size=16):
        """Evaluate the model on ``eval_dataset`` and return the metrics.

        Args:
            eval_dataset: Dataset with a ``"text"`` column, exposing ``.map``.
            output_dir: Directory handed to ``TrainingArguments``.
            batch_size: Per-device evaluation batch size.

        Returns:
            The metrics returned by ``Trainer.evaluate()``.
        """
        tokenized_eval = eval_dataset.map(self.tokenize_function, batched=True)

        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            per_device_eval_batch_size=batch_size,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            eval_dataset=tokenized_eval,
            tokenizer=self.tokenizer,
            data_collator=self._causal_lm_collator(),
        )

        return trainer.evaluate()

In [42]:

# Instantiate the wrapper with its default checkpoint; the constructor loads
# the tokenizer and model weights and builds the text-generation pipeline.
typhoon = Typhoon_model()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.66it/s]


In [69]:
# Chat-style prompt: a system message that constrains the answer format and a
# user message carrying the text whose sentiment should be judged.
messages=[
        {
            "role": "system",
            "content": 
                """
                 You are a good assistant named Typhoon.
                 Your answer can only be 'good' or 'bad' or 'neutral'.
                 You will choose between three answer based on the sentiment of the input from user.
                 You will also give the confidence score of your answer in the format of 'confident score : [score]'.
                 The confident score will be between 0 and 1. 
                 1 being the most confident and 0 being the least confident.
                """
        },
        {
            "role": "user",
            "content": "#แพทย์จุฬา #คณะแพทย์ #แพทยศาสตร์ #แพทย์อินเตอร์ #จุฬา #cumedi #medical #medicine #chula #medchula #chulalongkorn. Image. 9:51 AM · Jul 8, 2020.",
        }
    ]
# 'generated_text' holds the conversation with the model's reply appended, so
# read the last entry instead of a hard-coded index that only matches a
# two-message prompt.
print(typhoon.pipeline(messages)[0]['generated_text'][-1]['content'])

# Alternative system prompt (commented out; kept for reference):
# You are a good assistant named Typhoon.
#                 Your answer can only be in the format of {'good':[score],'bad':[score],'neutral':[score]}.
#                 You will assign the score of each topic based on the sentiment of the input.
#                 The score will be between 0 and 1. 1 being the most confident and 0 being least confident.

neutral. confident score : 0.75.
