In [2]:
import argparse
import cherrypy
import json
import requests
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

In [6]:
import cherrypy
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

class ResponseServer:

    # Initializing the model mentioned if not already on the GPU.
    # Clear the GPU if model shown is not already on the GPU.
    def __init__(self):
        
        # note, if the same mod
        model_identifier = "tiiuae/falcon-rw-1b"
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_identifier)
        
        # the first time you call from_pretrained(). Hugging Face's Transformers library checks if the model is already in the cache.
        # Note, Jupyter uses different kernels for different notebooks, so transformers may not be able to check the cache.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_identifier, 
            device_map="auto", # map device to GPU, then CPU, then RAM
            torch_dtype=torch.bfloat16, # use half precision rather than 32bit
            trust_remote_code=True
        )

    @cherrypy.expose
    @cherrypy.tools.json_out()
    @cherrypy.tools.json_in()
    def api(self):
        try:
            data = cherrypy.request.json
            prompt = data.get("prompt")
            response_text = self.generate_response(prompt)
            return {"response": response_text}
        except Exception as e:
            cherrypy.log(f"Error: {str(e)}", severity=40)
            cherrypy.response.status = 500
            return {"message": "Internal Server Error"}

    def generate_response(self, prompt):
        inputs = self.tokenizer.encode(
            prompt, 
            return_tensors='pt',
            add_special_tokens=True
        )
        
        # set attention mask to improve the reliability of the results
        # attention_mask = self.tokenizer.get_attention_mask_for_batch(
        #    [inputs],
        #    # fill the inputs to a uniform length when batching sequences together.
        #    # makes compatible with models that expect inputs of a consistent length
        #    self.model.config.pad_token_id
        #)
        
        # Utilize GPU, otherwise failback error.
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available. Please check your GPU settings.")
            self.model.to("cuda")
        
        # Using model.generate as opposed to transformers.pipeline() for more fine-grained control        
        outputs = self.model.generate(
            inputs,
            max_length=300,
            do_sample=True,  # Enables sampling, essential for top_k and top_p to work
            top_k=50,        # The number of highest probability vocabulary tokens to keep for top-k-filtering
            top_p=0.95,      # Nucleus sampling: keep the top tokens with cumulative probability >= top_p
            temperature=0.7  # Controls randomness. Lower value means less random
        )
        
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
    
if __name__ == '__main__':
    cherrypy.config.update({'server.socket_port': 8080})
    cherrypy.quickstart(ResponseServer())

[30/Nov/2023:20:06:39] ENGINE Listening for SIGTERM.
[30/Nov/2023:20:06:40] ENGINE Listening for SIGHUP.
[30/Nov/2023:20:06:40] ENGINE Listening for SIGUSR1.
[30/Nov/2023:20:06:40] ENGINE Bus STARTING
[30/Nov/2023:20:06:40] ENGINE Started monitor thread 'Autoreloader'.
[30/Nov/2023:20:06:40] ENGINE Serving on http://127.0.0.1:8080
[30/Nov/2023:20:06:40] ENGINE Bus STARTED
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
2023-11-30 20:09:32.345426: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


127.0.0.1 - - [30/Nov/2023:20:11:13] "POST /api HTTP/1.1" 200 1326 "" "curl/7.81.0"


[30/Nov/2023:21:28:37] ENGINE Keyboard Interrupt: shutting down bus
[30/Nov/2023:21:28:37] ENGINE Bus STOPPING
[30/Nov/2023:21:28:37] ENGINE HTTP Server cherrypy._cpwsgi_server.CPWSGIServer(('127.0.0.1', 8080)) shut down
[30/Nov/2023:21:28:37] ENGINE Stopped thread 'Autoreloader'.
[30/Nov/2023:21:28:37] ENGINE Bus STOPPED
[30/Nov/2023:21:28:37] ENGINE Bus EXITING
[30/Nov/2023:21:28:37] ENGINE Bus EXITED
[30/Nov/2023:21:28:37] ENGINE Waiting for child threads to terminate...


In [11]:
import sys
sys.path.append("./utils/")
from clear_gpu import clear_gpu_memory

clear_gpu_memory()