srvrest.py
import base64
import os
import time
from typing import Union
from dotenv import load_dotenv
from fastapi import FastAPI
from pydantic import BaseModel, validator
from transformers import AutoModelForCausalLM, pipeline
import uvicorn
# init the RESTful app, load .env
app = FastAPI()
load_dotenv()
# constants
SRVREST_CUDA_DEV = int(os.environ["SRVREST_CUDA_DEV"])
SRVREST_MODEL = int(os.environ["SRVREST_MODEL"])
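
# A minimal example .env for this server (values are illustrative, not
# taken from the repo):
#
#   SRVREST_CUDA_DEV=-1   # -1 runs on CPU; 0, 1, ... selects a CUDA device
#   SRVREST_MODEL=1       # index into model_to_load below
#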
# global declarations
model_to_load = [
"gpt2",
"EleutherAI/gpt-neo-125M",
"EleutherAI/gpt-neo-1.3B",
"EleutherAI/gpt-neo-2.7B", # too memory intensive for my GTX 1070
"EleutherAI/gpt-j-6B",
][SRVREST_MODEL]
gpt_generator = None
model = None
# init model
@app.on_event("startup")
def load_model():
    global gpt_generator, model
    start = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        model_to_load, low_cpu_mem_usage=True
    ).to("cpu" if SRVREST_CUDA_DEV == -1 else "cuda")
    gpt_generator = pipeline(
        # CUDA device n to use (-1 is CPU): device=n
        "text-generation",
        model=model,
        tokenizer=model_to_load,
        device=SRVREST_CUDA_DEV,
    )
    wall = time.time() - start
    print(f"Model load time, wall clock: {wall // 60:1.0f}min {wall % 60:3.1f}s")

def evaluate(args):
    text = (
        args.text.in_64 if isinstance(args.text, Decoded) else args.text.plain
    )
    start = time.time()
    sentences = gpt_generator(
        text,
        do_sample=args.do_sample,
        top_p=args.top_p,
        top_k=args.top_k,
        temperature=args.temperature,
        max_length=args.max_length,
        num_return_sequences=args.num_return_sequences,
    )
    return {
        "model": model_to_load,
        "s_elapsed": f"{time.time() - start:.2f}",
        "sentences": sentences,
    }

class Plain(BaseModel):
    plain: str

class Decoded(BaseModel):
    in_64: str

    @validator("in_64")
    def _(cls, encoded):
        try:
            return base64.b64decode(encoded).decode()
        except Exception as e:
            raise ValueError(f"Bad b64 encoding - {e}")

class Input(BaseModel):
    """
    Partial copy from Docs » Module code » transformers.generation_utils,
    def generate(...) in
    https://huggingface.co/transformers/v4.2.0/_modules/transformers/generation_utils.html

    do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`):
        Whether or not to use sampling; use greedy decoding otherwise.
    top_p (:obj:`float`, `optional`, defaults to 1.0):
        If set to float < 1, only the most probable tokens with
        probabilities that add up to :obj:`top_p` or higher are kept
        for generation.
    top_k (:obj:`int`, `optional`, defaults to 50):
        The number of highest probability vocabulary tokens to keep
        for top-k-filtering.
    temperature (:obj:`float`, `optional`, defaults to 1.0):
        The value used to modulate the next token probabilities.
    max_length (:obj:`int`, `optional`, defaults to 20):
        The maximum length of the sequence to be generated.
    num_return_sequences (:obj:`int`, `optional`, defaults to 1):
        The number of independently computed returned sequences for
        each element in the batch.
    """

    text: Union[Plain, Decoded]
    do_sample: bool = False
    top_p: float = 1.0  # a float in (0, 1]; see the docstring above
    top_k: int = 50
    temperature: float = 1.0
    max_length: int = 20
    num_return_sequences: int = 1
@app.post("/generate")
async def generate(args: Input):
return evaluate(args)

if __name__ == "__main__":
    uvicorn.run("srvrest:app", host="0.0.0.0", port=49151)
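
# Illustrative usage once the server is up (endpoint and port as above;
# prompt values are made up):
#
#   curl -X POST http://localhost:49151/generate \
#     -H "Content-Type: application/json" \
#     -d '{"text": {"plain": "Once upon a time"}, "do_sample": true, "max_length": 40}'
#
# The same prompt via the base64 variant:
#
#   curl -X POST http://localhost:49151/generate \
#     -H "Content-Type: application/json" \
#     -d '{"text": {"in_64": "T25jZSB1cG9uIGEgdGltZQ=="}}'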