In [1]:
# Load the book catalogue and build the id -> title lookup used when
# generating training records. Each line is split on tabs: field 0 is used as
# the title, field 1 as the item id.
# The context manager closes the file handle once the lines are read.
with open('./info/Books_5_2017-10-2018-11.txt', 'r') as f:
    books = f.readlines()
item_names = [line.split('\t')[0] for line in books]
# Strip only the line terminator from the id: slicing off the last character
# unconditionally would truncate the final id when the file does not end with
# a newline.
item_ids = [line.split('\t')[1].rstrip('\n') for line in books]
item_dict = dict(zip(item_ids, item_names))
# id_mapping = dict(zip(item_ids, range(len(item_ids))))

In [None]:
# First entry of item_names (built in the first cell); main() below encodes
# this title with the tokenizer and prints the resulting token ids.
name = item_names[0]

import sys

import fire
import gradio as gr
import torch
torch.set_num_threads(1)
import transformers
import json
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'
from peft import PeftModel
from transformers import GenerationConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

# Select the torch device string used by main(): CUDA when available,
# otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# If the MPS backend reports itself available it overrides the choice above.
# The probe is best-effort: any ordinary error while querying it (e.g. a torch
# build that has no `torch.backends.mps`) leaves `device` unchanged. Catching
# `Exception` rather than using a bare `except` keeps KeyboardInterrupt and
# SystemExit from being silently swallowed.
try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:
    pass

def main(
    load_8bit: bool = False,
    base_model: str = "Qwen/Qwen1.5-0.5B",
    lora_weights: str = "../../Qwen1.5-0.5B/lora-alpaca",
    test_data_path: str = "./test_128.json",
    result_json_data: str = "./result.json",
    batch_size: int=8,
):
    """Load the base model with its LoRA adapter and print token ids for a sample title and prompt.

    Uses the module-level ``device`` and ``name`` values and the
    ``generate_prompt`` helper defined in this file.

    Args:
        load_8bit: Forwarded as ``load_in_8bit`` when loading the base model on
            CUDA. When false, ``model.half()`` is called after loading.
        base_model: Identifier or path given to ``from_pretrained`` for both the
            tokenizer and the base model. Must be truthy.
        lora_weights: Location of the PEFT adapter applied on top of the base
            model.
        test_data_path: Not read anywhere in this function.
        result_json_data: Not read anywhere in this function.
        batch_size: Not read anywhere in this function.

    Raises:
        AssertionError: If ``base_model`` is empty.
    """
    assert base_model, "Please specify a --base_model, e.g. --base_model='decapoda-research/llama-7b-hf'"

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Keyword arguments for the two from_pretrained calls depend on the device;
    # the calls themselves are shared below.
    if device == "cuda":
        base_kwargs = dict(
            load_in_8bit=load_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        adapter_kwargs = dict(torch_dtype=torch.float16, device_map={'': 0})
    elif device == "mps":
        base_kwargs = dict(device_map={"": device}, torch_dtype=torch.float16)
        adapter_kwargs = dict(device_map={"": device}, torch_dtype=torch.float16)
    else:
        base_kwargs = dict(device_map={"": device}, low_cpu_mem_usage=True)
        adapter_kwargs = dict(device_map={"": device})

    model = AutoModelForCausalLM.from_pretrained(base_model, **base_kwargs)
    model = PeftModel.from_pretrained(model, lora_weights, **adapter_kwargs)

    # Pad on the left so prompts in a batch end at the same position.
    tokenizer.padding_side = "left"

    # unwind broken decapoda-research config
    # NOTE(review): the ids 0/1/2 are hard-coded regardless of base_model;
    # confirm they are the intended pad/bos/eos ids for the default Qwen
    # checkpoint.
    model.config.pad_token_id = 0  # unk
    tokenizer.pad_token_id = 0
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    if not load_8bit:
        model.half()  # seems to fix bugs for some users.

    # Token ids of the sample title alone, without special tokens.
    name_ids = [tokenizer.encode(name, add_special_tokens=False)]
    print(name_ids)

    # Token ids of a full recommendation prompt built from a fixed reading history.
    instruction = "Given a list of books the user has read before, please recommend a new book that the user likes to read."
    reading_history = "The user has read the following books before:\"Understanding Elizabeth\", \"Earthly Remains: A Commissario Guido Brunetti Mystery\", \"The Case of the Green-Dressed Ghost (Dr Ribero's Agency of the Supernatural)\", \"Digging In: A Novel\", \"Alone with Mr. Darcy: A Pride & Prejudice Variation\", \"The Crusader's Bride: The Champions of Saint Euphemia Book 1\", \"My Lady Thief\", \"A Quiet Life in the Country (A Lady Hardcastle Mystery)\", \"The Unwanted Heiress (The Archer Family Regency Romances) (Volume 1)\", \"The Irish Inheritance: A Jayne Sinclair Genealogical Mystery\"\n "
    prompt = [generate_prompt(instruction, reading_history)]
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=False,
    )
    inputs = encoded.to(device)
    print(inputs.input_ids)


def generate_prompt(instruction, input=None):
    """Render an instruction-following prompt that ends at the response header.

    Args:
        instruction: Task description placed under ``### Instruction:``.
        input: Optional context placed under ``### Input:``. When it is falsy
            (``None`` or an empty string) the shorter template without an
            input section is returned.

    Returns:
        The prompt text, terminated by ``### Response:`` and a newline.
    """
    # Guard clause: no usable context -> instruction-only template.
    if not input:
        return (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.  \n"
            "\n"
            "### Instruction:\n"
            f"{instruction}\n"
            "\n"
            "### Response:\n"
        )

    return (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. "
        "Write a response that appropriately completes the request.  \n"
        "\n"
        "### Instruction:\n"
        f"{instruction}\n"
        "\n"
        "### Input:\n"
        f"{input}\n"
        "\n"
        "### Response:\n"
    )


# Entry point: hand main() to python-fire so it is driven from the command line.
# NOTE(review): when this cell runs inside a notebook kernel, __name__ is also
# "__main__" and fire will presumably read the kernel's own sys.argv; confirm
# this cell is meant to be executed as a script rather than in the notebook.
if __name__ == "__main__":
    fire.Fire(main)


In [2]:
import json
import pandas as pd
import random
import numpy as np
def csv_to_json(input_path, output_path, sample=False, max_rows=40000, item_lookup=None):
    """Convert an interaction CSV into instruction-tuning records stored as JSON.

    Reads ``input_path`` with pandas, keeps at most ``max_rows`` rows, writes
    that subset as a CSV next to ``output_path`` (same stem, ``.csv``
    extension) and writes one ``{"instruction", "input", "output"}`` record per
    kept row to ``output_path``.

    Args:
        input_path: CSV file with the columns ``history_item_id`` and
            ``history_item_title`` (Python list literals stored as text) and
            ``item_id``.
        output_path: Destination JSON file.
        sample: If true, rows are drawn at random with ``random_state=42``
            instead of taking the leading rows of the file.
        max_rows: Upper bound on the number of exported rows. When sampling
            from a file with fewer rows, every row is used instead of raising.
        item_lookup: Mapping from ``str(item_id)`` to the item title. Defaults
            to the module-level ``item_dict``.

    Raises:
        ValueError, SyntaxError: If a history cell is not a plain Python
            literal.
        KeyError: If a row's ``item_id`` is missing from the lookup.
    """
    # Function-scope imports keep this block self-contained; the enclosing
    # cell only imports json, pandas, random and numpy.
    import ast
    import os

    if item_lookup is None:
        item_lookup = item_dict

    data = pd.read_csv(input_path)
    if sample:
        # Clamp the sample size: DataFrame.sample raises when asked for more
        # rows than exist (without replacement).
        data = data.sample(n=min(max_rows, len(data)), random_state=42).reset_index(drop=True)
    data = data.iloc[:max_rows]

    # Persist the exact subset that the JSON is generated from (written once).
    data.to_csv(os.path.splitext(output_path)[0] + ".csv", index=False)

    json_list = []
    rows = zip(data['history_item_id'], data['history_item_title'], data['item_id'])
    for raw_history_ids, raw_history_titles, item_id in rows:
        # literal_eval parses the stored list literals without executing
        # arbitrary expressions that may be embedded in the CSV.
        history_ids = ast.literal_eval(raw_history_ids)
        history_titles = ast.literal_eval(raw_history_titles)
        # The number of history ids bounds how many titles are listed.
        quoted_titles = ", ".join(
            "\"" + title + "\"" for title in history_titles[:len(history_ids)]
        )
        history = "The user has read the following books before:" + quoted_titles
        target_item_name = "\"" + item_lookup[str(item_id)] + "\""
        json_list.append({
            "instruction": "Given a list of books the user has read before, please recommend a new book that the user likes to read.",
            "input": f"{history}\n ",
            "output": target_item_name,
        })

    with open(output_path, 'w') as f:
        json.dump(json_list, f, indent=4)


In [None]:
# csv_to_json('./train/Books_5_2017-10-2018-11.csv', './train.json')
# csv_to_json('./valid/Books_5_2017-10-2018-11.csv', './valid.json')
# csv_to_json('./test/Books_5_2017-10-2018-11.csv', './test.json')
# csv_to_json('./valid/Books_5_2017-10-2018-11.csv', './valid_5000.json', sample=True)
# csv_to_json('./test/Books_5_2017-10-2018-11.csv', './test_5000.json', sample=True)

In [None]:
# csv_to_json('./train/Books_5_2017-10-2018-11.csv', './train_1024.json', sample=True)
# csv_to_json('./valid/Books_5_2017-10-2018-11.csv', './valid_128.json', sample=True)
# csv_to_json('./test/Books_5_2017-10-2018-11.csv', './test_128.json', sample=True)

In [3]:
# Build the training JSON from the training CSV without sampling, so at most
# the first 40000 rows of the file are exported.
csv_to_json('./train/Books_5_2017-10-2018-11.csv', './train_40000.json')