In [1]:

!pip install datasets transformers jsonlines  bitsandbytes  accelerate






In [2]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, GenerationConfig
import transformers
import torch
import pandas as pd
import numpy as np
from pprint import pprint
import itertools
import jsonlines


In [3]:
model = "tiiuae/falcon-7b-instruct"

In [4]:
dataset_train = load_dataset("craigslist_bargains",split="train")
dataset_train

Dataset({
    features: ['agent_info', 'agent_turn', 'dialogue_acts', 'utterance', 'items'],
    num_rows: 5247
})

In [5]:
role = np.array([])
for i in range(5247):
    role = np.append(role,dataset_train[i]['agent_info']['Role'])
unique = np.unique(role)
unique

array(['buyer', 'seller'], dtype='<U32')

In [6]:
dataset_train[1]

{'agent_info': {'Bottomline': ['None', 'None'],
  'Role': ['buyer', 'seller'],
  'Target': [120.0, 200.0]},
 'agent_turn': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1],
 'dialogue_acts': {'intent': ['intro',
   'unknown',
   'inquiry',
   'disagree',
   'init-price',
   'vague-price',
   'counter-price',
   'counter-price',
   'counter-price',
   'agree',
   'agree',
   'offer',
   'accept'],
  'price': [-1.0,
   -1.0,
   -1.0,
   -1.0,
   100.0,
   -1.0,
   120.0,
   150.0,
   145.0,
   -1.0,
   -1.0,
   145.0,
   -1.0]},
 'utterance': ['Hi,I am interested in your board!',
  'Hi, do you have any questions?',
  'Do you know what wood it is made out of?',
  'I do not know specifically but the brand is a gravity skateboard. ',
  'The wheels seem nice on it, but they could be a beter quality. Would you accept 100 for the board?',
  "No, that offer is too low. The board is pretty much brand new as it's been ridden only 4 or 5 times.  The bone bearings are brand new.",
  'If I picked it up at yo

In [7]:
def build_prompt_template(data):
    # Extract relevant information from the data
    agent_info = data["agent_info"]
    agent_turn = data["agent_turn"]
    dialogue_acts = data["dialogue_acts"]
    items = data["items"]
    utterance = data["utterance"]

    # reoving image from items
    items.pop('Images')

    # adding target atribut to items
    items['target'] = agent_info['Target']

    # Initialize the prompt template
    prompt_template = f""" given below is the negotiation between buyer and seller:
    ### item_details: {items},\n
    """


    # Append dialogue between buyer and seller
    for i, turn in enumerate(agent_turn):
        if turn == 0:
            role = "Buyer"
        else:
            role = "Seller"

        # Extract intent and price from dialogue acts
        intent = dialogue_acts["intent"][i]
        price = dialogue_acts["price"][i]

        # Append dialogue to the template
        dialogue_line = f"###{role}: {utterance[i]}\n"
        if intent:
            dialogue_line += f"Intent: {intent}\n"
        if price != -1.0:
            dialogue_line += f"Price: {price}\n"

        prompt_template += dialogue_line

    return prompt_template



In [8]:
example_data = dataset_train[0]

# Build the prompt template
prompt_template = build_prompt_template(example_data)

# Print the formatted prompt template


print(prompt_template)



 given below is the negotiation between buyer and seller:
    ### item_details: {'Category': ['phone', 'phone'], 'Price': [10.0, 10.0], 'Description': ['Charge two devices simultaneously on the go. This vehicle charger with an additional USB port delivers enough power to charge two devices at once. The push-button activated LED connector light means no more fumbling in the dark trying to connect your device. Auto Detect IC Technology automatically detects the device type and its specific charging needs for improved compatibility. And the built-in indicator light illuminates red to let you know the charger is receiving power and the power socket is working properly.', 'Charge two devices simultaneously on the go. This vehicle charger with an additional USB port delivers enough power to charge two devices at once. The push-button activated LED connector light means no more fumbling in the dark trying to connect your device. Auto Detect IC Technology automatically detects the device type 

In [9]:
type(prompt_template)

str

In [10]:
num_examples = len(dataset_train)
processed_dataset_train = []
for i in range(num_examples):
  processed_dataset_train.append({build_prompt_template(dataset_train[i])})


In [11]:
pprint(processed_dataset_train[0])

{' given below is the negotiation between buyer and seller:\n'
 "    ### item_details: {'Category': ['phone', 'phone'], 'Price': [10.0, "
 "10.0], 'Description': ['Charge two devices simultaneously on the go. This "
 'vehicle charger with an additional USB port delivers enough power to charge '
 'two devices at once. The push-button activated LED connector light means no '
 'more fumbling in the dark trying to connect your device. Auto Detect IC '
 'Technology automatically detects the device type and its specific charging '
 'needs for improved compatibility. And the built-in indicator light '
 'illuminates red to let you know the charger is receiving power and the power '
 "socket is working properly.', 'Charge two devices simultaneously on the go. "
 'This vehicle charger with an additional USB port delivers enough power to '
 'charge two devices at once. The push-button activated LED connector light '
 'means no more fumbling in the dark trying to connect your device. Auto '
 'Dete

In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model)


# Define a preprocess function that takes a single example
def preprocess_function(example):
    # Tokenize the example using your tokenizer
    return tokenizer(example, truncation=True, return_tensors="tf")




Downloading (…)okenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

In [13]:

# Iterate through your list and apply the preprocess function to each item
tokenized_data = [preprocess_function([str(item)]) for item in processed_dataset_train]


In [14]:
tokenized_data[0]

{'input_ids': <tf.Tensor: shape=(1, 569), dtype=int32, numpy=
array([[14616,  2132,  2249,   304,   248, 28806,  1192, 12639,   273,
        13702, 24626,    89,   296, 19468,  2154,    74, 32163,    37,
          204, 26175, 15448,  6061,   204,  4482,  4606,  1751,   204,
           18,  4606, 17401,   204,    18, 11668,  6061,   204,    70,
          696,    25,    27,    23,   204,   696,    25,    27,  3055,
          204,    18,  7587,  6061,   204,  4482, 57846,   847,  4398,
        13932,   313,   248,   471,    25,   735,  4627, 25804,   335,
          267,  3149,  9302,  2043, 13490,  1713,  1484,   271,  4189,
          847,  4398,   388,  1960,    25,   390,  4637,    24, 13663,
        17995,  9428, 24907,  1547,  1877,   658,   520,   261, 18628,
          272,   248,  3073,  2296,   271,  1734,   402,  3605,    25,
         9501, 49388, 12871,  6391,  6679, 47003,   248,  3605,  1842,
          273,   701,  2005, 11936,  1850,   312,  6899, 21612,    25,
          899, 

In [None]:


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model.config.use_cache = False

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading (…)figuration_falcon.py:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.



Downloading (…)n/modeling_falcon.py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]