
This notebook creates the dataset that will later be used to train our model. We generate the dataset by running inference with an 11B-parameter instruction-based model, giving it a specific prompt for each kind of summary we want.

In [None]:
# installing libraries

!pip install transformers
!pip install accelerate
!pip install google-colab
!pip install -q -U bitsandbytes
!pip install torch torch_xla
!pip install accelerate
!pip install bitsandbytes
!pip install transformers_stream_generator einops

In [2]:
# imports

import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

from accelerate import init_empty_weights, load_checkpoint_and_dispatch, Accelerator

import torch
import torch_xla
import torch_xla.core.xla_model as xm

In [None]:
# mount to drive
#
# Mounts Google Drive at /content/drive. The dataset-generation cell further
# down reads its input CSV from, and writes its output CSV to,
# /content/drive/MyDrive/review-summarizer/, so this cell has to run first.

from google.colab import drive
drive.mount('/content/drive')

In [4]:
# 4-bit bitsandbytes quantization settings; this object is handed to
# from_pretrained() when the model is loaded.

quantization_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",             # 4-bit NormalFloat data type
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
    load_in_4bit=True,                     # load the weights in 4-bit precision
)

In [None]:
# Load the tokenizer and the quantized causal LM from the same checkpoint.

# name = "tiiuae/falcon-7b-instruct"
name = "Undi95/Mistral-11B-OmniMix9"

tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(
    name,
    quantization_config=quantization_config,
)

In [None]:
import csv

# Section labels, paired positionally with the entries of `prompts` below.
prompt_types = ["General: ", "Ambience: ", "Food: ", "Service: "]

prompts = ["Please provide a one sentence, very short, objective summary for the restaurant based on the provided reviews. Ensure the summary focuses on different aspects as mentioned in the reviews, avoiding repetition and first-person narratives. Please do not include numerical ratings, but rather focus on qualitative descriptions that clearly convey the reviewers experiences and opinions.",

           "Based on the provided reviews, please give a one sentence, very short summary that exclusively focuses on the ambiance aspect of this restaurant. The summary should be objective and reflect only the reviewers' experiences and opinions about the ambiance quality, without including information about the food, service, or any other factors. Please avoid using first-person narratives or numerical ratings, and concentrate solely on qualitative descriptions of the ambience.",

           "Based on the provided reviews, please give a one sentence, very short summary that exclusively focuses on the food aspect of this restaurant. The summary should be objective and reflect only the reviewers' experiences and opinions about the food quality, without including information about the service, ambiance, or any other factors. Please avoid using first-person narratives or numerical ratings, and concentrate solely on qualitative descriptions of the food.",

           "Based on the provided reviews, please give a one sentence, very short summary that exclusively focuses on the service aspect of this restaurant. The summary should be objective and reflect only the reviewers' experiences and opinions about the service quality, without including information about the food, ambiance, or any other factors. Please avoid using first-person narratives or numerical ratings, and concentrate solely on qualitative descriptions of the service."]

reviews_csv = "/content/drive/MyDrive/review-summarizer/final_reviews.csv"
inferenced_summaries_csv = "/content/drive/MyDrive/review-summarizer/inferenced_summaries.csv"

# Passed to generate() as max_length, which bounds prompt tokens + generated
# tokens together (not the generated part alone).
MAX_TOTAL_TOKENS = 2048


def _generate_completion(input_text):
    """Sample one continuation of `input_text` and return only the new text.

    Args:
        input_text: Full model input (review text followed by the instruction).

    Returns:
        The decoded text of the tokens generated after the input, with special
        tokens removed.
    """
    # Let the tokenizer build the attention mask (integer dtype) rather than
    # hand-crafting a float16 tensor of ones, and place the inputs on whatever
    # device the model lives on instead of hard-coding 'cuda'.
    encoded = tokenizer(input_text, return_tensors='pt').to(model.device)
    input_ids = encoded["input_ids"]

    # NOTE: the model itself is deliberately not moved with model.to('cuda')
    # here -- the original author's note reports not enough free GPU memory
    # for that (156MB / 14.7GB free).

    with torch.no_grad():
        output = model.generate(input_ids,
                                attention_mask=encoded["attention_mask"],
                                max_length=MAX_TOTAL_TOKENS,
                                do_sample=True,
                                top_k=10,
                                num_return_sequences=1,
                                eos_token_id=tokenizer.eos_token_id,
                                # Set explicitly so generate() does not warn
                                # about a missing pad token on every call.
                                pad_token_id=tokenizer.eos_token_id)

    # output[0] holds the input tokens followed by the newly generated ones.
    # Slicing by token count is reliable, whereas searching the decoded string
    # for the prompt text returns -1 (and then a wrong slice) whenever decoding
    # does not reproduce the prompt character-for-character.
    new_tokens = output[0][input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


# newline='' is what the csv module expects for both reading and writing;
# without it, embedded newlines can be mangled and extra blank rows written on
# some platforms.
with open(reviews_csv, 'r', newline='', encoding='utf-8') as reviews_file:
    reader = csv.reader(reviews_file)

    with open(inferenced_summaries_csv, 'w', newline='', encoding='utf-8') as inferenced_summaries_file:
        writer = csv.writer(inferenced_summaries_file)

        # Skip the header row of the input; tolerate a completely empty file.
        header = next(reader, None)

        restaurant_num = 1
        for row in reader:
            # csv.reader yields [] for blank lines; row[0] would raise on those.
            if not row:
                continue

            print(f"iter {restaurant_num}: ")

            # One labelled section per prompt, concatenated into a single string.
            summary = ""
            for label, prompt in zip(prompt_types, prompts):
                summary += label + _generate_completion(row[0] + " " + prompt)

            print(summary + "\n")
            writer.writerow([row[0], summary])
            # Generation is slow; flush after every row so finished summaries
            # are not lost if the session is interrupted.
            inferenced_summaries_file.flush()

            restaurant_num += 1
