In [6]:
!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install bitsandbytes
!pip install accelerate
!pip install auto-gptq
!pip install optimum
!pip install contractions

Collecting optimum
  Downloading optimum-1.14.0.tar.gz (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting coloredlogs
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf
  Downloading protobuf-4.25.0-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.4/294.4 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting humanfriendly>=9.1
  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Build

In [2]:
import torch
import transformers
from torch import cuda, bfloat16

# Select the compute device. All CUDA-specific calls are guarded so the cell
# also runs on a machine without a GPU instead of raising.
if cuda.is_available():
    # Make GPU 0 the current device before querying its index and name
    torch.cuda.set_device(0)
    device = f'cuda:{cuda.current_device()}'
    device_name = torch.cuda.get_device_name()
else:
    device = 'cpu'
    device_name = 'CPU'
print(f"Using device: {device} ({device_name})")



Using device: cuda:0 (NVIDIA RTX A6000)


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch.fx
def initiateModelAndTokenizer(model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ",
                              revision = "main"):
    """Load a causal language model and its tokenizer.

    Args:
        model_name_or_path: Identifier or local path handed to both
            ``from_pretrained`` calls. Defaults to the checkpoint this
            notebook was written for.
        revision: Repository revision to load. The same value is used for the
            model and the tokenizer so the two cannot drift apart.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # NOTE(review): trust_remote_code=True is kept from the original call;
    # confirm it is actually required for this checkpoint.
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                 device_map = "auto",
                                                 trust_remote_code = True,
                                                 revision = revision)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
                                              use_fast = True,
                                              revision = revision)

    return model, tokenizer

# Initialize the model and tokenizer
model, tokenizer = initiateModelAndTokenizer()

Downloading model.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [4]:
import pandas as pd
import numpy as np
import contractions
import re
import json
import os
from json import JSONDecodeError

In [5]:
!ls

 batch1.ipynb   fastbook     requirements.txt	    run.sh
 env.yml        miniconda3  'reviews (1).parquet'


In [6]:
# CONFIGURATION

# Path to the directory containing the script - the reference point for building the other paths
#script_dir = "/content/drive/MyDrive/Colab Notebooks/revbox_local/"
script_dir = ""

# Path to the file with all reviews
# NOTE(review): the `!ls` output in this notebook lists 'reviews (1).parquet' - confirm this file name
reviews_path = "reviews.parquet"

# Number of reviews after which the current results are saved
save_checkpoint = 50

In [11]:
# FUNCTION DEFINITIONS

def extractFeatures(review):
    """Ask the chat model to list the product features mentioned in a review.

    Wraps the review in a Llama-2 style ``[INST]``/``<<SYS>>`` prompt, generates
    up to 512 new tokens and hands the decoded text (prompt included) to
    ``convertResultToListOfDictionaries``.

    Relies on the notebook-level globals ``tokenizer`` and ``model``.

    Args:
        review: Review text that is inserted into the prompt.

    Returns:
        The value returned by ``convertResultToListOfDictionaries``: the parsed
        list on success, or a ``str`` when the answer could not be parsed.
    """

    user_prompt = f"""
    Extract most important product features from provided review.
    List all extracted features.
    Write the response as a Python list of dictionaries with three key-value pairs each.
    First key should be called "category" and value should be feature category.
    Second key should be called "quote" and value should be quote from review.
    Third key should be called "sentiment" and value should be "positive", "negative" or "neutral" depending on sentiment of the quoted text.
    Do not name the dictionary.
    Do not make up features if you will not find anything in review.
    The list may have one or more dictionaries.
    The list can be empty if there are no features mentioned in provided review.

    This is the review:

    {review}
    """

    prompt = f"""
    [INST] <<SYS>>
    You are an objective assistant and your task is to help with extracting important information from product reviews.
    You only analyze information stated in provided text.
    You focus only on product features.
    If you cannot extract any product features then do not make up features which were not mentioned in provided text.
    <</SYS>>
    {user_prompt}[/INST]
    """

    # Put the inputs on the device the model lives on instead of assuming CUDA,
    # so the call also works on CPU or when the model sits on another device.
    input_ids = tokenizer(prompt, return_tensors = "pt").input_ids.to(model.device)

    # Sampling stays enabled, but temperature and top_p are set very low
    output = model.generate(inputs = input_ids,
                            temperature = 0.01,
                            do_sample = True,
                            top_p = 0.01,
                            top_k = 10,
                            max_new_tokens = 512)

    # The decoded sequence still contains the prompt; the parser below cuts
    # everything up to the "[/INST]" marker.
    result = tokenizer.decode(output[0])

    result_list = convertResultToListOfDictionaries(result)

    return result_list



def countReviewsPerProduct(reviews_df = None):
    """Count the reviews of every product.

    Args:
        reviews_df: DataFrame with a ``product_name`` column. When omitted, the
            notebook-level ``reviews`` DataFrame is used, which keeps the
            original zero-argument call working.

    Returns:
        DataFrame with the columns ``product_name`` and ``number_of_reviews``,
        sorted ascending by ``number_of_reviews`` and carrying a fresh
        0..n-1 index.
    """
    if reviews_df is None:
        reviews_df = reviews

    # groupby().size() counts the rows of each group directly, without passing
    # a NumPy callable to agg().
    products = (
        reviews_df.groupby("product_name")
        .size()
        .reset_index(name = "number_of_reviews")
        .sort_values(by = "number_of_reviews")
        .reset_index(drop = True)
    )

    return products



def cleanText(text):
    """Normalise a review text before it is placed in the prompt.

    Args:
        text: Raw review text. Values that are not ``str`` (for example
            ``None`` or NaN coming from a DataFrame column) are treated as an
            empty review.

    Returns:
        The cleaned text, or ``""`` for non-string input.
    """

    # A missing review has nothing to clean; returning "" avoids a TypeError
    # from re.sub when this function is applied to a whole column.
    if not isinstance(text, str):
        return ""

    # Replace every run of dots (a single dot included) with a dot and a space.
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\.+", ". ", text)

    # Collapse repeated spaces
    text = re.sub(" +", " ", text)

    # Remove whitespace from both ends
    text = text.strip()

    # Expand contractions, e.g. you're = you are
    text = contractions.fix(text)

    # Replace double quotes with apostrophes
    text = text.replace("\"", "'")

    return text



def convertResultToListOfDictionaries(result):
    """Extract the bracketed list from a decoded model response and parse it.

    Only the text after the first "[/INST]" marker is inspected. Two spans are
    tried in order:

    1. the text between the first "[" and the next "[" or "]" (the original
       behaviour, so every response that parsed before parses identically);
    2. the text between the first "[" and the last "]", which keeps brackets
       that occur inside quoted review text from truncating the list.

    When nothing parses, the failure is appended to the product's error file
    and a string is returned instead of raising. The error path reads the
    notebook globals ``script_dir``, ``filename`` and ``r`` (review index).

    Args:
        result: Decoded model output, prompt included.

    Returns:
        The list produced by ``json.loads`` on success; otherwise a ``str``
        (the first span that was tried, or ``result`` itself when no span was
        found). Callers tell the two cases apart by type.
    """
    # Keep only the answer that follows the instruction terminator
    segments = result.split("[/INST]")
    answer = segments[1] if len(segments) > 1 else None

    candidates = []
    if answer is not None and "[" in answer:
        narrow = "[" + answer.split("[")[1].split("]")[0].strip() + "]"
        candidates.append(narrow)

        start = answer.find("[")
        end = answer.rfind("]")
        if end > start:
            wide = "[" + answer[start + 1:end].strip() + "]"
            if wide != narrow:
                candidates.append(wide)

    # A trailing comma after the last dictionary is not valid JSON
    candidates = [re.sub(r"\},\]", "}]", candidate) for candidate in candidates]

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    # Nothing parsed: log what was tried so the response can be inspected later
    if candidates:
        failed, label = candidates[0], "JSONDecodeError"
    else:
        failed, label = result, "NoListFound"
    with open(f"{script_dir}errors/{filename}.txt", "a") as f:
        f.write(f"review_index: {r}\n{label}:\n{failed}\n\n\n")

    return failed

In [12]:
# Create the directories for error logs and results.
# exist_ok=True makes this a no-op when they already exist, without a separate existence check.
os.makedirs(script_dir + "errors", exist_ok = True)
os.makedirs(script_dir + "results", exist_ok = True)

# Load all reviews from the file
reviews = pd.read_parquet(reviews_path)

# Check whether the file recording how many reviews were already processed exists;
# if it does not, build it from scratch
if not os.path.exists(script_dir + "products.csv"):
    # Group products by their number of reviews
    products = countReviewsPerProduct()
    products["processed_reviews"] = 0
    products["next_index"] = 0
    products.to_csv(script_dir + "products.csv", index = False)
else:
    products = pd.read_csv(script_dir + "products.csv")

In [None]:
# Process every product, resuming from the progress stored in `products`
import datetime

counter = 0
for p in range(len(products)):

    # Check whether all reviews of this product were already visited.
    # Some reviews may have been skipped because of errors, so the decision is
    # based on the index of the next review to visit (starting at 0) rather
    # than on the number of successfully processed reviews.
    if not (products.loc[p, "next_index"] < products.loc[p, "number_of_reviews"]):
        continue

    product = products.loc[p, "product_name"]

    # Select all reviews of the current product
    product_reviews = reviews.loc[reviews["product_name"] == product, ["review_title", "review_content"]].copy()
    product_reviews = product_reviews.reset_index(drop = True)

    # Clean the review texts
    product_reviews["review_content"] = product_reviews["review_content"].apply(cleanText)

    # NOTE: `filename` and the inner loop variable `r` are also read as globals by
    # convertResultToListOfDictionaries when it logs a parse error - keep these names.
    filename = re.sub(" ", "_", product)

    # Reuse the features already extracted for this product, if a results file exists
    if not os.path.exists(script_dir + "results/" + filename + ".csv"):
        # DataFrame that collects feature categories and review quotes
        features = pd.DataFrame()
    else:
        try:
            features = pd.read_csv(script_dir + "results/" + filename + ".csv")
        except pd.errors.EmptyDataError:
            # A checkpoint saved while `features` was still empty cannot be parsed; start over
            features = pd.DataFrame()

    start_index = int(products.loc[p, "next_index"])
    processed_reviews = products.loc[p, "processed_reviews"]

    # For every review that has not been visited yet
    for r in range(start_index, len(product_reviews)):
        review = product_reviews.loc[r, "review_content"]
        now = datetime.datetime.now()
        print(counter, now,  review)
        counter = counter + 1

        # Run the feature extraction and append its output to the collected table
        try:
            features_list = extractFeatures(review)
            # A string instead of a list means the answer could not be converted;
            # the error is already logged, so move on to the next review
            if isinstance(features_list, str):
                continue
            features_tmp = pd.DataFrame(features_list)
            features = pd.concat([features, features_tmp])
        except Exception as e:
            error = type(e).__name__ + ":\n" + str(e)
            with open(f"{script_dir}errors/{filename}.txt", "a") as f:
                f.write(f"review_index: {r}\n{error}\n\n\n")
            continue

        # Save the current results whenever the configured checkpoint is reached
        processed_reviews = processed_reviews + 1
        if processed_reviews % save_checkpoint == 0:
            features.to_csv(script_dir + "results/" + filename + ".csv", index = False)
            products.loc[p, "processed_reviews"] = processed_reviews
            products.loc[p, "next_index"] = r + 1
            products.to_csv(script_dir + "products.csv", index = False)

    features = features.reset_index(drop = True)

    # Lower-case the labels. The columns do not exist when no feature was
    # extracted for this product, which used to raise KeyError here.
    for column in ("category", "sentiment"):
        if column in features.columns:
            features[column] = features[column].str.lower()

    # Save the complete results for this product
    features.to_csv(script_dir + "results/" + filename + ".csv", index = False)
    products.loc[p, "processed_reviews"] = processed_reviews
    products.loc[p, "next_index"] = len(product_reviews)
    products.to_csv(script_dir + "products.csv", index = False)

0 2023-11-07 14:38:53.250400 I bought this phone as a replacement to Samsung. Since I am using it is been a night mare to keep it connected to cellular service. It keeps losing cellular connection with an E symbol. I need to reboot this phone every time to keep it connected to LTE service. I can not return this phone now or Helpdesk is able to help me to resolve this issue.
1 2023-11-07 14:38:57.664676 after the phone updates it informs you that it is now compatible with verizons network. Had to return. It worked great before the update though.
2 2023-11-07 14:39:03.779616 GREAT PHONE! Does everything & anything YOU want to in a cell phone. great value & a great batttery-lasts all day & night. better value than Iphone 13. and much. much. cheaper!!!
3 2023-11-07 14:39:14.700524 It is Great! Holds a charge for a couple of days. It is quick. My son's really likes it. Great price too.
4 2023-11-07 14:39:20.255604 I am enjoying my phone , it is working smooth and it is fast when I load the 