Referenced: https://www.philschmid.de/fine-tune-a-non-english-gpt-2-model-with-huggingface



In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead, AutoTokenizer
import accelerate
print(accelerate.__version__)

# !pip install transformers[torch] accelerate -U

Using device: cuda


  from .autonotebook import tqdm as notebook_tqdm


0.29.2


In [4]:

import pandas as pd
from sklearn.model_selection import train_test_split

# load dataset
df = pd.read_csv('entire_sephora_data.csv')
print("shape: ", df.shape)

# filter out incentivized reviews
df = df[df['incentivizedReview'] == False]

# combine inputs into a single str and prep outputs
df['inputs'] = df.apply(lambda row: f"Skin Type: {row['skinType']} | Skin Tone: {row['skinTone']} | Verified Purchaser: {row['verifiedPurchaser']} | Positive Feedback: {row['TotalPositiveFeedbackCount']} | Negative Feedback: {row['TotalNegativeFeedbackCount']} | Recommended: {row['IsRecommended']} | Product: {row['ProductDescription']}", axis=1)
df['outputs'] = df.apply(lambda row: f"Review: {row['reviewText']} | Rating: {row['rating']}", axis=1)

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['inputs'], df['outputs'], test_size=0.2, random_state=42)
print("X_Train: ", X_train.shape)
print("X_Test: ", X_test.shape)
print("Y_Train: ", y_train.shape)
print("Y_Test: ", y_test.shape)

print(X_train)
print(y_train[0])

shape:  (42140, 10)
X_Train:  (27940,)
X_Test:  (6986,)
Y_Train:  (27940,)
Y_Test:  (6986,)
4947                                                Skin Type: combination | Skin Tone: light | Verified Purchaser: False | Positive Feedback: 0 | Negative Feedback: 0 | Recommended: True | Product: What it is: A hydrating foundation that delivers buildable coverage for the face and body, resists heat and humidity, and leaves a luminous makeup look.Coverage: MediumFinish: NaturalFormulation: LiquidHighlighted Ingredients: - Squalane- Hyaluronic AcidIngredient Callouts: Free of parabens, formaldehydes, formaldehyde-releasing agents, phthalates, mineral oil, retinyl palmitate, oxybenzone, coal tar, hydroquinone, sulfates SLS & SLES, triclocarban, triclosan, and contains less than one percent synthetic fragrance. What Else You Need to Know: This foundation is formulated with 94 percent natural-origin ingredients and hyaluronic acid. It delivers intense hydration and a second-skin sensation, while p

In [5]:
import re
import json
from sklearn.model_selection import train_test_split

import pandas as pd
import csv

# load dataset
df = pd.read_csv('entire_sephora_data.csv')
print("shape: ", df.shape)

# filter out incentivized reviews
df = df[df['incentivizedReview'] == False]

# combine inputs into a single str and prep outputs
df['inputs'] = df.apply(lambda row: f"Skin Type: {row['skinType']} | Skin Tone: {row['skinTone']} | Verified Purchaser: {row['verifiedPurchaser']} | Positive Feedback: {row['TotalPositiveFeedbackCount']} | Negative Feedback: {row['TotalNegativeFeedbackCount']} | Recommended: {row['IsRecommended']} | Product: {row['ProductDescription']}", axis=1)
df['outputs'] = df.apply(lambda row: f"Review: {row['reviewText']} | Rating: {row['rating']}", axis=1)


X_train, X_test, y_train, y_test = train_test_split(df['inputs'], df['outputs'], test_size=0.2, random_state=42)

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

def build_text_files(data_df, dest_path):
    with open(dest_path, 'w', encoding='utf-8') as f:
        for i, row in data_df.iterrows():
            # Here you explicitly format the string to include only the data you want
            input_text = row['inputs']  # Assuming 'inputs' is the column with combined input text
            output_text = row['outputs']  # Assuming 'outputs' is the column with the target review text
            # Combine input and output text with a special token or separator if needed
            summary = f"{input_text} <|endoftext|> {output_text}".strip()
            summary = re.sub(r"\s+", " ", summary)  # Simplify whitespace to a single space
            f.write(summary + "\n")  # Write each example on a new line

# train, test = train_test_split(dataset,test_size=0.15)

build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

print("output",y_test)


shape:  (42140, 10)
Train dataset length: 27940
Test dataset length: 6986
output 4079                                                                                                                                                                                                                                                                                                         Review: New favorite foundation. Easy to blend | Rating: 5
22748                                                                                                                          Review: I got it because it was being hyped up as the best concealer for dry skin and I was so disappointed trying it out because it separated horrendously and creased regardless of if I set it or not. Just not worth it. | Rating: 1
40202                                                                                                                                                          Review: I liked the coverage but it made

In [6]:

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 10000)
print(X_test.head(3))

4079                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Skin Type: combination | Skin Tone: medium | Verified Purchaser: False | Positive Feedback: 0 | Negative Feedback: 0 | Recommended: True | Product: What it is: A hydrating foundation that delivers buildable coverage for the face and body, resists heat and humidity, and leaves a luminous makeup look.Coverage: MediumFinish: NaturalFormulation: LiquidHighlighted Ingredients: - Squalane- Hyalu

In [7]:
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'


In [8]:
from transformers import TextDataset,DataCollatorForLanguageModeling
# was 128 before 64
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=64)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=64)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

try:
    train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)
except Exception as e:
    print("Error loading dataset:", e)




In [10]:


model = AutoModelWithLMHead.from_pretrained("openai-community/gpt2")
model.to(device)
training_args = TrainingArguments(
    output_dir="./gpt2-sephora", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=1, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
trainer.train()


Step,Training Loss
500,1.8928
1000,0.9071


KeyboardInterrupt: 

In [None]:
trainer.save_model()


In [None]:
from transformers import pipeline

review = pipeline('text-generation',model='./gpt2-sephora', tokenizer=tokenizer)

output = review(
    "Skin Type: combination | Skin Tone: medium | Verified Purchaser: False | Positive Feedback: 0 | Negative Feedback: 0 | Recommended: True | Product: What it is: A hydrating foundation that delivers buildable coverage for the face and body, resists heat and humidity, and leaves a luminous makeup look.Coverage: MediumFinish: NaturalFormulation: LiquidHighlighted Ingredients: - Squalane- Hyaluronic AcidIngredient Callouts: Free of parabens, formaldehydes, formaldehyde-releasing agents, phthalates, mineral oil, retinyl palmitate, oxybenzone, coal tar, hydroquinone, sulfates SLS & SLES, triclocarban, triclosan, and contains less than one percent synthetic fragrance. What Else You Need to Know: This foundation is formulated with 94 percent natural-origin ingredients and hyaluronic acid. It delivers intense hydration and a second-skin sensation, while providing buildable coverage that evens the skin and blurs imperfections for a wide range of effects, from a flawless no-makeup look to a high-perfection complexion. <|endoftext|>"
    , truncation=True,
    max_length=500)


# Save entire result
entire_result = output

review_text = entire_result[0]['generated_text'].split("Review:", 1)[1] if "Review:" in entire_result[0]['generated_text'] else "Review not found"

print(entire_result)



# sample outputs from repeatedly running:

1.  I have mixed feelings about how the formula works out and it feels heavier on the skin. That being said, I have to say it lasts so long and feels good on the skin. And is very buildable. I feel like it could work for me when I am on a hot weather day (e.g. hot summer) and sometimes it looks cakey on my face. I have very oily/combination/dry skin, probably at least with a bit more moisturizer. I wore this to the gym and nothing else. I would say it lasts a LONG time without oxidizing and without looking patchy at all so I would say it is pretty good for a combination skin day in the winter. Pros: a very fair skin tone/foundation and skin prep | Rating: 5

2. This foundation is truly incredible! I’m in love!!!! I can’t get enough ❤️ but so far this foundation keeps me matte when I’m not wearing primer. The first thing to note is that it feels very hydrated. I love it. It’s lightweight and lightweight! | Rating: 5




In [None]:
print(entire_result)


In [None]:
# Skin Type: dry | Skin Tone: mediumTan | Verified Purchaser: True | Positive Feedback: 1 | Negative Feedback: 0 | Recommended: True | Product: What it is: A hydrating, full-coverage, weightless, four-in-one, concealer that conceals, contours, highlights, and retouches in a wide range of flawless shades.Coverage: Full Finish: Natural Formulation: Liquid Highlighted Ingredients: - Coconut Water: Aids in delicately replenishing skin’s moisture levels. - Alpine Rose: Helps support skin’s health and resilience. - Hyaluronic Acid: Gives a smoother, more youthful appearance. Ingredient Callouts: Free of parabens. It is also vegan and cruelty-free.What Else You Need to Know: A little goes a long way with this long-wearing, buildable, multitasking concealer masks imperfections, smooths, and sculpts skin for natural-looking coverage. This formula is non-comedogenic and offers a crease-free, weightless wear that’s so naturally flawless they’ll think you were born this way.

review = pipeline('text-generation',model='./gpt2-sephora', tokenizer=tokenizer)

output = review(
    "Skin Type: dry | Skin Tone: mediumTan | Verified Purchaser: True | Positive Feedback: 1 | Negative Feedback: 0 | Recommended: True | Product: What it is: A hydrating, full-coverage, weightless, four-in-one, concealer that conceals, contours, highlights, and retouches in a wide range of flawless shades.Coverage: Full Finish: Natural Formulation: Liquid Highlighted Ingredients: - Coconut Water: Aids in delicately replenishing skin’s moisture levels. - Alpine Rose: Helps support skin’s health and resilience. - Hyaluronic Acid: Gives a smoother, more youthful appearance. Ingredient Callouts: Free of parabens. It is also vegan and cruelty-free.What Else You Need to Know: A little goes a long way with this long-wearing, buildable, multitasking concealer masks imperfections, smooths, and sculpts skin for natural-looking coverage. This formula is non-comedogenic and offers a crease-free, weightless wear that’s so naturally flawless they’ll think you were born this way <|endoftext|> "
    , truncation=True,
    max_length=500)


# Save entire result
entire_result2 = output

print(entire_result2)


### currently buggy attempt to generate and eval below. ive manually evaluated one singular example to have the following results:

BERTScore: Precision: 0.8690944314002991, Recall: 0.8788704872131348, F1: 0.8739551305770874
ROUGE scores: {'rouge-1': {'r': 0.3333333333333333, 'p': 0.21428571428571427, 'f': 0.26086956045368626}, 'rouge-2': {'r': 0.10714285714285714, 'p': 0.06521739130434782, 'f': 0.08108107637691774}, 'rouge-l': {'r': 0.2962962962962963, 'p': 0.19047619047619047, 'f': 0.23188405320730948}}
BLEU score: 1.5763766673080681e-78


BERTScore is by far the most promising, future steps are to investiage


In [None]:
# ONLY 5 EXAMPLES

!pip install bert-score

!pip install nltk bert-score rouge-score pandas transformers
!pip install rouge

from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import pipeline
from nltk.translate.bleu_score import corpus_bleu
from bert_score import score as bert_score
from rouge import Rouge
from tqdm import tqdm  # For progress bar

# Assuming the DataFrame df and a suitable tokenizer are already defined
X_train, X_test, y_train, y_test = train_test_split(df['inputs'], df['outputs'], test_size=0.2, random_state=42)

# Select a subset of 5 examples from the test set
X_test_subset = X_test.head(5)
y_test_subset = y_test.head(5)

# Setup the review generation pipeline
review_generator = pipeline('text-generation', model='./gpt2-sephora', tokenizer=tokenizer, truncation=True, max_length=500)

# Generate and trim predictions for the subset of X_test
trimmed_predictions = []
for text in tqdm(X_test_subset, desc="Generating and trimming predictions"):
    output = review_generator(text, max_length=500)  # Adjust max_length as needed
    generated_text = output[0]['generated_text']
    # review_start = generated_text.find("Review:") + len("Review:")
    # review_end = generated_text.find("\n", review_start)
    # review_text = generated_text[review_start:review_end].strip() if review_end != -1 else generated_text[review_start:].strip()

    trimmed_predictions.append(generated_text)
    print("review text: ", generated_text)

# Prepare references for evaluation
references_bleu = [[text.split()] for text in y_test_subset]
candidates_bleu = [text.split() for text in trimmed_predictions]

references_rouge = y_test_subset.tolist()
candidates_rouge = trimmed_predictions

# Compute BLEU score
bleu_score = corpus_bleu(references_bleu, candidates_bleu)
print(f"BLEU score: {bleu_score}")

# Compute BERTScore
P, R, F1 = bert_score(candidates_rouge, references_rouge, lang="en", verbose=True)
print(f"BERTScore: Precision: {P.mean()}, Recall: {R.mean()}, F1: {F1.mean()}")

# Compute ROUGE score
rouge = Rouge()
scores = rouge.get_scores(candidates_rouge, references_rouge, avg=True)
print(f"ROUGE scores: {scores}")


In [None]:
# Assuming the DataFrame df and a suitable tokenizer are already defined
X_train, X_test, y_train, y_test = train_test_split(df['inputs'], df['outputs'], test_size=0.2, random_state=42)

# Select a subset of 5 examples from the test set
X_test_subset = X_test.head(5)
y_test_subset = y_test.head(5)

# Setup the review generation pipeline
review_generator = pipeline('text-generation', model='./gpt2-sephora', tokenizer=tokenizer, truncation=True, max_length=500)

# Generate and trim predictions for the subset of X_test
trimmed_predictions = []
for text in tqdm(X_test_subset, desc="Generating and trimming predictions"):
    output = review_generator(text, max_length=500)  # Adjust max_length as needed
    generated_text = output[0]['generated_text']
    # review_start = generated_text.find("Review:") + len("Review:")
    # review_end = generated_text.find("\n", review_start)
    # review_text = generated_text[review_start:review_end].strip() if review_end != -1 else generated_text[review_start:].strip()

    print("entire text: ", output)

In [None]:
for text in trimmed_predictions:
  print(text)

for x_test in X_test_subset:
  print("x_Test", x_test)

for y_test in y_test_subset:
  print("y_Test", y_test)

In [None]:
review = pipeline('text-generation',model='./gpt2-sephora', tokenizer=tokenizer)

output = review(
    X_test_subset[0]+" <|endoftext|>"
    , truncation=True,
    max_length=500)


# Save entire result
entire_result = output



In [None]:
!zip -r ./gpt2-sephora.zip ./gpt2-sephora/ # attempt to downlaod weights


# BELOW WILL TIME OUT DUE TO MAXXING RAM USAGE :)

In [None]:
!pip install bert-score

!pip install nltk bert-score rouge-score pandas transformers
!pip install rouge
from nltk.translate.bleu_score import corpus_bleu
from bert_score import score as bert_score
from rouge import Rouge
from tqdm import tqdm  # for progress bars


review_generator = pipeline('text-generation', model='./gpt2-sephora', tokenizer=tokenizer, truncation=True, max_length=500)

trimmed_predictions = []
for text in tqdm(X_test, desc="Generating and trimming predictions"):
    output = review_generator(text, max_length=500)
    generated_text = output[0]['generated_text']
    review_start = generated_text.find("<|endoftext|>") + len("<|endoftext|>")
    review_end = generated_text.find("\n", review_start)
    review_text = generated_text[review_start:review_end].strip() if review_end != -1 else generated_text[review_start:].strip()
    trimmed_predictions.append(review_text)

# format for bleu and rouge may need updates
references_bleu = [[text.split()] for text in y_test]
candidates_bleu = [text.split() for text in trimmed_predictions]

references_rouge = y_test.tolist()
candidates_rouge = trimmed_predictions

# compute BLEU score
bleu_score = corpus_bleu(references_bleu, candidates_bleu)
print(f"BLEU score: {bleu_score}")

# compute BERTScore
P, R, F1 = bert_score(candidates_rouge, references_rouge, lang="en", verbose=True)
print(f"BERTScore: Precision: {P.mean()}, Recall: {R.mean()}, F1: {F1.mean()}")

# compute ROUGE score
rouge = Rouge()
scores = rouge.get_scores(candidates_rouge, references_rouge, avg=True)
print(f"ROUGE scores: {scores}")