In [1]:
!pip install torch
!pip install transformers
#   this is the huggingface transformer module
#   https://github.com/huggingface/transformers/tree/main
#   https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/tokenization_t5.py
!pip install pytorch_lightning
!pip install sentencepiece datasets seqeval
#   Fixes error: T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
#   installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
#   that match your environment. Please note that you may need to restart your runtime after installation.
!pip install tensorboardX
#   Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, 
#   due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use 
#   `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. 
#   Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default



In [2]:
import pandas as pd

In [3]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

#import nltk
#nltk.download('punkt')
#from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)

In [4]:
# Never truncate long cell values when a DataFrame is displayed
# (None removes the column-width limit).
pd.options.display.max_colwidth = None

# Load Datasets

In [5]:
# moved to CustomDataset.py to fix the error:
# --> self = reduction.pickle.load(from_parent)
#     AttributeError: Can't get attribute 'CustomDataset' on <module '__main__' (built-in)>
# see also https://github.com/Lightning-AI/pytorch-lightning/discussions/15350

from CustomDataset import CustomDataset

In [57]:
from enum import Enum

class DatasetOptions(Enum):
    """Datasets this notebook can work with.

    Every member's value is a plain string. Do not put a trailing comma after
    a value: ``NAME = 'x',`` silently turns the value into the tuple ``('x',)``,
    which breaks lookups such as ``DatasetOptions('x')``.
    """

    WIKIDATA = 'wikidata'  # Text2KG Benchmark training dataset 1
    DBPEDIA = 'dbpedia'  # Text2KG Benchmark training dataset 2
    AMAZON = 'amazon'  # unlabeled

# Select the dataset used for training and echo the choice.
TRAINING_DATASET = DatasetOptions["DBPEDIA"]
print(TRAINING_DATASET)

DatasetOptions.DBPEDIA


In [7]:
# Loads into one dataframe all the .jsonl files in file_list located under the given folder.
def load_dataframe_from_jsonl(folder, file_list):
    """Read several JSON Lines files and stack them into a single DataFrame.

    Parameters
    ----------
    folder : str
        Path prefix that is prepended verbatim to every file name, so it must
        already end with a path separator (e.g. ``'data/train/'``).
    file_list : sequence of str
        File names to read, in the order their rows should appear.

    Returns
    -------
    pandas.DataFrame
        Rows of all files, in list order, with one continuous 0..n-1 index.
        An empty DataFrame is returned when ``file_list`` is empty.

    Raises
    ------
    FileNotFoundError
        If any of the listed files does not exist.
    """
    if not file_list:
        return pd.DataFrame()

    frames = []
    for filename in file_list:
        # Open each file explicitly so a missing file fails with a clear
        # FileNotFoundError instead of pandas trying to parse the path text.
        with open(folder + filename, "r", encoding="utf-8") as handle:
            frames.append(pd.read_json(handle, lines=True))

    # A single concat avoids re-copying the accumulated rows for every file.
    return pd.concat(frames, ignore_index=True)

In [8]:
# Example:
#     tokenizer = AutoTokenizer.from_pretrained("t5-small")
#     input_dataset = tokenize_dataset(tokenizer=tokenizer, dataset=dataset, type_path="train")
#
# type_path is forwarded to CustomDataset: "train", "test" or "val".
def tokenize_dataset(tokenizer, dataset, type_path):
    """Wrap ``dataset`` in a CustomDataset and return it.

    For ``type_path == "train"`` every item is fetched once and the first
    example's ``source_ids`` and ``target_ids`` are decoded (special tokens
    kept) and printed as a sanity check. Other splits are returned untouched.
    """
    wrapped = CustomDataset(tokenizer=tokenizer, dataset=dataset, type_path=type_path)
    if type_path != "train":
        return wrapped

    # NOTE(review): the original author was unsure why every item is fetched
    # here. Presumably it forces tokenization/padding up front so failures
    # surface early; confirm against CustomDataset.__getitem__.
    for position in range(len(wrapped)):
        _ = wrapped[position]

    first_example = wrapped[0]
    for field in ("source_ids", "target_ids"):
        print(tokenizer.decode(first_example[field], skip_special_tokens=False))
    return wrapped

## Text2KG Benchmark

### DBPedia-webnlg

In [9]:
# All paths are relative: the Text2KGBench-main folder is expected to sit
# next to this .ipynb file.
dbpedia_folder = "Text2KGBench-main/data/dbpedia_webnlg/"  # baselines/prompts/
dbpedia_subfolder_train = "train/"
dbpedia_subfolder_test = "ground_truth/"

# File stems follow the pattern "ont_<number>_<topic>", numbered from 1 in
# the order the topics are listed below.
dbpedia_filenames = [
    f"ont_{number}_{topic}"
    for number, topic in enumerate(
        (
            "university",
            "musicalwork",
            "airport",
            "building",
            "athlete",
            "politician",
            "company",
            "celestialbody",
            "astronaut",
            "comicscharacter",
            "meanoftransportation",
            "monument",
            "food",
            "writtenwork",
            "sportsteam",
            "city",
            "artist",
            "scientist",
            "film",
        ),
        start=1,
    )
]

# Suffixes appended to a file stem to obtain the actual file name.
dbpedia_ender_prompts = "_prompts.json"
dbpedia_ender_train = "_train.jsonl"
dbpedia_ender_test = "_ground_truth.jsonl"

In [10]:
# Stack the per-ontology DBpedia files into one training frame and one
# ground-truth (test) frame.
df_dbpedia_train = load_dataframe_from_jsonl(
    f"{dbpedia_folder}{dbpedia_subfolder_train}",
    [f"{stem}{dbpedia_ender_train}" for stem in dbpedia_filenames],
)
df_dbpedia_test = load_dataframe_from_jsonl(
    f"{dbpedia_folder}{dbpedia_subfolder_test}",
    [f"{stem}{dbpedia_ender_test}" for stem in dbpedia_filenames],
)

In [11]:
# Preview the first rows of the DBpedia training frame (columns: id, sent, triples).
df_dbpedia_train.head()

Unnamed: 0,id,sent,triples
0,ont_1_university_train_1,"1 Decembrie 1918 University is located in Alba, Romania and its Latin name is ""Universitas Apulensis"".","[{'sub': '1_Decembrie_1918_University', 'rel': 'latinName', 'obj': '""Universitas Apulensis""'}, {'sub': '1_Decembrie_1918_University', 'rel': 'country', 'obj': 'Romania'}, {'sub': '1_Decembrie_1918_University', 'rel': 'state', 'obj': 'Alba'}]"
1,ont_1_university_train_2,"The 1 Decembrie 1918 University is located in Alba Iulia, Alba. Its nickname is Uab.","[{'sub': '1_Decembrie_1918_University', 'rel': 'nickname', 'obj': 'Uab'}, {'sub': '1_Decembrie_1918_University', 'rel': 'city', 'obj': 'Alba_Iulia'}, {'sub': '1_Decembrie_1918_University', 'rel': 'state', 'obj': 'Alba'}]"
2,ont_1_university_train_3,"The nickname of the 1 Decembrie 1918 University is Uab. The latin name is ""Universitas Apulensis"" and the rector is Breaz Valer Daniel.","[{'sub': '1_Decembrie_1918_University', 'rel': 'nickname', 'obj': 'Uab'}, {'sub': '1_Decembrie_1918_University', 'rel': 'rector', 'obj': '""Breaz Valer Daniel""'}, {'sub': '1_Decembrie_1918_University', 'rel': 'latinName', 'obj': '""Universitas Apulensis""'}]"
3,ont_1_university_train_4,"1 Decembrie 1918 University is located in Alba Iulia, Romania and its rector is Breaz Valer Daniel.","[{'sub': '1_Decembrie_1918_University', 'rel': 'rector', 'obj': '""Breaz Valer Daniel""'}, {'sub': '1_Decembrie_1918_University', 'rel': 'city', 'obj': 'Alba_Iulia'}, {'sub': '1_Decembrie_1918_University', 'rel': 'country', 'obj': 'Romania'}]"
4,ont_1_university_train_5,The Accademia di Architettura di Mendrisio is in Mendrisio. It has 600 students and an academic staff of 100.,"[{'sub': 'Accademia_di_Architettura_di_Mendrisio', 'rel': 'city', 'obj': 'Mendrisio'}, {'sub': 'Accademia_di_Architettura_di_Mendrisio', 'rel': 'numberOfStudents', 'obj': '600'}, {'sub': 'Accademia_di_Architettura_di_Mendrisio', 'rel': 'academicStaffSize', 'obj': '100'}]"


### Wikidata Tekgen

In [12]:
# Location of the Wikidata-TekGen data, relative to this notebook.
wikidata_folder = "Text2KGBench-main/data/wikidata_tekgen/"
wikidata_subfolder_prompts = "baselines/prompts/"
wikidata_subfolder_train = "train/"
wikidata_subfolder_test = "ground_truth/"

# File stems follow the pattern "ont_<number>_<topic>", numbered from 1 in
# the order the topics are listed below.
wikidata_filenames = [
    f"ont_{number}_{topic}"
    for number, topic in enumerate(
        (
            "movie",
            "music",
            "sport",
            "book",
            "military",
            "computer",
            "space",
            "politics",
            "nature",
            "culture",
        ),
        start=1,
    )
]

# Suffixes appended to a file stem to obtain the actual file name.
wikidata_ender_prompts = "_prompts.json"
wikidata_ender_train = "_train.jsonl"
wikidata_ender_test = "_ground_truth.jsonl"

In [13]:
# Stack the per-ontology Wikidata files into one training frame and one
# ground-truth (test) frame.
df_wikidata_train = load_dataframe_from_jsonl(
    f"{wikidata_folder}{wikidata_subfolder_train}",
    [f"{stem}{wikidata_ender_train}" for stem in wikidata_filenames],
)

df_wikidata_test = load_dataframe_from_jsonl(
    f"{wikidata_folder}{wikidata_subfolder_test}",
    [f"{stem}{wikidata_ender_test}" for stem in wikidata_filenames],
)

In [14]:
# Preview the Wikidata training frame. Note: its rows hold one triple each
# (sub_label / rel_label / obj_label columns) rather than a `triples` list.
df_wikidata_train.head()

Unnamed: 0,id,sub_label,rel_label,obj_label,sent,sub,rel,obj
0,ont_1_movie_train_1,Urusei Yatsura 2: Beautiful Dreamer,director,Mamoru Oshii,"Urusei Yatsura 2: Beautiful Dreamer (Japanese: , Hepburn: Urusei Yatsura 2 ByÅ«tifuru DorÄ«mÄ) is a 1984 Japanese anime fantasy comedy film, directed by Mamoru Oshii.",Q1582185,P57,Q285084
1,ont_1_movie_train_2,She and Her Cat,director,Makoto Shinkai,"She and Her Cat (Japanese: , Hepburn: Kanojo to Kanojo no Neko), subtitled Their standing points, is a 1999 Japanese original video animation created and directed by Makoto Shinkai.",Q584204,P57,Q335080
2,ont_1_movie_train_3,Minimum Viable Product,director,Mike Judge,"The episode was written by series creators John Altschuler, Dave Krinsky and Mike Judge and directed by Judge.",Q16746501,P57,Q434585
3,ont_1_movie_train_4,Evangelion: 3.0 You Can (Not) Redo,director,Hideaki Anno,"(Q, Evangerion Shin GekijÅban: KyÅ«, ""Evangelion: The New Movie: Q"", where the ""Q"" stands for ""Quickening"") is a 2012 Japanese animated science fiction film written and chief directed by Hideaki Anno and the third of four films released in the Rebuild of Evangelion tetralogy, based on the original anime series Neon Genesis Evangelion.",Q182206,P57,Q23261
4,ont_1_movie_train_5,Evangelion: 2.0 You Can (Not) Advance,director,Hideaki Anno,Evangelion : 2.0 You Can ( Not ) Advance was produced and co-distributed by Hideaki Anno's Studio Khara in partnership with Gainax.,Q614200,P57,Q23261


In [15]:
# Preview the Wikidata ground-truth frame (columns: id, sent, triples).
df_wikidata_test.head()

Unnamed: 0,id,sent,triples
0,ont_1_movie_test_1,"Bleach: Hell Verse (Japanese: BLEACH , Hepburn: BurÄ«chi Jigoku-Hen) is a 2010 Japanese animated film directed by Noriyuki Abe.","[{'sub': 'Bleach : Hell Verse', 'rel': 'director', 'obj': 'Noriyuki Abe'}, {'sub': 'Bleach : Hell Verse', 'rel': 'publication date', 'obj': '01 January 2010'}]"
1,ont_1_movie_test_2,Keyboard Cat's original form was a video originally made in 1984 by Charlie Schmidt of his cat Fatso seemingly playing a piano (though manipulated by Schmidt off-camera) to a cheery tune.,"[{'sub': 'Keyboard Cat', 'rel': 'cast member', 'obj': 'Fatso the Cat'}, {'sub': 'Keyboard Cat', 'rel': 'director', 'obj': 'Charlie Schmidt'}]"
2,ont_1_movie_test_3,The series was directed by Mitsuko Kase (episodes 1-7) and Takashi Imanishi (episodes 8-13).,"[{'sub': 'Mobile Suit Gundam 0083 : Stardust Memory', 'rel': 'director', 'obj': 'Takashi Imanishi'}]"
3,ont_1_movie_test_4,"Spirited Away (Japanese: , Hepburn: Sen to Chihiro no Kamikakushi, ""Sen and Chihiros Spiriting Away"") is a 2001 Japanese animated fantasy film written and directed by Hayao Miyazaki, animated by Studio Ghibli for Tokuma Shoten, Nippon Television Network, Dentsu, Buena Vista Home Entertainment, Tohokushinsha Film and Mitsubishi, and distributed by Toho.","[{'sub': 'Spirited Away', 'rel': 'genre', 'obj': 'Fantasy film'}, {'sub': 'Spirited Away', 'rel': 'director', 'obj': 'Hayao Miyazaki'}, {'sub': 'Spirited Away', 'rel': 'publication date', 'obj': '20 July 2001'}, {'sub': 'Spirited Away', 'rel': 'production company', 'obj': 'Studio Ghibli'}, {'sub': 'Spirited Away', 'rel': 'screenwriter', 'obj': 'Hayao Miyazaki'}]"
4,ont_1_movie_test_5,Looney Tunes: Back in Action is a 2003 American live-action/animated comedy film directed by Joe Dante and written by Larry Doyle.,"[{'sub': 'Looney Tunes : Back in Action', 'rel': 'director', 'obj': 'Joe Dante'}, {'sub': 'Looney Tunes : Back in Action', 'rel': 'publication date', 'obj': '01 January 2003'}, {'sub': 'Looney Tunes : Back in Action', 'rel': 'publication date', 'obj': '04 December 2003'}]"


## Amazon Product Dataset

In [16]:
# associated word embeddings are fine-tuned or trained

# record different prompts tested
# missing data: category (is it different from the main_cat of "Luxury Beauty"?)

# dirty data like
# "\n    Product Dimensions: \n    ": "2.2 x 2.2 x 7 inches ; 8.8 ounces",
# "Shipping Weight:": "14.4 ounces (",

# remove the trailing ' (' left at the end of strings such as '14.4 ounces ('
# handle HTML-escaped characters like &#039; but also non-escaped characters like &
# increase response length

In [17]:
# The Amazon sample file lives in the current working directory.
amzn_folder = "./"
amzn_filenames = ["sample_data_only_beauty_category.json"]

In [18]:
# Read the Amazon sample file(s) into one frame.
amzn_data = load_dataframe_from_jsonl(amzn_folder, list(amzn_filenames))

# Shape amzn_data into the format expected by our CustomDataset, which (per the
# original note) reads its text from a `sent` column. A plain
# rename(columns={"description": "sent"}) is not enough because each
# `description` is a list of strings, so the fragments are joined with single
# spaces into one string instead.
amzn_data["sent"] = amzn_data["description"].apply(
    lambda fragments: " ".join(str(fragment) for fragment in fragments)
)
amzn_data.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,sent
0,[],,"[After a long day of handling thorny situations, our new hand therapy pump is just the help you need. It contains shea butter as well as extracts of yarrow, clover and calendula to help soothe and condition work-roughened hands., By Crabtree & Evelyn, The aromatic benefits of herbs are varied and far-reaching, so we combined a whole bunch of them into one restoratively fragrant line-up straight from the garden., We&#039;ve formulated our Gardeners Hand Therapy with Myrrh Extract to help condition nails and cuticles as well as skin super hydrators macadamia seed oil and shea butter to help replenish lost moisture. Rich in herbal extracts like cooling cucumber and rosemary leaf a favourite for antioxidants to help protect hands against daily urban and environmental stresses while the hydrating power of Vitamin E, Hyaluronic Acid and Ceramides contribute to improve the skins natural moisture barrier with this garden-inspired treatment. Skin is left silky-soft and delicately scented., How to use:, Dab a pea-sized amount to palms and work over skin and nails. Combine with Gardeners Hand Wash and Hand Scrub to get silky skin in three herb-infused steps., Originally created to appeal to a horticulturists wealth of knowledge about the healing power of herbs, this botanical range is formulated with cleansing cucumber extract, purifying rosemary extract, oak moss and refreshing sage extract., We search the world for natural ingredients and fragrance journeys that enable our customers to live a life cultivated. Inspired by the Crabapple Tree, the original species from which all cultivated apple trees have derived, and John Evelyn, the 17th century renaissance Englishman whose motto Explore Everything. 
Keep The Best has provided inspiration from our founding to this day.]",,Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ,"[B00GHX7H0A, B00FRERO7G, B00R68QXCS, B000Z65AZE, B07GFHJRMX, B074KGBGL7, B00R68QXJG, B00025WYZC, B07H3W9BM5, B00KOBT82G, B072N2M1P6, B071G8FG2N, B00FASVFI8, B00GHXE4N8, B00EPG2QJI, B01MQ4MEFE, B01M8ML0SY, B074KHCPLH, B004XQWY4W, B00FASV6UU, B01M31HJBJ, B00KC8TU7O, B00B9TU5T2, B00K75EZ04, B000Q2Y0FI, B00FEGOCCM, B00EPFXFBW, B00H6SQY3Q, B00HZAOWUC, B07GFJF1DN, B001WBS68E, B074KJZCPH]",,,[],"4,324 in Beauty & Personal Care (","[B00FRERO7G, B00GHX7H0A, B07GFHJRMX, B00TJ3NBN2, B00KOBT82G, B00R68QXCS, B074KGBGL7, B075MH4Q9L, B07H3W9BM5, B07GFJF1DN, B00KC8TPVA, B07DB7KXFV, B07DCCRGZT, B00GHX58LK, B077GXQ2TH, B00GHX52MK, B01MQ4MEFE, B00GHXE4N8, B07FYFXBK8, B00FEGOCCM, B00FASVFI8, B074KFH9JN, B071G8FG2N, B074KGN1BT, B00GHX5HZC, B00B9TU5T2, B074KM26WX, B074KGQ65V, B01M8ML0SY, B076YKGPY5, B00EPG2QJI, B074KHCPLH, B075YMZVGF, B00K1C8V1W, B074KDPT26, B07CCNVW87, B074KGQ5LF, B00GHX8I6M, B07JMLGRKY, B07C92VLKM, B00KC8TU7O, B00025WYZC, B074KJZCPH, B074KHCPMV, B00GHXHPEI, B07K2WRDBS, B00FASV6UU, B001WBS68E, B074KMD9QM, B076YN8DDY, B074KHDYRX, B00GHXIBGE]","{'  Product Dimensions: ': '2.2 x 2.2 x 7 inches ; 8.8 ounces', 'Shipping Weight:': '14.4 ounces (', 'Domestic Shipping: ': 'Item can be shipped within U.S.', 'International Shipping: ': 'This item can be shipped to select countries outside of the U.S.', 'ASIN:': 'B00004U9V2', 'Item model number:': '4113'}",Luxury Beauty,,NaT,$30.00,B00004U9V2,"[https://images-na.ssl-images-amazon.com/images/I/41ClX6BRvZL._SX50_SY65_CR,0,0,50,65_.jpg, https://images-na.ssl-images-amazon.com/images/I/510giIO5cFL._SX50_SY65_CR,0,0,50,65_.jpg, https://images-na.ssl-images-amazon.com/images/I/414gBlQ6F9L._SX50_SY65_CR,0,0,50,65_.jpg, https://images-na.ssl-images-amazon.com/images/I/51jNGOh1f9L._SX50_SY65_CR,0,0,50,65_.jpg, 
https://images-na.ssl-images-amazon.com/images/I/31f8YZgUBhL._SX50_SY65_CR,0,0,50,65_.jpg]","[https://images-na.ssl-images-amazon.com/images/I/41ClX6BRvZL.jpg, https://images-na.ssl-images-amazon.com/images/I/510giIO5cFL.jpg, https://images-na.ssl-images-amazon.com/images/I/414gBlQ6F9L.jpg, https://images-na.ssl-images-amazon.com/images/I/51jNGOh1f9L.jpg, https://images-na.ssl-images-amazon.com/images/I/31f8YZgUBhL.jpg]","After a long day of handling thorny situations, our new hand therapy pump is just the help you need. It contains shea butter as well as extracts of yarrow, clover and calendula to help soothe and condition work-roughened hands. By Crabtree & Evelyn The aromatic benefits of herbs are varied and far-reaching, so we combined a whole bunch of them into one restoratively fragrant line-up straight from the garden. We&#039;ve formulated our Gardeners Hand Therapy with Myrrh Extract to help condition nails and cuticles as well as skin super hydrators macadamia seed oil and shea butter to help replenish lost moisture. Rich in herbal extracts like cooling cucumber and rosemary leaf a favourite for antioxidants to help protect hands against daily urban and environmental stresses while the hydrating power of Vitamin E, Hyaluronic Acid and Ceramides contribute to improve the skins natural moisture barrier with this garden-inspired treatment. Skin is left silky-soft and delicately scented. How to use: Dab a pea-sized amount to palms and work over skin and nails. Combine with Gardeners Hand Wash and Hand Scrub to get silky skin in three herb-infused steps. Originally created to appeal to a horticulturists wealth of knowledge about the healing power of herbs, this botanical range is formulated with cleansing cucumber extract, purifying rosemary extract, oak moss and refreshing sage extract. We search the world for natural ingredients and fragrance journeys that enable our customers to live a life cultivated. 
Inspired by the Crabapple Tree, the original species from which all cultivated apple trees have derived, and John Evelyn, the 17th century renaissance Englishman whose motto Explore Everything. Keep The Best has provided inspiration from our founding to this day."
1,[],,"[If you haven't experienced the pleasures of bathing in the Dead Sea, Bath Crystals are the next best thing. Rich in health-inducing minerals including magnesium, calcium, sodium, potassium and more, they soothe your body with relaxation, easing muscle tension and softening your skin. Immerse yourself in the waters of well-being.]",,AHAVA Bath Salts,[],,,[],"1,633,549 in Beauty & Personal Care (",[],"{'  Product Dimensions: ': '3 x 3.5 x 6 inches ; 2.2 pounds', 'Shipping Weight:': '2.6 pounds', 'Domestic Shipping: ': 'Item can be shipped within U.S.', 'International Shipping: ': 'This item is not eligible for international shipping.', 'ASIN:': 'B0000531EN', 'Item model number:': '017N'}",Luxury Beauty,,NaT,,B0000531EN,[],[],"If you haven't experienced the pleasures of bathing in the Dead Sea, Bath Crystals are the next best thing. Rich in health-inducing minerals including magnesium, calcium, sodium, potassium and more, they soothe your body with relaxation, easing muscle tension and softening your skin. Immerse yourself in the waters of well-being."
2,[],,"[Rich, black mineral mud, harvested from the banks of the Dead Sea, is comprised of layer upon layer of sedimentary clay formed over thousands of years. Captured within is an extremely high concentration of minerals, scientifically proven to be essential in maintaining healthy skin. Ahava Black Mineral Mud works deep to clean, purify and restore the skin's natural moisture balance, leaving it smooth, radiant and revitalized., , ]",,"AHAVA Dead Sea Mineral Mud, 8.5 oz, Pack of 4",[],,,[],"1,806,710 in Beauty &amp; Personal Care (",[],"{'  Product Dimensions: ': '5.1 x 3 x 5.5 inches ; 2.48 pounds', 'Shipping Weight:': '2.6 pounds', 'Domestic Shipping: ': 'Item can be shipped within U.S.', 'International Shipping: ': 'This item is not eligible for international shipping.', 'ASIN:': 'B0000532JH', 'Item model number:': '018N'}",Luxury Beauty,,NaT,,B0000532JH,"[https://images-na.ssl-images-amazon.com/images/I/41O1luEZuHL._SX50_SY65_CR,0,0,50,65_.jpg]",[https://images-na.ssl-images-amazon.com/images/I/41O1luEZuHL.jpg],"Rich, black mineral mud, harvested from the banks of the Dead Sea, is comprised of layer upon layer of sedimentary clay formed over thousands of years. Captured within is an extremely high concentration of minerals, scientifically proven to be essential in maintaining healthy skin. Ahava Black Mineral Mud works deep to clean, purify and restore the skin's natural moisture balance, leaving it smooth, radiant and revitalized."
3,[],,"[This liquid soap with convenient pump dispenser is formulated with conditioning extracts of sage, rosemary, alfalfa, carrot, and cucumber. It deodorizes the skin and leaves it refreshed with a clean, herbal scent., You've watched your favorite gardeners spend hours lovingly pampering their plants, doting all weekend on tulips or carrots or tomatoes, but when was the last time they doted on themselves? Crabtree &amp; Evelyn comes to the rescue of gardeners' hands everywhere with their line of lavish skin-care products. Their hand soap is made from a vegetable-base blend and features sage, alfalfa, and cucumber extracts. It's packaged in a handy metal pump dispenser, and has been formulated to be gentle on a gardener's hands for pampering after a long day in the dirt. Combine this with other Crabtree &amp; Evelyn accessories like Gardeners Hand Therapy or Skin Remedy for a fantastic gift set. Indulge your favorite gardeners (or yourself) with this well-deserved treat. <i>--Ariel Meadow Stallings</i>, The aromatic benefits of herbs are varied and far-reaching, so we combined a whole bunch of them into one restoratively fragrant line-up straight from the garden., This cleansing, fragrant hand wash has the power to transport you away from the city and straight to the countryside thanks to its heady mix of herbal heavyweights. With clarifying cucumber, antioxidant-rich rosemary leaf and soothing aloe leaf juice, hands will be cleansed and revived with the freshest of scents. Like a restorative tonic, Gardeners Hand Soap is mild and gentle to leave you with petal-soft hands., How to use:, For thoroughly cleansed, silky soft hands, dab a pea-sized amount of soap onto skin and lather well under warm water. Rinse and pat dry. 
Combine the Hand Wash with our Exfoliating Hand Scrub and Moisturising Hand Therapy for the ultimate ritual., Originally created to appeal to a horticulturists wealth of knowledge about the healing power of herbs, this botanical range is formulated with cleansing cucumber extract, purifying rosemary extract, oak moss and refreshing sage extract., We search the world for natural ingredients and fragrance journeys that enable our customers to live a life cultivated. Inspired by the Crabapple Tree, the original species from which all cultivated apple trees have derived, and John Evelyn, the 17th century renaissance Englishman whose motto Explore Everything. Keep The Best has provided inspiration from our founding to this day., , ]",,"Crabtree &amp; Evelyn Hand Soap, Gardeners, 10.1 fl. oz.",[],,,[],[],"[B00004U9V2, B00GHX7H0A, B00FRERO7G, B00R68QXCS, B00KOBT82G, B071G8FG2N, B07FYFXBK8, B00TJ3NBN2, B07H3W9BM5, B074KGBGL7, B00EPG2QJI, B07GFJF1DN, B00GHXE4N8, B07DCCRGZT, B07GFHJRMX, B07BNL4LY4, B07JMLGRKY, B07DB7KXFV, B00R68QXJG, B00GHX58LK, B075MH4Q9L, B075YMT1ZY, B00K1C6D3A, B00KC8TPVA, B00GHX52MK, B074KDPT26, B074KJZCPH, B07CCNVW87, B074KK461V, B074KHCPLH, B00TJ3TF8C, B07LFXPK3N, B004MJVVBC, B0771SDCTB, B07CKMW2QH, B06XV5XTPQ, B0798FVV6V]","{'  Product Dimensions: ': '2.6 x 2.6 x 6.7 inches ; 1.5 pounds', 'Shipping Weight:': '12 ounces (', 'ASIN:': 'B00005A77F', 'Item model number:': '27810'}",Luxury Beauty,,NaT,$15.99,B00005A77F,"[https://images-na.ssl-images-amazon.com/images/I/31BBeRbXZsL._SX50_SY65_CR,0,0,50,65_.jpg, https://images-na.ssl-images-amazon.com/images/I/41Qwup7twjL._SX50_SY65_CR,0,0,50,65_.jpg]","[https://images-na.ssl-images-amazon.com/images/I/31BBeRbXZsL.jpg, https://images-na.ssl-images-amazon.com/images/I/41Qwup7twjL.jpg]","This liquid soap with convenient pump dispenser is formulated with conditioning extracts of sage, rosemary, alfalfa, carrot, and cucumber. It deodorizes the skin and leaves it refreshed with a clean, herbal scent. 
You've watched your favorite gardeners spend hours lovingly pampering their plants, doting all weekend on tulips or carrots or tomatoes, but when was the last time they doted on themselves? Crabtree &amp; Evelyn comes to the rescue of gardeners' hands everywhere with their line of lavish skin-care products. Their hand soap is made from a vegetable-base blend and features sage, alfalfa, and cucumber extracts. It's packaged in a handy metal pump dispenser, and has been formulated to be gentle on a gardener's hands for pampering after a long day in the dirt. Combine this with other Crabtree &amp; Evelyn accessories like Gardeners Hand Therapy or Skin Remedy for a fantastic gift set. Indulge your favorite gardeners (or yourself) with this well-deserved treat. <i>--Ariel Meadow Stallings</i> The aromatic benefits of herbs are varied and far-reaching, so we combined a whole bunch of them into one restoratively fragrant line-up straight from the garden. This cleansing, fragrant hand wash has the power to transport you away from the city and straight to the countryside thanks to its heady mix of herbal heavyweights. With clarifying cucumber, antioxidant-rich rosemary leaf and soothing aloe leaf juice, hands will be cleansed and revived with the freshest of scents. Like a restorative tonic, Gardeners Hand Soap is mild and gentle to leave you with petal-soft hands. How to use: For thoroughly cleansed, silky soft hands, dab a pea-sized amount of soap onto skin and lather well under warm water. Rinse and pat dry. Combine the Hand Wash with our Exfoliating Hand Scrub and Moisturising Hand Therapy for the ultimate ritual. Originally created to appeal to a horticulturists wealth of knowledge about the healing power of herbs, this botanical range is formulated with cleansing cucumber extract, purifying rosemary extract, oak moss and refreshing sage extract. We search the world for natural ingredients and fragrance journeys that enable our customers to live a life cultivated. 
Inspired by the Crabapple Tree, the original species from which all cultivated apple trees have derived, and John Evelyn, the 17th century renaissance Englishman whose motto Explore Everything. Keep The Best has provided inspiration from our founding to this day."
4,[],,"[Remember why you love your favorite blanket? The soft, comforting feeling of wrapping it around your shoulders gives you the instant happiness of a hug. Your hands deserve the same love. With every application, soy extract blended with dried milk solids and whipped to perfection greets your hands with loving hydration. A favorite among nutrition experts, soy extract is the primary ingredient in our Soy Milk Hand Crme. The proteins, amino acids and lipids in this high-powered bean allow for rapid hydration and skin regeneration. Natural jojoba esters and other premium ingredients are added to allow the cream to go on smoothly, without that greasy feel. The final result is a distinctive cream that you will love from the first time it's applied., Welcome to the world of Archipelago Botanicals - where warm candlelight, exquisite fragrance, and soothing products make a house a home., This beautician favorite collection features dried milk solids and natural proteins to gently nurture and soothe the skin, leaving it naturally soft and supple. Available in soy and oat proteins or in combination blends of the two., <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/e6bd20a0-5d75-4f4d-b4cb-621bfdf8d387._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br />, <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/d4c325ad-9d90-42f8-89a9-769fd1b3eda1._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br />, <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/cf60e661-f2c6-4e70-885b-e9e2009e8039._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br />, Soy protein makes the perfect additive for the skin! The amino acids and lipids found in this high-powered bean allow for rapid cell rehydration and regeneration. Our soy lotion is blended with dried milk solids to keep skin looking and feeling soft and supple. 
Natural jojoba esters and other premium ingredients are added to allow the lotion to go on smoothly, without that greasy feel. The final result is a distinctive lotion that youll appreciate the very first time that it touches your skin., Gentle Oat Proteins are the perfect solution for your dry skin! Our Oat Lotion is blended with dried Milk Solids to keep skin looking and feeling soft and supple. Natural Jojoba Esters and other premium ingredients, are added to allow the lotion to go on smoothly without leaving a greasy feel. The final result is a distinctive lotion that you will love the first time it's applied., The proteins, amino acids and lipids in this high-powered bean allow for rapid hydration and skin regeneration. Our hand cream is blended with dried milk solids to keep hands looking and feeling soft and supple. Our irresistibly scented hand treatment is highly recommended as part of your daily skin care routine for beautiful and soft hands., <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/4b4f2e9c-091d-4ca6-928e-cee8edf88107._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br />, <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/8b5e5e72-92c4-4678-97a4-94b9b4dee0c8._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br />, <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/905dc348-f67c-484b-80b9-fc0269e8df71._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br />, Our Oat Milk Hand Crme is perfect for dry or chapped hands! Gentle Oat Proteins are mixed with Aloe, resulting in one of the best moisturizing formulations on the market., Our Milk Hand Wash is the perfect soap to keep your hands looking healthy and clean. Dried Milk Solids and Natural Soy, Oat, and Rice proteins are combine to create this distinctive hand wash that you will love from the first time you use it. Apply liberally and work into a robust lather and rinse. 
Gentle enough for everyday use., Begin your daily beauty routine with a gentle, foaming cleanser from one of Archipelago Botanicals aromatherapy bath and body collections. This moisturizing body wash is highly recommended for dry skin. Also available in a larger, 33 oz bottle., , ]",,Soy Milk Hand Crme,"[B000NZT6KM, B001BY229Q, B008J724QY, B0009YGKJ2, B001JB55SQ, B000M3OR7C, B00J0A3ZCQ, B00SKBJ4L2, B00J0A3SMS, B008J720A4, B00J0A448K, B00NT183UQ, B01FCKKU3E, B01DSM1R6M, B001IJQR68, B01KZ20SZE, B002JU6IQO, B00J0A3JW2, B008J72D2Y, B003B3YBK8, B008J721L2, B0002PFDYQ, B00J9PYCJW, 0393326349, B001AH8CL6, B07HS8P7S4, B001IJOYJA, B00FJGGJXW, B000066SYB, B07BKNG24Z, B00GNW1MB0, B000YB6PQS, B00YWRKHPK]",,,[],"42,464 in Beauty &amp; Personal Care (",[],"{'  Product Dimensions: ': '7.2 x 2.2 x 7.2 inches ; 4 ounces', 'Shipping Weight:': '7.2 ounces (', 'Domestic Shipping: ': 'Currently, item can be shipped only within the U.S. and to APO/FPO addresses. For APO/FPO shipments, please check with the manufacturer regarding warranty and support issues.', 'International Shipping: ': 'This item can be shipped to select countries outside of the U.S.', 'ASIN:': 'B00005NDTD', 'Item model number:': '27418'}",Luxury Beauty,,NaT,$18.00,B00005NDTD,"[https://images-na.ssl-images-amazon.com/images/I/31agMAVCHtL._SX50_SY65_CR,0,0,50,65_.jpg, https://images-na.ssl-images-amazon.com/images/I/41xps4ua3ZL._SX50_SY65_CR,0,0,50,65_.jpg, https://images-na.ssl-images-amazon.com/images/I/413s80q%2BjRL._SX50_SY65_CR,0,0,50,65_.jpg, https://images-na.ssl-images-amazon.com/images/I/31GmuRIx5kL._SX50_SY65_CR,0,0,50,65_.jpg, https://images-na.ssl-images-amazon.com/images/I/31C6Z%2B9RuLL._SX50_SY65_CR,0,0,50,65_.jpg]","[https://images-na.ssl-images-amazon.com/images/I/31agMAVCHtL.jpg, https://images-na.ssl-images-amazon.com/images/I/41xps4ua3ZL.jpg, https://images-na.ssl-images-amazon.com/images/I/413s80q%2BjRL.jpg, https://images-na.ssl-images-amazon.com/images/I/31GmuRIx5kL.jpg, 
https://images-na.ssl-images-amazon.com/images/I/31C6Z%2B9RuLL.jpg]","Remember why you love your favorite blanket? The soft, comforting feeling of wrapping it around your shoulders gives you the instant happiness of a hug. Your hands deserve the same love. With every application, soy extract blended with dried milk solids and whipped to perfection greets your hands with loving hydration. A favorite among nutrition experts, soy extract is the primary ingredient in our Soy Milk Hand Crme. The proteins, amino acids and lipids in this high-powered bean allow for rapid hydration and skin regeneration. Natural jojoba esters and other premium ingredients are added to allow the cream to go on smoothly, without that greasy feel. The final result is a distinctive cream that you will love from the first time it's applied. Welcome to the world of Archipelago Botanicals - where warm candlelight, exquisite fragrance, and soothing products make a house a home. This beautician favorite collection features dried milk solids and natural proteins to gently nurture and soothe the skin, leaving it naturally soft and supple. Available in soy and oat proteins or in combination blends of the two. <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/e6bd20a0-5d75-4f4d-b4cb-621bfdf8d387._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br /> <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/d4c325ad-9d90-42f8-89a9-769fd1b3eda1._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br /> <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/cf60e661-f2c6-4e70-885b-e9e2009e8039._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br /> Soy protein makes the perfect additive for the skin! The amino acids and lipids found in this high-powered bean allow for rapid cell rehydration and regeneration. Our soy lotion is blended with dried milk solids to keep skin looking and feeling soft and supple. 
Natural jojoba esters and other premium ingredients are added to allow the lotion to go on smoothly, without that greasy feel. The final result is a distinctive lotion that youll appreciate the very first time that it touches your skin. Gentle Oat Proteins are the perfect solution for your dry skin! Our Oat Lotion is blended with dried Milk Solids to keep skin looking and feeling soft and supple. Natural Jojoba Esters and other premium ingredients, are added to allow the lotion to go on smoothly without leaving a greasy feel. The final result is a distinctive lotion that you will love the first time it's applied. The proteins, amino acids and lipids in this high-powered bean allow for rapid hydration and skin regeneration. Our hand cream is blended with dried milk solids to keep hands looking and feeling soft and supple. Our irresistibly scented hand treatment is highly recommended as part of your daily skin care routine for beautiful and soft hands. <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/4b4f2e9c-091d-4ca6-928e-cee8edf88107._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br /> <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/8b5e5e72-92c4-4678-97a4-94b9b4dee0c8._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br /> <img alt="""" src=""https://m.media-amazon.com/images/S/aplus-media/vc/905dc348-f67c-484b-80b9-fc0269e8df71._SL300__.jpg"" class=""a-spacing-mini"" />\n \n\n\n <br /> Our Oat Milk Hand Crme is perfect for dry or chapped hands! Gentle Oat Proteins are mixed with Aloe, resulting in one of the best moisturizing formulations on the market. Our Milk Hand Wash is the perfect soap to keep your hands looking healthy and clean. Dried Milk Solids and Natural Soy, Oat, and Rice proteins are combine to create this distinctive hand wash that you will love from the first time you use it. Apply liberally and work into a robust lather and rinse. Gentle enough for everyday use. 
Begin your daily beauty routine with a gentle, foaming cleanser from one of Archipelago Botanicals aromatherapy bath and body collections. This moisturizing body wash is highly recommended for dry skin. Also available in a larger, 33 oz bottle."


# Flan-T5

## Toy Example with Base Flan-T5

In [19]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

def run_toy_example(index=200, model_name="google/flan-t5-large", max_new_tokens=100):
    """Run the base (not fine-tuned) Flan-T5 model on one training sentence.

    Loads the tokenizer and model, prints the prompt taken from the ``sent``
    column of ``df_wikidata_train`` and prints the greedy-decoded generation.

    Args:
        index: Positional row of ``df_wikidata_train`` to use as the prompt.
        model_name: Hugging Face model id to load.
        max_new_tokens: Upper bound on the number of generated tokens.
    """
    # https://github.com/huggingface/transformers/issues/7002
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Quantized loading was tried and left disabled:
    #quantization_config = BitsAndBytesConfig(
    #    load_in_4bit=True,
    #    bnb_4bit_use_double_quant=False
    #)
    #tokenizer = AutoTokenizer.from_pretrained(model_name)
    #model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=quantization_config)

    # Read the cell value itself. Series.to_string() would prepend the row
    # index (e.g. "200    ...") and feed it to the model as part of the prompt.
    prompt = str(df_wikidata_train.iloc[index]['sent'])
    print(prompt)

    input_ids = tokenizer(prompt, return_tensors="pt", padding=True).input_ids
    outputs = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)

    # batch_decode expects the whole batch; passing outputs[0] would decode
    # every token id on its own and yield a list of word pieces.
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(decoded)

# run_toy_example()

## T5FineTuner Model

In [20]:
class T5FineTuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5 checkpoint with manual optimization.

    The module owns the Hugging Face model and tokenizer named by
    ``hparam.model_name_or_path``, builds its own dataloaders through the
    notebook-level ``get_dataset`` helper, and steps the optimizer and the
    linear learning-rate schedule itself inside ``training_step``.

    Args:
        hparam: Namespace-like object; the attributes read here are
            ``model_name_or_path``, ``weight_decay``, ``learning_rate``,
            ``adam_epsilon``, ``warmup_steps``, ``train_batch_size``,
            ``eval_batch_size``, ``n_gpu`` and ``num_train_epochs``.
    """

    def __init__(self, hparam):
        super().__init__()
        self.num_dataloader_workers = 6 # 6 CPU cores; original code used 2
        self.hparam = hparam

        self.model = T5ForConditionalGeneration.from_pretrained(
            hparam.model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(
            hparam.model_name_or_path
        )
        self.save_hyperparameters()

        # manual optimization
        self.automatic_optimization = False

        # Per-batch losses of the running training epoch / validation run.
        # They are kept apart so the training average is not computed from
        # validation losses.
        self.train_losses = []
        self.outputs = []

    def is_logger(self):
        """Tell callbacks that this module should write result files (always True)."""
        return True

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        """Delegate to the wrapped T5 model; ``lm_labels`` is passed as ``labels``."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _step(self, batch):
        """Compute the loss for one batch.

        Reads ``source_ids``, ``source_mask``, ``target_ids`` and
        ``target_mask`` from ``batch`` and returns the first element of the
        model output (the loss).
        """
        # Work on a copy so the caller's batch keeps its original padding ids.
        lm_labels = batch["target_ids"].clone()
        # Padding positions get -100, the label value the T5 loss ignores.
        lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        return outputs[0]

    # Manual Optimization
    def training_step(self, batch, batch_idx):
        """Run forward, backward, optimizer step and scheduler step for one batch."""
        self.log("batch_idx", batch_idx)
        loss = self._step(batch) # compute loss

        self.manual_backward(loss) # manual optimization

        # manual optimization, replaces optimizer_step(...) below
        optimizer = self.optimizers()
        optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step() # learning rate scheduler

        # Remember the loss for the epoch average (detached: only the value is needed).
        self.train_losses.append(loss.detach())

        self.log("train_loss",loss)
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}


    # NotImplementedError: Support for `training_epoch_end` has been removed in v2.0.0.
    # `T5FineTuner` implements this method. You can use the `on_train_epoch_end` hook instead.
    # To access outputs, save them in-memory as instance attributes. You can find migration examples
    # in https://github.com/Lightning-AI/lightning/pull/16520.
    def on_train_epoch_end(self):
        """Log the mean of the training losses collected during this epoch."""
        if not self.train_losses:
            # torch.stack raises on an empty list; nothing to report.
            return
        avg_train_loss = torch.stack(self.train_losses).mean()
        self.log("avg_train_loss", avg_train_loss)
        self.train_losses.clear()

    def on_validation_epoch_start(self) -> None:
        """Reset the list of validation losses before a validation run."""
        super().on_validation_epoch_start()
        self.outputs = []
        return

    def validation_step(self, batch, batch_idx):
        """Compute and record the loss of one validation batch."""
        loss = self._step(batch)
        self.outputs.append(loss.detach()) # results
        self.log("step_val_loss", loss)
        return {"val_loss": loss}

    def on_validation_epoch_end(self):
        """Log the mean validation loss of the finished validation run as ``val_loss``."""
        if not self.outputs:
            # torch.stack raises on an empty list; nothing to report.
            return
        avg_loss = torch.stack(self.outputs).mean()
        self.log("val_loss",avg_loss)

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        # Parameters whose name contains one of these fragments get no weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparam.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        # "AdamW" was deprecated and suggested to use "torch.optim.AdamW" instead
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                          lr=self.hparam.learning_rate, eps=self.hparam.adam_epsilon)
        # Kept so train_dataloader() can attach the scheduler to it.
        self.opt = optimizer
        return [optimizer]

    # When performing automatic optimization:
    #    Error: The closure hasn't been executed. HINT: did you call `optimizer_closure()` in your `optimizer_step` hook?
    #    It could also happen because the `optimizer.step(optimizer_closure)` call did not execute it internally.
    # See also optimizer closures: https://lightning.ai/docs/pytorch/stable/common/optimization.html#use-closure-for-lbfgs-like-optimizers
    # 
    # Replaced by training_step(...) with manual optimization.
    # NOTE(review): presumably never invoked while automatic_optimization is False - confirm before relying on it.
    def optimizer_step(self,
                       epoch=None,
                       batch_idx=None,
                       optimizer=None,
                       optimizer_idx=None,
                       optimizer_closure=None,
                       on_tpu=None,
                       using_native_amp=None,
                       using_lbfgs=None
                       ):
        """Legacy hook: step the optimizer with the closure, clear gradients, step the scheduler."""
        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step() # learning rate scheduler

    def get_tqdm_dict(self):
        """Return the progress-bar entries: formatted ``trainer.avg_loss`` and the last learning rate."""
        tqdm_dict = {"loss": "{:.3f}".format(
            self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the shuffled training dataloader and create the linear LR schedule.

        The schedule is created here because its length depends on the size of
        the training dataset; it is stored in ``self.lr_scheduler``.
        """
        train_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="train", args=self.hparam)
        dataloader = DataLoader(train_dataset, batch_size=self.hparam.train_batch_size,
                                drop_last=True, shuffle=True, num_workers=self.num_dataloader_workers)
        # training_step() steps the optimizer and the scheduler once per batch
        # (no gradient accumulation is performed), so the schedule must span
        # every batch. Dividing by hparam.gradient_accumulation_steps, as was
        # done before, made the learning rate decay to zero after a fraction
        # of the run.
        steps_per_epoch = len(dataloader.dataset) // (
            self.hparam.train_batch_size * max(1, self.hparam.n_gpu))
        t_total = steps_per_epoch * float(self.hparam.num_train_epochs)
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparam.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the validation dataloader from ``get_dataset(type_path="validation")``."""
        val_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="validation", args=self.hparam)
        return DataLoader(val_dataset, batch_size=self.hparam.eval_batch_size, num_workers=self.num_dataloader_workers)

In [21]:
# Colab alternative: "/content/drive/My Drive/Colab Notebooks/T5Ner"
# With output_dir="" Lightning falls back to /content/lightning_logs/version_n/,
# where n is the run number (0, 1, 2, 3...).
OUTPUT_DIR = "lightning_logs"

# Hyperparameters and paths shared by the model, the callbacks and the trainer.
args_dict = {
    # --- paths ---
    "data_dir": "wikiann",            # path for data files; unused for Text2KGBench
    "output_dir": OUTPUT_DIR,         # where checkpoints are saved
    "default_root_dir": OUTPUT_DIR,   # where checkpoints are saved
    # --- model / tokenizer ---
    "model_name_or_path": 'google/flan-t5-small',
    "tokenizer_name_or_path": 'google/flan-t5-small',  # t5-small
    "max_seq_length": 256,
    # --- optimizer / schedule ---
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    # --- batching / epochs ---
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 4,
    "num_dataloader_workers": 1,
    # Run val/checkpoint after a fixed fraction of the training batches. See
    # https://lightning.ai/docs/pytorch/stable/common/trainer.html#pytorch_lightning.trainer.Trainer.params.val_check_interval
    # For streaming data, set check_val_every_n_epoch=None and give
    # val_check_interval an int larger than the number of training batches.
    "val_check_interval": 0.05,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "early_stop_callback": False,     # https://lightning.ai/docs/pytorch/stable/advanced/speed.html
    # --- precision ---
    # For 16-bit training install apex, set fp_16=True and pick a sensible
    # max_grad_norm (0.5 is a good default).
    "fp_16": False,
    "max_grad_norm": 1,
    # Optimisation levels: https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    "seed": 42,
}

args = argparse.Namespace(**args_dict)

In [22]:
# to resume training in the middle if interrupted, or to load completed model from checkpoint
def load_model_from_checkpoint(CKPT_PATH, trainer=None):
    """Load a T5FineTuner from a checkpoint file and restore whatever state it carries.

    Args:
        CKPT_PATH: Path of the checkpoint file.
        trainer: Optional Trainer. When given, its step counter is set to the
            checkpoint's ``global_step`` so that resumed runs continue counting.

    Returns:
        The loaded model.
    """
    model = T5FineTuner.load_from_checkpoint(CKPT_PATH)

    # map_location keeps the raw checkpoint loadable on a machine without the
    # device it was saved from; load_state_dict copies onto the model's device.
    checkpoint = torch.load(CKPT_PATH, map_location="cpu")

    if trainer:
        # restore from checkpoint/previous training progress
        # See: https://github.com/Lightning-AI/pytorch-lightning/issues/12274
        global_step_offset = checkpoint["global_step"]
        trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset

    # Fix for warning:
    #     You're resuming from a checkpoint that ended before the epoch ended and your dataloader is not resumable. 
    #     This can cause unreliable results if further training is done. Consider using an end-of-epoch checkpoint 
    #     or make your dataloader resumable by implementing the `state_dict` / `load_state_dict` interface.
    # Src: https://github.com/Lightning-AI/pytorch-lightning/issues/2798
    if 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
    elif 'state_dict' in checkpoint:
        model.load_state_dict(checkpoint['state_dict'], strict=False)
    else:
        model.load_state_dict(checkpoint) # Checkpoint contains only model state dict, it's not stored in a dict

    # T5FineTuner creates `lr_scheduler` in train_dataloader() and stores its
    # optimizer as `opt` in configure_optimizers(), so neither attribute exists
    # on a freshly loaded model. Look them up defensively instead of raising
    # AttributeError.
    if 'lr_scheduler_state_dict' in checkpoint:
        scheduler = getattr(model, "lr_scheduler", None)
        if scheduler is not None:
            scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict'])
            print("Restored lr_scheduler_state_dict from checkpoint")
        else:
            print("Skipped lr_scheduler_state_dict: the model has no lr_scheduler yet")
    if 'optimizer_state_dict' in checkpoint:
        optimizer = getattr(model, "opt", None)
        if optimizer is None:
            optimizer = getattr(model, "optimizer", None)
        if optimizer is not None:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            print("Restored optimizer_state_dict from checkpoint")
        else:
            print("Skipped optimizer_state_dict: the model has no optimizer yet")

    print("Loaded model from checkpoint:", CKPT_PATH)
    return model

# Fine-Tune Model with Text2KGBench

This code is adapted from "T5 NER Finetuning" provided publicly at https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing and updated to work with pytorch_lightning v2.2.2

"T5 NER Finetuning" says of its model:
"Majority of the code here is adapted from [here](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) which uses the pytorch-lightning framework for training neural networks. T5 has shown that it can generate state of the art on many tasks as long as it can be cast as a text-to-text problem"

### Input Dataset: Tokenize and Preprocess

In [23]:
# T5 Tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path) #"t5-small")
print(tokenizer)

# Pick the training frame for the selected benchmark. Fail loudly on anything
# else: otherwise a `dataset` left over from an earlier run of this cell would
# be reused silently (or a NameError would surface two lines further down).
if TRAINING_DATASET == DatasetOptions.WIKIDATA:
    dataset = df_wikidata_train
elif TRAINING_DATASET == DatasetOptions.DBPEDIA:
    dataset = df_dbpedia_train
else:
    raise ValueError(f"Unsupported TRAINING_DATASET: {TRAINING_DATASET!r}")

#dbpedia_train_dataset = tokenize_dataset(tokenizer=tokenizer, dataset=df_dbpedia_train, type_path='train')
# Both splits are tokenized from the same frame; only type_path differs.
input_dataset = tokenize_dataset(tokenizer=tokenizer, dataset=dataset, type_path="train")
val_dataset = tokenize_dataset(tokenizer=tokenizer, dataset=dataset, type_path='val')

T5TokenizerFast(name_or_path='google/flan-t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>

In [24]:
# Below: original code adapted from "T5 NER Finetuning", kept for reference only.
# Its job is now done by tokenize_dataset(tokenizer, dataset, type_path).

#input_dataset = CustomDataset(tokenizer=tokenizer, dataset=dataset, type_path='train')
#val_dataset = CustomDataset(tokenizer=tokenizer, dataset=dataset, type_path='val')
#for i in range(len(input_dataset)):
#    _ = input_dataset[i]
#data = input_dataset[0]

#print(tokenizer.decode(data["source_ids"], skip_special_tokens=False))
#print(tokenizer.decode(data["target_ids"], skip_special_tokens=False))

# (`data` above was presumably a dict of tensors - TODO confirm.)
# Show which dataset is currently selected for training.
print(TRAINING_DATASET)

DatasetOptions.WIKIDATA


In [25]:
# called by LightningModule internally
def get_dataset(tokenizer, type_path, args):
    """Return the dataset the LightningModule should iterate over.

    Caps the tokenizer's ``max_length`` and ``model_max_length`` at
    ``args.max_seq_length`` and returns the notebook-level ``input_dataset``.

    NOTE: ``type_path`` is ignored - the pre-tokenized training set is
    returned for "validation" as well (the ``val_dataset`` branch and the
    ``load_dataset(args.data_dir, "en")`` call were disabled).
    """
    length_cap = args.max_seq_length
    for attribute in ("max_length", "model_max_length"):
        setattr(tokenizer, attribute, length_cap)
    return input_dataset

## Fine-Tune Model

In [26]:
# !mkdir -p t5_ner

### Set Up Logging

In [58]:
# Write an end-of-epoch checkpoint every this many epochs.
NUM_EPOCHS_TO_SAVE_AFTER = 1

# Write a step checkpoint every this many training steps. The value is chosen
# to divide the number of steps per epoch evenly (666 / 37 = 18 checkpoints).
if TRAINING_DATASET == DatasetOptions.WIKIDATA:
    NUM_STEPS_TO_SAVE_AFTER = 37 # df_wikidata_train: 666 steps per epoch
elif TRAINING_DATASET == DatasetOptions.DBPEDIA:
    NUM_STEPS_TO_SAVE_AFTER = 37 # df_dbpedia_train: assumed 666 steps per epoch as well - TODO confirm
else:
    # Without this branch the constant stays undefined (or stale from an
    # earlier run) and the failure only shows up when the callbacks are built.
    raise ValueError(f"Unsupported TRAINING_DATASET: {TRAINING_DATASET!r}")

logger = logging.getLogger(__name__)

In [59]:
class LoggingCallback(pl.Callback):
    """Append the trainer's callback metrics to a text file after validation and test runs.

    Files are written to ``args.output_dir`` (notebook-level ``args``):
    ``val_results.txt`` after validation, ``test_results.txt`` after testing.
    Nothing is written when ``pl_module.is_logger()`` is false.
    """

    def _append_metrics(self, trainer, file_name):
        """Log every callback metric and append it as ``key = value`` to ``file_name``."""
        metrics = trainer.callback_metrics

        # open(..., "a") does not create missing directories; the first
        # validation can run before any checkpoint has created the folder.
        if args.output_dir:
            os.makedirs(args.output_dir, exist_ok=True)

        results_file = os.path.join(args.output_dir, file_name)
        with open(results_file, "a") as writer:
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    line = "{} = {}\n".format(key, str(metrics[key]))
                    logger.info(line)
                    writer.write(line)

    def on_validation_end(self, trainer, pl_module):
        """Record the metrics available at the end of a validation run."""
        print("Logging validation results")
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            self._append_metrics(trainer, "val_results.txt")

    def on_test_end(self, trainer, pl_module):
        """Record the metrics available at the end of a test run."""
        print("Logging testing results")
        logger.info("***** Test results *****")
        if pl_module.is_logger():
            self._append_metrics(trainer, "test_results.txt")

In [60]:
# checkpoints, pl.callbacks.ModelCheckpoint
# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html

# Step checkpoints are written every NUM_STEPS_TO_SAVE_AFTER training steps.
# Keeping all of them (save_top_k=-1) piles up many full checkpoints per epoch;
# the training run recorded in this notebook ended with
# "OSError: [Errno 28] No space left on device". Keep only the best few by
# train_loss, plus a `last.ckpt` so an interrupted run can still be resumed.
MAX_STEP_CHECKPOINTS_TO_KEEP = 3

training_checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir,
    filename="{epoch}-{step}-{train_loss:.2f}", # defaults to None or '{epoch}-{step}'
    monitor="train_loss",
    mode="min",
    save_top_k=MAX_STEP_CHECKPOINTS_TO_KEEP,
    save_last=True,
    save_on_train_epoch_end=True,
    every_n_train_steps=NUM_STEPS_TO_SAVE_AFTER # checkpoint every n training steps
)
# One checkpoint per NUM_EPOCHS_TO_SAVE_AFTER epochs; these are few, so all are kept.
epoch_checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir,
    filename="{epoch}-{step}-{val_loss:.2f}", # defaults to None or '{epoch}-{step}'
    monitor="val_loss",
    mode="min",
    save_top_k=-1, #save all of them
    every_n_epochs=NUM_EPOCHS_TO_SAVE_AFTER, # checkpoint every n epochs
)
logging_callback = LoggingCallback()

# old
#checkpoint_callback = pl.callbacks.ModelCheckpoint(
#    filename=args.output_dir+"/checkpoint.pth",
#    monitor="val_loss",
#    mode="min",
#    save_top_k=5
#)

# TypeError: Trainer.__init__() got an unexpected keyword argument 'gpus'
# TypeError: Trainer.__init__() got an unexpected keyword argument 'checkpoint_callback'
# https://lightning.ai/docs/pytorch/stable/common/trainer.html

# error with dbpedia_train_dataset:
#   self.trainer.num_training_batches == 0
#   `Trainer.fit` stopped: No training batches.

# Keyword arguments for pl.Trainer. Options that the installed Lightning
# version rejects, or that manual optimization does not support, are left
# commented out together with the reason.
train_params = dict(
    #accumulate_grad_batches=args.gradient_accumulation_steps, # not supported with manual optmization
    #gpus=args.n_gpu,                                          # Trainer: unexpected keyword argument
    max_epochs=args.num_train_epochs,
    #early_stop_callback=False,                                # initially commented out
    precision=32,
      # precision='bf16-mixed' if args.fp_16 else 32,
      # `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
      # You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
    #amp_level=args.opt_level,                                 # initially commented out
    #gradient_clip_val=args.max_grad_norm,                     # not supported with manual optimization
    gradient_clip_val=0,
    #checkpoint_callback=checkpoint_callback,                  # Trainer: unexpected keyword argument
    callbacks=[training_checkpoint_callback, epoch_checkpoint_callback, logging_callback],
    #num_sanity_val_steps=0,
      # skip the sanity check and go straight to training
      # removed; was causing the error: Total length of `DataLoader` across ranks is zero. Please make sure this was your intention.
    # accelerator="gpu", devices=1                             # run on 1 gpu - No supported gpu backend found!
    # check_val_every_n_epoch=1/18 # deprecated for val_check_interval between 0 and 1
    #val_check_interval=1
        # set to less than 1, e.g. 1/18 to validate multiple times in an epoch.
        # `Trainer(val_check_interval=1)` was configured so validation will run after every batch.
)

### Load Model and Train

In [61]:
# The checkpoint to load the model from. Set to None to load from scratch.
#  e.g. "lightning_logs/version_26/epoch=1-step=1089-train_loss=0.48.ckpt"
#
# History of runs and the checkpoint each one started from.
# Model fine-tuned on Wikidata:
#  version 33: CKPT_PATH = "lightning_logs/version_32/epoch=0-step=666-val_loss=0.34.ckpt"
#  version 34: CKPT_PATH = "lightning_logs/version_33/epoch=2-step=1998-val_loss=0.25.ckpt"
#  version 36: CKPT_PATH = "lightning_logs/version_34/epoch=3-step=2072-train_loss=0.35.ckpt"
# Model fine-tuned on DBPedia:
#  version 37: (from scratch)

# None: the training cell builds a fresh T5FineTuner instead of resuming.
CKPT_PATH = None
#CKPT_PATH = "lightning_logs/version_34/epoch=3-step=2072-train_loss=0.35.ckpt"

In [62]:
print(f"Training initiated for...\n  Model: {args.model_name_or_path}\n  Dataset: {TRAINING_DATASET}\n  Checkpoint: {CKPT_PATH}")

trainer = pl.Trainer(**train_params)

if CKPT_PATH:
    # Resume: rebuild the model from the checkpoint, then let the trainer
    # restore its own loop state from the same file.
    model = load_model_from_checkpoint(CKPT_PATH, trainer)

    # RuntimeError: T5FineTuner is not attached to a `Trainer
    # lightning  model object has no attribute 'lr_scheduler'
    # etc.
    #if model.trainer.lr_schedulers: #config.train.lrScheduler.name == 'StepLR':
    #    model.trainer.lr_schedulers.last_epoch = checkpoint['epoch']
    #    print("Restored lr_scheduler epoch from checkpoint")
    
    trainer.fit(model, ckpt_path=CKPT_PATH)
else:
    model = T5FineTuner(args)
    # Report how the model was created before the (long) training run starts,
    # not after trainer.fit() has returned.
    print("Loaded from scratch.")
    trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Training initiated for...
  Model: google/flan-t5-small
  Dataset: DatasetOptions.DBPEDIA
  Checkpoint: None



  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 77.0 M
-----------------------------------------------------
77.0 M    Trainable params
0         Non-trainable params
77.0 M    Total params
307.845   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

Logging validation results


Training: |                                                                                                   …

Validation: |                                                                                                 …

Logging validation results


Validation: |                                                                                                 …

Logging validation results


OSError: [Errno 28] No space left on device

In [41]:
# Manually checkpoint the model after training, in addition to the
# checkpoints written by the ModelCheckpoint callbacks.
# NOTE(review): the path is hard-coded rather than derived from OUTPUT_DIR.
trainer.save_checkpoint("lightning_logs/final.ckpt")
print("Saved")


# Read checkpoint information (disabled; kept for inspecting a checkpoint by hand)
#checkpoint = torch.load(CKPT_PATH)
#global_step_offset = checkpoint["global_step"]
#print(global_step_offset)
#print(checkpoint['state_dict'])

Saved


# (old, moved) Load the Stored Model and Evaluate

In [None]:
# Inspect the top-level keys stored in a checkpoint file.
# CKPT_PATH must point to an existing file; it is None when training from scratch.
#CKPT_PATH = "lightning_logs/version_30/final.ckpt"
checkpoint = torch.load(CKPT_PATH)
print(checkpoint.keys())

In [None]:
# Rebuild the fine-tuned model from the checkpoint at CKPT_PATH
# (replaces the `model` variable left over from training).
model = T5FineTuner.load_from_checkpoint(CKPT_PATH)
print("Done")
#print(model.keys()) #'T5FineTuner' object has no attribute 'keys'

#model.eval() # disable randomness, dropout, etc...
#y_hat = model(x) # predict with the model

In [63]:
# Build the tokenized test splits of both benchmarks and choose the one to evaluate on.
#evaluation_dataset = input_dataset

dbpedia_test_dataset = CustomDataset(tokenizer=tokenizer, dataset=df_dbpedia_test, type_path='test')
wikidata_test_dataset = CustomDataset(tokenizer=tokenizer, dataset=df_wikidata_test, type_path='test')
# The evaluation cells that follow read `evaluation_dataset`.
evaluation_dataset = wikidata_test_dataset

In [64]:
import textwrap

# Without an explicit budget generate() stops at the model's default maximum
# length, which cuts predictions off well before the reference triples end.
MAX_GENERATED_TOKENS = 256
# How many decoded examples to print.
NUM_EXAMPLES_TO_SHOW = 10

dataloader = DataLoader(evaluation_dataset, batch_size=32, num_workers=2, shuffle=True)
model.model.eval()
model = model.to("cpu")
outputs = []
targets = []
texts = []
for batch in dataloader:

    outs = model.model.generate(input_ids=batch['source_ids'],
                                attention_mask=batch['source_mask'],
                                max_new_tokens=MAX_GENERATED_TOKENS)
    dec = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip() for ids in outs]
    target = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["target_ids"]]
    text = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["source_ids"]]
    texts.extend(text)
    outputs.extend(dec)
    targets.extend(target)
    # Only one (random) batch is needed for the preview.
    break

# Index the accumulated lists (not the per-batch `target`) and never read past
# the number of decoded examples.
for i in range(min(NUM_EXAMPLES_TO_SHOW, len(texts))):
    c = texts[i]
    lines = textwrap.wrap("text:\n%s\n" % c, width=100)
    print("\n".join(lines))
    print("\nActual Entities: %s" % targets[i])
    print("Predicted Entities: %s" % outputs[i])
    print("=====================================================================\n")


text: "little bad girl" is a single by french dj david guetta, featuring vocals from english
recording artist taio cruz and american rapper ludacris.

Actual Entities: performer: david guetta, performer: ludacris, performer: taio cruz, lyrics by: david guetta, producer: david guetta
Predicted Entities: performer: taio cruz

text: the novel is set during world war ii, with most of the action occurring on or near a fictional
army air forces base in central florida.

Actual Entities: genre: novel, narrative location: florida
Predicted Entities: narrative location: central florida

text: mandubracius or mandubratius was a king of the trinovantes of south-eastern britain in the 1st
century bc.

Actual Entities: ethnic group: trinovantes
Predicted Entities: languages spoken, written or signed: bc

text: tomoyuki yamashita ( , yamashita tomoyuki, 8 november 1885 - 23 february 1946; also called
tomobumi yamashita) was a japanese general of the imperial japanese army during world war ii.

Actua

## Calculate Metrics
The following code calculates metrics for the WikiANN dataset, which I am not using.

In [24]:
def find_sub_list(sl, l):
    """Find every occurrence of the list ``sl`` inside the list ``l``.

    Args:
        sl: The sub-list to look for.
        l: The list to search in.

    Returns:
        A list of ``(start, end)`` index pairs, ``end`` inclusive, one per
        occurrence in order of appearance. Empty when ``sl`` is empty or does
        not occur.
    """
    sll = len(sl)
    if sll == 0:
        # An empty pattern has no first element to anchor on; report no match
        # instead of raising IndexError.
        return []
    return [
        (ind, ind + sll - 1)
        for ind in range(len(l) - sll + 1)
        if l[ind:ind + sll] == sl
    ]

def generate_label(input: str, target: str):
    """Turn a ``"type: entity; type: entity"`` string into BIO tags for ``input``.

    ``input`` is split on single spaces into tokens and ``target`` on ``"; "``
    into entities. For every entity of the form ``"<type>: <text>"`` whose type
    is DATE, PER, ORG or LOC (case-insensitive) and whose text occurs in the
    tokens, the first occurrence is tagged ``B-<TYPE>`` followed by
    ``I-<TYPE>``. Malformed entities, unknown types and entities that do not
    occur are skipped.

    Args:
        input: The sentence (the name shadows the builtin; kept for callers).
        target: The entity string.

    Returns:
        A list of tag strings, one per token of ``input``.
    """
    mapper = {'O': 0, 'B-DATE': 1, 'I-DATE': 2, 'B-PER': 3,
              'I-PER': 4, 'B-ORG': 5, 'I-ORG': 6, 'B-LOC': 7, 'I-LOC': 8}
    inv_mapper = {v: k for k, v in mapper.items()}

    tokens = input.split(" ")
    label_ids = [mapper['O']] * len(tokens)

    for ent in target.split("; "):
        parts = ent.split(": ")
        if len(parts) < 2:
            # No "type: text" separator - nothing to tag.
            continue

        tag = parts[0].upper()
        begin_key, inside_key = f"B-{tag}", f"I-{tag}"
        if begin_key not in mapper or inside_key not in mapper:
            # Entity type outside the tag set.
            continue

        spans = find_sub_list(parts[1].split(" "), tokens)
        if not spans:
            # Entity text does not occur in the sentence.
            continue

        # Only the first occurrence is tagged.
        start, end = spans[0]
        label_ids[start] = mapper[begin_key]
        for i in range(start + 1, end + 1):
            label_ids[i] = mapper[inside_key]

    return [inv_mapper[j] for j in label_ids]

In [29]:
from tqdm import tqdm

# KeyError: 'rel_label'
test_dataset = CustomDataset(tokenizer=tokenizer, dataset=df_wikidata_test, type_path='test') #WikiAnnDataset(tokenizer=tokenizer, dataset=dataset, type_path='test')
test_loader = DataLoader(test_dataset, batch_size=32,
                             num_workers=2, shuffle=True)
model.model.eval()
#model = model.to("cuda") # cuda not supported on my machine


def _bio_labels(text, entities):
    """Return the BIO tags of ``text`` for the decoded entity string ``entities``.

    The literal answer 'none' maps to all-"O" tags. The tokens are counted
    with split(" "), exactly as generate_label does, so the true and predicted
    tag sequences of one sentence always have the same length.
    """
    text = text.strip()
    entities = entities.strip()
    if entities != 'none':
        return generate_label(text, entities)
    return ["O"] * len(text.split(" "))


outputs = []
targets = []
all_text = []
true_labels = []
pred_labels = []
for batch in tqdm(test_loader):
    input_ids = batch['source_ids'] #.to("cuda")
    attention_mask = batch['source_mask'] #.to("cuda")
    outs = model.model.generate(input_ids=input_ids,
                                attention_mask=attention_mask)
    dec = [tokenizer.decode(ids, skip_special_tokens=True,
                            clean_up_tokenization_spaces=False).strip() for ids in outs]
    target = [tokenizer.decode(ids, skip_special_tokens=True,  clean_up_tokenization_spaces=False).strip()
                for ids in batch["target_ids"]]
    texts = [tokenizer.decode(ids, skip_special_tokens=True,  clean_up_tokenization_spaces=False).strip()
                for ids in batch["source_ids"]]
    # Tag every sentence twice: once from the reference, once from the prediction.
    true_label = [_bio_labels(texts[i], target[i]) for i in range(len(texts))]
    pred_label = [_bio_labels(texts[i], dec[i]) for i in range(len(texts))]

    outputs.extend(dec)
    targets.extend(target)
    true_labels.extend(true_label)
    pred_labels.extend(pred_label)
    all_text.extend(texts)

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "C:\Users\lawfu\AppData\Local\Programs\Python\Python39\lib\multiprocessing\queues.py", line 239, in _feed
    reader_close()
  File "C:\Users\lawfu\AppData\Local\Programs\Python\Python39\lib\multiprocessing\connection.py", line 182, in close
    self._close()
  File "C:\Users\lawfu\AppData\Local\Programs\Python\Python39\lib\multiprocessing\connection.py", line 282, in _close
    _CloseHandle(self._handle)
OSError: [WinError 6] The handle is invalid

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\lawfu\AppData\Local\Programs\Python\Python39\lib\threading.py", line 980, in _bootstrap_inner
    self.run()
  File "C:\Users\lawfu\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "C:\Users\lawfu\AppData\Local\Programs\Python\Python39\lib\threadin

In [31]:
# Spot-check the second decoded source text collected in `all_text`.
all_text[1]

'a cauldron of witches is a 1988 anthology of 12 fairy tales from around the world that have been collected and retold by ruth manning-sanders.'

In [34]:
from datasets import load_metric

# NOTE: `load_metric` is deprecated and will be removed in the next major
#   version of `datasets`; the replacement is `evaluate.load` from the
#   Hugging Face Evaluate library: https://huggingface.co/docs/evaluate
# The seqeval metric runs custom code from its repository, so
#   `trust_remote_code=True` is passed explicitly (it becomes mandatory in the
#   next major release of `datasets`). The code can be inspected at
#   https://raw.githubusercontent.com/huggingface/datasets/2.18.0/metrics/seqeval/seqeval.py
metric = load_metric("seqeval", trust_remote_code=True)

num_to_check = 10
# Bound the preview by the number of collected examples so that a test set
# smaller than `num_to_check` does not raise IndexError.
for i in range(min(num_to_check, len(all_text))):
    print(i)
    print(f"Text:  {all_text[i]}")
    print(f"Predicted Token Class:  {pred_labels[i]}")
    print(f"True Token Class:  {true_labels[i]}")
    print("=====================================================================\n")

print(metric.compute(predictions=pred_labels, references=true_labels))



Using the latest cached version of the module from C:\Users\lawfu\.cache\huggingface\modules\datasets_modules\metrics\seqeval\9642e8a602ba52bd4d8baee1d13b2deb8247d3719041cf02b40bf8367a05aef5 (last modified on Wed Apr 24 13:57:23 2024) since it couldn't be found locally at seqeval, or remotely on the Hugging Face Hub.


0
Text:  pour la suite du monde (also known as for those who will follow; of whales, the moon, and men, or the moontrap in english) is a 1963 canadian documentary film directed by michel brault, marcel carri re and pierre perrault.
Predicted Token Class:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True Token Class:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

1
Text:  a cauldron of witches is a 1988 anthology of 12 fairy tales from around the world that have been collected and retold by ruth manning-sanders.
Predicted Token Class:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True Token Class:  [

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 1.0}
