In [2]:
from keytotext import pipeline
import keybert

import os
import re
import time
import typing
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 42


The dataset corpus used is based on [Wikipedia business corpus](https://gricad-gitlab.univ-grenoble-alpes.fr/getalp/wikipediacompanycorpus/-/tree/master/). There are two approaches used here: either use `KeyBert` to generate keywords of the abstract text, or use the dataset's `infobox` to generate the keywords.

This notebook assumes the wikipedia business corpus has been saved into `data/wikiepedia_companies`.

## Using KeyBert to generate keywords

In [29]:
kw_model = keybert.KeyBERT("all-mpnet-base-v2")

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 2.18MB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 135kB/s]
Downloading: 100%|██████████| 10.6k/10.6k [00:00<00:00, 10.7MB/s]
Downloading: 100%|██████████| 571/571 [00:00<00:00, 287kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 53.4kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 110kB/s] 
Downloading: 100%|██████████| 438M/438M [00:33<00:00, 13.2MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 64.9kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 130kB/s]
Downloading: 100%|██████████| 466k/466k [00:02<00:00, 161kB/s]  
Downloading: 100%|██████████| 363/363 [00:00<00:00, 193kB/s]
Downloading: 100%|██████████| 13.1k/13.1k [00:00<00:00, 4.67MB/s]
Downloading: 100%|██████████| 232k/232k [00:05<00:00, 41.0kB/s] 
Downloading: 100%|██████████| 349/349 [00:00<00:00, 122kB/s]


In [2]:
with open('data/wikipedia_companies/train_abstract.txt', 'r') as f:
    train = f.readlines()

In [3]:
train[0]

'cmc connect burson-marsteller is a premier perception management firm that provides communication solutions . the company was founded in 1995 by yomi badejo okusanya in ikeja , lagos , nigeria . cmc connect have exclusive affiliation with burson-marsteller a leading global public relations firm and became cmc connect burson-marsteller in 2015 .\n'

In [105]:
i = 150
n = 8
n_gram = (1,2)

with_mmr = kw_model.extract_keywords(
    train[i], 
    keyphrase_ngram_range=n_gram, 
    top_n=8, 
    stop_words="english", 
    highlight=True,
    use_mmr=True,
    diversity=0.7
)

with_maxsum = kw_model.extract_keywords(
    train[i], 
    keyphrase_ngram_range=n_gram, 
    top_n=5, 
    stop_words="english", 
    highlight=True,
    use_maxsum=True,
)

In [106]:
with_mmr, with_maxsum

([('tompkins trust', 0.7036),
  ('york', 0.3764),
  ('bank traces', 0.3091),
  ('1864 reorganized', 0.1689),
  ('state', 0.1163),
  ('services', 0.1083),
  ('parent', 0.0809),
  ('act new', 0.0577)],
 [('banks insurance', 0.3737),
  ('york', 0.3764),
  ('based ithaca', 0.4014),
  ('financial corporation', 0.4192),
  ('parent tompkins', 0.48)])

Estimate time taken to extract keywords from the dataset in order to determine the appropriate parameters to use for the entire dataset.

In [82]:


def estimate_time_taken(params: typing.Dict) -> pd.Series:
    num_rows = []
    time_taken = []
    for n in [5, 6, 7, 8]:
        for i in [32, 64, 128, 256, 512]:
            ns.append(n)
            num_rows.append(i)
            start = time.time()
            kw_model.extract_keywords(
                train[:i], 
                keyphrase_ngram_range=(1, 2), 
                top_n=n, 
                stop_words="english", 
                highlight=True,
                **params
            )
            time_taken.append(round(time.time() - start, 2))
            print(f"top_n = {n}, batch_size = {i}, took {time_taken[-1]}s")
    
    # prepare multiindex
    tuples = list(zip(ns, num_rows))
    index = pd.MultiIndex.from_tuples(tuples, names=["top_n", "batch_size"])
    return pd.Series(time_taken, index=index)


In [83]:
with_mmr = estimate_time_taken({"use_mmr": True, "diversity": 0.6})
with_mmr

top_n = 5, batch_size = 32, took 0.51s
top_n = 5, batch_size = 64, took 0.82s
top_n = 5, batch_size = 128, took 1.89s
top_n = 5, batch_size = 256, took 2.23s
top_n = 5, batch_size = 512, took 4.86s
top_n = 6, batch_size = 32, took 0.34s
top_n = 6, batch_size = 64, took 0.83s
top_n = 6, batch_size = 128, took 1.43s
top_n = 6, batch_size = 256, took 1.98s
top_n = 6, batch_size = 512, took 4.85s
top_n = 7, batch_size = 32, took 0.35s
top_n = 7, batch_size = 64, took 1.19s
top_n = 7, batch_size = 128, took 1.2s
top_n = 7, batch_size = 256, took 1.94s
top_n = 7, batch_size = 512, took 5.53s
top_n = 8, batch_size = 32, took 0.39s
top_n = 8, batch_size = 64, took 0.69s
top_n = 8, batch_size = 128, took 1.1s
top_n = 8, batch_size = 256, took 2.06s
top_n = 8, batch_size = 512, took 5.28s


top_n  batch_size
5      32            0.51
       64            0.82
       128           1.89
       256           2.23
       512           4.86
6      32            0.34
       64            0.83
       128           1.43
       256           1.98
       512           4.85
7      32            0.35
       64            1.19
       128           1.20
       256           1.94
       512           5.53
8      32            0.39
       64            0.69
       128           1.10
       256           2.06
       512           5.28
dtype: float64

In [None]:
with_maxsum = estimate_time_taken({'use_maxsum': True, "nr_candidates": 5})
with_maxsum

## Using infobox and some preprocessing
We will use the infobox dataset information to extract the relevant keywords, whilst ignoring other properties.

In [2]:
with open("data/wikipedia_companies/train_infobox.txt", 'r') as f:
    infobox = f.readlines()

In [5]:
infobox[2]

'name1[ ic group ], headquarters1[ copenhagen denmark ], founded1[ 2001 ], industry1[ fashion ], type1[ company ], key people1[ mads ryder (ceo) ], products1[ clothing ]\n'

In [8]:

exclude = ('found', 'key people', 'number', 'owner', 'defunct')

def to_include(row: str):
    return not row.strip().startswith(exclude)

def extract(s: str):
    # extracts the keywords between [ ]
    # then strips the keywords of whitespaces
    t = re.search(r'(?<=\[).+?(?=\])', s)
    return s[t.start() + 1: t.end() - 1]

def parse_string(s: str):
    # 1. split each company (item within infobox) into a list of its properties
    props = s.split(',')

    # 2. remove certain properties
    props = list(filter(to_include, props))

    # 3. use regex to extrac the keyword from within each property
    keywords = map(extract, props)
    return list(keywords)

In [7]:
infobox[3], parse_string(infobox[3])

('name1[ bce inc. ], headquarters1[ canada ], founded1[ 1983 ], industry1[ telecommunications ], industry2[ mass media ], key people1[ george cope (ceo) ], products1[ fixed line and ], products2[ mobile telephony ], products3[ internet services ], founder1[ charles fleetford sise ]\n',
 ['bce inc.',
  'canada',
  'telecommunications',
  'mass media',
  'fixed line and',
  'mobile telephony',
  'internet services'])

In [4]:
def get_keywords_from_infobox(infobox: typing.List[str], batch_size: int = 512):
    for i in range(0, len(infobox), batch_size):
        yield list(map(parse_string, infobox[i: i + batch_size]))

In [5]:
infobox_keywords = list(map(parse_string, infobox))
len(infobox_keywords)

35384

In [6]:
infobox_keywords[0]

['cmc connect burson-marsteller',
 'ikeja lagos lagos nigeria',
 'media',
 'perception management',
 'public relations']

In [12]:
def create_csv(keywords, texts, filename: str):
    keywords_str = [" ".join(keyword_list).replace(",", "") for keyword_list in keywords]
    df = pd.DataFrame({"keywords": keywords_str, "text": texts})

    dest_path = os.path.join('data', 'finetune', filename)
    df.to_csv(dest_path, index=False)

def make_datasets():
    for dataset in ['train', 'dev', 'test']:
        infobox_fn = f"{dataset}_infobox.txt"
        abstract = f"{dataset}_abstract.txt"

        infobox_path = os.path.join("data", "wikipedia_companies", infobox_fn)
        body_path = os.path.join("data", "wikipedia_companies", abstract)

        with open(infobox_path, 'r') as info_f:
            infobox = info_f.readlines()
        
        with open(body_path, 'r') as body_f:
            body = body_f.readlines()
        
        infobox_keywords = list(map(parse_string, infobox))
        create_csv(infobox_keywords, body, f"{dataset}.csv")

In [24]:
def sanity_check(directory_of_csvs: str):
    import glob

    csvs = glob.glob(f"{directory_of_csvs}/*")
    for file in csvs:
        df = pd.read_csv(file)
        dataset = file.rsplit('/', 1)[-1].split('.')[0]

        print(f"{dataset} has {len(df)} rows.")
        print(f"Empty or null values: {df.isnull().value_counts()}")


In [13]:
make_datasets()

In [25]:
sanity_check("data/finetune/")

train has 35384 rows.
Empty or null values: keywords  text 
False     False    35384
dtype: int64
test has 4368 rows.
Empty or null values: keywords  text 
False     False    4368
dtype: int64
dev has 3929 rows.
Empty or null values: keywords  text 
False     False    3929
dtype: int64


In [14]:
df = pd.read_csv('data/finetune/test.csv')
df.head()

Unnamed: 0,keywords,text
0,new kolb aircraft aerospace company kit aircraft,the new kolb aircraft company is an american a...
1,kum & go retail convenience stores,"kum may refer to : kum , a serbian and ukraini..."
2,canadian tire financial services oakville fina...,"canadian tire financial services ltd. , doing ..."
3,bronner bros. marietta georgia private african...,the bronner bros. enterprise is one of the lar...
4,rgb entertainment argentina television product...,rgb entertainment is a production company from...


# Finetuning

In [None]:
!wandb login

In [3]:
os.environ["WANDB_API_KEY"]=""

In [4]:
train_df = pd.read_csv("data/finetune/train.csv")
dev_df = pd.read_csv("data/finetune/dev.csv")

In [5]:
from keytotext import trainer

In [6]:
model = trainer()
model.from_pretrained("t5-base")
model.train(
    train_df, 
    dev_df, 
    batch_size=4, 
    max_epochs=5,
    use_gpu=True
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mashrielbrian[0m. Use [1m`wandb login --relogin`[0m to force relogin


  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 4: 100%|██████████| 9829/9829 [38:59<00:00,  4.20it/s, loss=1.92, v_num=rrtt, train_loss=1.740, val_loss=2.010]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 9829/9829 [39:02<00:00,  4.20it/s, loss=1.92, v_num=rrtt, train_loss=1.740, val_loss=2.010]


In [35]:
model.save_model()

In [37]:
from huggingface_hub import Repository

# requires git-lfs
# on ubuntu, to install git lfs:
# 1. curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
# 2. sudo apt-get install git-lfs
# 3. git-lfs install 

token = "" # huggingface token
model_repo = Repository(
    "model",
    "ashrielbrian/t5-base-wikipedia-companies-keywords",
    token=token,
    git_user="ashrielbrian"
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

/home/brian/Documents/keytotext/model is already a clone of https://huggingface.co/ashrielbrian/t5-base-wikipedia-companies-keywords. Make sure you pull the latest changes with `repo.git_pull()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [39]:
from pathlib import Path


model_name = "t5-base-wikipedia-companies-keywords"
readme_txt = f"""
        ---
            language: "en"
            thumbnail: "Keywords to Sentences"
            tags:
            - keytotext
            - k2t
            - Keywords to Sentences

            model-index:
            - name: {model_name}
            ---

            Idea is to build a model which will take keywords as inputs and generate sentences as outputs.

            Potential use case can include: 
            - Marketing 
            - Search Engine Optimization
            - Topic generation etc.
            - Fine tuning of topic modeling models 
        """.strip()

(Path(model_repo.local_dir) / "README.md").write_text(readme_txt)


579

In [40]:
model_repo.push_to_hub()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Upload file pytorch_model.bin: 100%|█████████▉| 847M/850M [02:33<00:00, 6.43MB/s] remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
remote: ----------------------------------------------------------[0;33m        
remote: help: https://huggingface.co/docs/hub/model-cards#model-card-metadata[0;32m        
remote: ----------------------------------------------------------        
remote: Please find the documentation at:        
remote: https://huggingface.co/docs/hub/model-cards#model-card-metadata[0;0m        
remote: ----------------------------------------------------------        
To https://huggingface.co/ashrielbrian/t5-base-wikipedia-companies-keywords
   e51b24c..2274232  main -> main

Upload file pytorch_model.bin: 100%|██████████| 850M/850M [02:39<00:00, 5.59MB/s]
Upload file spiece.model: 100%|██████████| 773k/773k [02:38<00:00, 4.78kB/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'https://huggingface.co/ashrielbrian/t5-base-wikipedia-companies-keywords/commit/2274232cf9e44e4ed98304e4b0f80f7b84e07bc5'

In [50]:
model.load_model(model_dir="model", use_gpu=True)

In [57]:
model.predict(keywords=["band and roll", "german", "leather goods", "iphone", "ipad"], use_gpu=True)

'band and roll is a german leather goods company founded in 1912 by gerald h. band. the company was originally known as band and roll, but changed its name to band and roll in 1989.'

In [58]:
# To download and load the model from HF Hub
model_repo.clone_from(f"ashrielbrian/{model_name}", token=token)
loaded_model = trainer()
loaded_model.load_model('model') # assuming `model_repo` is set to `model/` directory


[0;31mSignature:[0m
[0mmodel_repo[0m[0;34m.[0m[0mclone_from[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mrepo_url[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtoken[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Clone from a remote. If the folder already exists, will try to clone the
repository within it.

If this folder is a git repository with linked history, will try to
update the repository.

Args:
    repo_url (`str`):
        The URL from which to clone the repository
    token (`Union[str, bool]`, *optional*):
        Whether to use the authentication token. It can be:
         - a string which is the token itself
         - `False`, which would not use the authentication token
         - `True`, which would fetch the authentication token from the
      