In [6]:
from functools import partial

import pandas as pd
import spacy
from boltons.iterutils import windowed
from tqdm import tqdm, tqdm_notebook

import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel, OpenAIAdam
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler

tqdm.pandas()

# Fine-tune the GPT-2 model with weight loss articles

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device);

In [27]:
def sample_text(model, seed='Weight loss can be achieved by ', n_words=500, temperature=1.0):
    model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    
    text = tokenizer.encode(seed)
    inputs, past = torch.tensor([text]), None

    with torch.no_grad():
        for _ in tqdm_notebook(range(n_words), leave=False):
            logits, past = model(inputs.to(device), past=past)

            # TODO: Add temperature for better results
            log_probs = F.softmax(logits[:, -1], dim=-1)
            inputs = torch.multinomial(log_probs, 1)
            
            text.append(inputs.item())

    print(tokenizer.decode(text), end='')

In [12]:
sample_text(model)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Weight loss can be achieved by  offering the shares a fair value . Investment associations introduced this scheme in the 1990s to simplify Securities Licensing very quickly, and I recall them reviewing their version in 2002.

Look who's giving up their comfortable legal cushion to golfer vs 00 Dive tech, try playing in 20's physics mass ellipses , 20's curves etc. realize commanding position in certain golf courses is ridiculous , don't give up the ability to hit your target. Shut four years idea and not do them any favors. they take over only Wednesday running business to get to what you have to do and businesses they're doing what?? Free your energy down the pipeline when you want maximize profits and best off call infact. it all boils down to your knee Agricultural beef is environmentally sensitive because, toxic weed killers and nutrients are also a big risk, livelihood risk and heavy metals material don't go into the lungs or shipfrom containers then the failure risk that banks 

See what the fine-tuning data looks like.

In [13]:
ARTICLES_PATH = 'ezine/health_and_fitness/weight_loss/weight_loss-articles-all.json'
df = pd.read_json(ARTICLES_PATH)[['author', 'text', 'title']]
df.head()

Unnamed: 0,author,text,title
0,Jesse L Moore,"It is almost not possible to watch any TV, rea...",How Obesity is Determined
1,Erica Logan,"If you are reading this article, then I know y...",I Cheated My Way Thin - I Can Now Look At Myse...
2,Acharya Hargreaves,Self hypnosis for weight loss is a very easy p...,Self Hypnosis For Weight Loss is Easy and Rela...
3,Avy Barnes,Are you looking all over for the fastest way t...,Fastest Way to Lose Weight - Melt Away Lbs Of ...
4,Carolyn Anderson,Losing weight is one of the many concerns of m...,How to Lose Weight the Healthy Way - Simple an...


## Preprocess fine-tuning data

In [16]:
flatten = lambda x: [sublst for lst in x for sublst in lst]
to_sentences = lambda text, nlp: [sentence.text for sentence in nlp(text).sents]

class EzineWeightLossDataset(Dataset):
    """Weight loss articles from ezinearticles.com."""
    def __init__(self, data_filename, sequence_length=128):
        nlp = spacy.load('en_core_web_md')
        
        df = pd.read_json(data_filename)[['author', 'text', 'title']]
        
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        
        # TODO: preserve newlines after each paragraph
        # (last sentence needs a newline character at the end)

        # Convert each article into a list of sentences
        # using SpaCy to find sentence boundaries
        articles_sentences = df.iloc[:100].text.progress_apply(  # TODO: remove limit
            partial(to_sentences, nlp=nlp),
        )
        
        # Apply GPT-2 encoding to each sentence.
        # Then, flatten sentences per article so that each article
        # is just a list of token indices
        encoded_articles = [
            flatten([self.tokenizer.encode(sentence) for sentence in article_sentences])
            for article_sentences in tqdm(articles_sentences)
        ]
        
        # Apply a sliding window per article that will be the sequence
        # length fed into the model
        sequences = flatten([
            windowed(encoded_article, sequence_length)
            for encoded_article in encoded_articles
        ])
        
        # Combine all of the sequences into one 2-D matrix.
        # Then, split like [A, B, C, D, E] --> ([A, B, C, D], [B, C, D, E])
        data = torch.tensor(sequences)
        self.inputs_lst, self.targets = data[:-1], data[1:]
        
    def __getitem__(self, i):
        return self.inputs_lst[i], self.targets[i]
    
    def __len__(self):
        return len(self.inputs_lst)

In [17]:
dataset = EzineWeightLossDataset(ARTICLES_PATH)

100%|██████████| 100/100 [00:08<00:00, 11.32it/s]
100%|██████████| 100/100 [00:00<00:00, 208.13it/s]


In [18]:
batch_size = 8
loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

## Fine-tune the GPT-2 model

### Setup

In [19]:
# Code taken from: https://github.com/CQCumbers/grifbot/blob/83e3f9e434f4e2acbb527cb322d621be0aeb646d/scripts/gpt2.py#L61
# TODO: cleanup
n_epochs = 1
learning_rate = 6.25e-10
warmup_proportion = 0.002
max_grad_norm = 0.05
weight_decay = 0.01

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if
        not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if
        any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
n_train_optimization_steps = len(dataset) * n_epochs // batch_size
optimizer = OpenAIAdam(
    optimizer_grouped_parameters,
    lr=learning_rate,
    warmup=warmup_proportion,
    max_grad_norm=max_grad_norm,
    weight_decay=weight_decay,
    t_total=n_train_optimization_steps,
)

### Train

In [21]:
# Code taken from: https://github.com/CQCumbers/grifbot/blob/83e3f9e434f4e2acbb527cb322d621be0aeb646d/scripts/gpt2.py#L77
# TODO: cleanup
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
model.train()
for _ in tqdm_notebook(range(n_epochs)):
    tr_loss = 0
    nb_tr_steps = 0
    tqdm_bar = tqdm_notebook(loader, desc='Training')
    for step, batch in enumerate(tqdm_bar):
        input_ids, lm_labels = tuple(t.to(device) for t in batch)
        loss = model(input_ids, lm_labels=lm_labels)
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        exp_average_loss = (
            loss.item() if exp_average_loss is None
            else 0.7 * exp_average_loss + 0.3 * loss.item()
        )
        nb_tr_steps += 1
        tqdm_bar.desc = f'Training loss: {exp_average_loss:.2e} lr: {optimizer.get_lr()[0]:.2e}'

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Training', max=5552, style=ProgressStyle(description_width='i…




In [26]:
original_model = GPT2LMHeadModel.from_pretrained('gpt2')
original_model.to(device);

In [47]:
# Sample text from the original model
sample_text(original_model)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

Weight loss can be achieved by eringing diets and changes in caloric intake, although approximately one-third of the diet-moderator carbohydrate-supplemented humans who are subjects of frequent sudden cardiac arrest do not achieve the full 16% to 17% of sustained ketide-induced insulin resistance.22 , 23 , 24 Consumption of fatty acids in decreasing percentages of weight and body strength has been associated with decreased protein and fat composition during recovery, but no a priori evidence of detrimental effects on muscle metabolism and dysfunction.25 The author noted that it may be therefore necessary to program for preterm that when intensity is within a normal range, lean body mass will degrade. In this theory, bold predictive nutrition bias should be introduced to adjust protein-energy composition above the lower bound of 1.8 grams of protein and lower or not at all to calories available at low glycogen concentrations; using additions merely to compensate for weight loss (and ot

In [48]:
# Sample text from the fine-tuned model
sample_text(model)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

Weight loss can be achieved by  homebrewing and homebrewing preserve many homebrew recipes into small biological products during long-term brewing process. When a multidimensional using kitchen has used this method for esamazing drum rocks and better mixing patterns, it depends on internally and externally derived laws. The resulting aromas from the malt extract and reducing potency of maltstouts varies depending around the humidity limit or play current time, best physical interpretation, mimicking the olfactory profile of nitroglycerin and lactose developed with reference to the state & fishery population in bulk. Analysis of proteins containing this compound on array can explain results, indicating energy property of grains in effect with 3, 5 & 10% slurry off balance. The experts say folic acid and baking soda. Certain sugar extractors can cut down on salinity due to maturation time coefficients (instead of forming natural pH in natural intermediate water). Lactose distillation be

### Save the fine-tuned model

In [23]:
torch.save(model.state_dict(), 'finetuned_gpt2.pkl')