In [9]:
!pip install rouge_score



In [None]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import transformers
from transformers import BertTokenizerFast, RobertaTokenizerFast, TFEncoderDecoderModel, AdamWeightDecay
from sklearn.model_selection import train_test_split
import datasets
from tqdm.notebook import tqdm
from tensorflow.python.ops.numpy_ops import np_config
from pprint import pprint



# Data Preparation

In [11]:
# Read the CSV and normalise the two columns used downstream to 'Text' / 'Summary'.
# rename() returns a new frame, so the loaded data is never mutated in place.
df_reviews = pd.read_csv('gigaword_dataset.csv').rename(
    columns={'document': 'Text', 'summary': 'Summary'}
)
df_reviews.head(2)

Unnamed: 0,Text,Summary
3240044,a dozen of cambodian journalists and governmen...,cambodia marks world press freedom day
2374379,"with almost ###,### hurricane katrina evacuees...",effort to house katrina evacuees fraught with ...


Drop the duplicated reviews and reviews without summaries

In [12]:
# Keep one row per distinct Text, drop rows whose Summary is missing, and
# renumber the rows 0..n-1.
# - Built as a non-mutating chain so re-running the cell gives the same frame.
# - drop=True discards the old row labels instead of inserting them as an extra
#   'index' column (which piled up 'index' / 'level_0' columns on re-runs).
df_reviews = (
    df_reviews
    .drop_duplicates(subset=['Text'])
    .dropna(subset=['Summary'])
    .reset_index(drop=True)
)

# Training Preparation

In [15]:
# Configure the training parameters
class TrainingConfig:
    """Bundle of training hyper-parameters with per-instance overrides.

    The class attributes are the defaults. Keyword arguments passed to the
    constructor override them on the created instance only, e.g.
    ``TrainingConfig(nb_epoch=5)``.

    Raises:
        TypeError: if a keyword does not name one of the settings declared
            below (catches misspelled overrides that would otherwise be
            silently ignored by the rest of the notebook).
    """
    val_split = 0.2                              # fraction of samples held out for validation
    pretrained_checkpoint = 'bert-base-uncased'  # not read by the cells visible in this notebook
    encoder_checkpoint = 'bert-base-uncased'     # checkpoint name for the encoder (and the tokenizer)
    decoder_checkpoint = 'bert-base-uncased'     # checkpoint name for the decoder
    pad_token_id = 0                             # label id that the loss masks out
    shared_weight = False                        # forwarded as tie_encoder_decoder when building the model
    encoder_max_len = 256                        # pad/truncate length for source texts
    decoder_max_len = 30                         # pad/truncate length for summaries
    nb_epoch = 3
    learning_rate = 3e-5
    batch_size = 8

    def __init__(self, **kwargs):
        # Only public names already declared on the class are valid settings.
        unknown = sorted(
            key for key in kwargs
            if key.startswith('_') or not hasattr(type(self), key)
        )
        if unknown:
            raise TypeError(
                f'Unknown TrainingConfig option(s): {", ".join(unknown)}'
            )
        for key, value in kwargs.items():
            setattr(self, key, value)

In [16]:
# load the train and validation dataset
class DataLoader:
    """Tokenise paired texts/summaries and expose them as ``tf.data`` datasets.

    Args:
        paragraphs: sequence of source texts.
        summaries: sequence of target summaries, parallel to ``paragraphs``.
        **kwargs: optional settings read with ``dict.get`` (missing ones are
            stored as ``None``):
            ``tokenizer`` - callable used to encode both texts and summaries;
            ``val_split`` - passed to ``train_test_split`` as ``test_size``;
            ``encoder_max_len`` / ``decoder_max_len`` - pad/truncate lengths.
    """

    def __init__(self, paragraphs, summaries, **kwargs):
        # Coerce to object arrays so that the position lists produced by
        # split_train_test() can index them; plain lists would fail there and
        # a pandas Series would be indexed by label rather than by position.
        self.paragraphs = np.asarray(paragraphs, dtype=object)
        self.summaries = np.asarray(summaries, dtype=object)
        self.tokenizer = kwargs.get('tokenizer')
        self.val_split = kwargs.get('val_split')
        self.encoder_max_len = kwargs.get('encoder_max_len')
        self.decoder_max_len = kwargs.get('decoder_max_len')

    @property
    def sample_size(self):
        """Number of (paragraph, summary) pairs.

        Raises:
            ValueError: if the two collections differ in length.
        """
        if len(self.paragraphs) != len(self.summaries):
            raise ValueError(
                f'paragraphs ({len(self.paragraphs)}) and summaries '
                f'({len(self.summaries)}) must have the same length'
            )
        return len(self.paragraphs)

    def split_train_test(self):
        """Return ``(train_idx, val_idx)`` position lists (fixed random_state=98)."""
        train_idx, val_idx = train_test_split(
            list(range(self.sample_size)),
            test_size=self.val_split,
            random_state=98
        )
        return train_idx, val_idx

    def convert_text_to_ids(self, input_paragraphs, input_summaries):
        """Encode texts and summaries with the tokenizer.

        Both calls request NumPy output, padded/truncated to
        ``encoder_max_len`` and ``decoder_max_len`` respectively.

        Returns:
            ``(inputs, outputs)`` - the tokenizer results for the texts and
            for the summaries.
        """
        inputs = self.tokenizer(
            list(input_paragraphs),
            return_tensors='np',
            padding='max_length',
            truncation=True,
            max_length=self.encoder_max_len
        )
        outputs = self.tokenizer(
            list(input_summaries),
            return_tensors='np',
            padding='max_length',
            truncation=True,
            max_length=self.decoder_max_len
        )
        return inputs, outputs

    def list_to_tensor_dataset(self, input_paragraphs, input_summaries):
        """Build an unbatched dataset of ``(features, labels)`` pairs.

        ``features`` is a dict with keys ``input_ids``, ``attention_mask``,
        ``decoder_input_ids`` and ``decoder_attention_mask``. ``labels`` is the
        same summary-id sequence that is fed as ``decoder_input_ids``; no
        shifting is done here.
        """
        inputs, outputs = self.convert_text_to_ids(
            input_paragraphs,
            input_summaries
        )
        input_ids = tf.data.Dataset.from_tensor_slices(
            inputs['input_ids']
        )
        attention_masks = tf.data.Dataset.from_tensor_slices(
            inputs['attention_mask']
        )
        output_ids = tf.data.Dataset.from_tensor_slices(
            outputs['input_ids']
        )
        output_attention_masks = tf.data.Dataset.from_tensor_slices(
            outputs['attention_mask']
        )
        tf_dataset = tf.data.Dataset.zip(
            ({
                'input_ids': input_ids,
                'attention_mask': attention_masks,
                'decoder_input_ids': output_ids,
                'decoder_attention_mask': output_attention_masks
            },
            output_ids)
        )
        return tf_dataset

    def __call__(self):
        """Split the data and return ``(train_dataset, val_dataset)``, both unbatched."""
        train_idx, val_idx = self.split_train_test()
        train_paras, val_paras = self.paragraphs[train_idx], self.paragraphs[val_idx]
        train_sums, val_sums = self.summaries[train_idx], self.summaries[val_idx]
        train_dataset = self.list_to_tensor_dataset(train_paras, train_sums)
        val_dataset = self.list_to_tensor_dataset(val_paras, val_sums)
        return train_dataset, val_dataset

# Model Setup

In [17]:
# Customized loss function for seq2seq model
class Seq2SeqLoss(tf.keras.losses.Loss):
    """Next-token cross-entropy averaged over the non-padding label positions.

    Args:
        pad_token_id: label id to exclude from the average.
        name: name forwarded to the Keras ``Loss`` base class.
    """

    def __init__(self, pad_token_id, name="seq2seq_loss"):
        super().__init__(name=name)
        self.pad_token_id = pad_token_id
        # Per-token loss (no reduction) so padding can be masked out below.
        # Built once here instead of on every call.
        self._token_loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True,
            reduction=tf.keras.losses.Reduction.NONE
        )

    def call(self, y_true, y_pred):
        """Return the masked mean loss as a scalar.

        ``y_true`` is indexed as a 2-D id tensor and ``y_pred`` as a 3-D logits
        tensor whose second axis is aligned with the second axis of ``y_true``.
        """
        # shift the label and output sequences to match:
        # the logits at position t are scored against the label at t+1
        output_logits = y_pred[:, :-1, :]
        input_labels = y_true[:, 1:]
        loss = self._token_loss(input_labels, output_logits)
        # calculate loss without the padding tokens in label sequence
        mask = tf.cast(
            tf.not_equal(input_labels, self.pad_token_id),
            dtype=tf.float32
        )
        loss = loss * mask
        # divide_no_nan yields 0 rather than NaN when every label is padding,
        # so one degenerate batch cannot poison the gradients or the running mean.
        return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

In [18]:
class Trainer:
    """Runs single training / validation steps for an encoder-decoder model.

    Args:
        model: called with input ids, attention mask, decoder input ids and
            decoder attention mask (in that order); its result must expose
            ``.logits``.
        loss_fn: callable ``loss_fn(labels, logits)`` returning the batch loss.
        optimizer: object providing ``apply_gradients``.
        metric: stored on the instance; not used by the steps below.
    """

    def __init__(self, model, loss_fn, optimizer, metric):
        self.model, self.loss_fn = model, loss_fn
        self.optimizer, self.metric = optimizer, metric
        # Running mean of every loss passed to update_state() since the last reset.
        self.loss_tracker = tf.keras.metrics.Mean(name='mean_loss')

    def _batch_loss(self, batch, training):
        """Forward one ``(features, labels)`` batch and return its loss."""
        features, labels = batch
        model_out = self.model(
            features['input_ids'],
            features['attention_mask'],
            features['decoder_input_ids'],
            features['decoder_attention_mask'],
            training = training
        )
        return self.loss_fn(labels, model_out.logits)

    # Training Step
    @tf.function
    def train_step(self, inputs):
        """Apply one optimizer update for ``inputs``; returns the batch loss."""
        with tf.GradientTape() as tape:
            batch_loss = self._batch_loss(inputs, training=True)
        weights = self.model.trainable_weights
        self.optimizer.apply_gradients(
            zip(tape.gradient(batch_loss, weights), weights)
        )
        self.loss_tracker.update_state(batch_loss)
        return batch_loss

    # Validation Step
    @tf.function
    def val_step(self, inputs):
        """Score ``inputs`` without updating weights; returns the batch loss."""
        batch_loss = self._batch_loss(inputs, training=False)
        self.loss_tracker.update_state(batch_loss)
        return batch_loss

In [19]:
def batched_generate_summary(model, tokenizer, batched_input):
    """Generate summaries for one batch and decode everything back to text.

    Args:
        model: object with ``generate(input_ids=..., attention_mask=...)``;
            no other generation arguments are passed here.
        tokenizer: object with ``batch_decode(ids, skip_special_tokens=...)``.
        batched_input: ``(features, labels)`` pair; ``features`` must contain
            ``input_ids``, ``attention_mask`` and ``decoder_input_ids``.
            The labels element is ignored.

    Returns:
        ``(generated_texts, reference_texts, source_texts)``, each the result
        of ``tokenizer.batch_decode`` with special tokens skipped.
    """
    features, _ = batched_input
    generated_ids = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask']
    )

    def decode(token_ids):
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    return (
        decode(generated_ids),
        decode(features['decoder_input_ids']),
        decode(features['input_ids']),
    )

# Bert2Bert

In [21]:
# Source texts and reference summaries as NumPy arrays.
reviews, summaries = (df_reviews[column].values for column in ('Text', 'Summary'))

# Defaults from TrainingConfig, training for 5 epochs instead of 3.
training_config = TrainingConfig(nb_epoch=5)
tokenizer = BertTokenizerFast.from_pretrained(training_config.encoder_checkpoint)

# Keyword arguments shared by every DataLoader built in this notebook.
dataloader_args = dict(
    tokenizer=tokenizer,
    val_split=training_config.val_split,
    encoder_max_len=training_config.encoder_max_len,
    decoder_max_len=training_config.decoder_max_len,
)
dataloader = DataLoader(reviews, summaries, **dataloader_args)
train_dataset, val_dataset = dataloader()

# Shuffle the training split with a buffer sized to the expected number of
# training samples, then batch; validation is only batched.
train_dataset = train_dataset.shuffle(
    int(dataloader.sample_size*(1-training_config.val_split))
).batch(training_config.batch_size)
val_dataset = val_dataset.batch(training_config.batch_size)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [22]:
# Assemble the encoder-decoder model from the two checkpoints named in the config.
bert2bert = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=training_config.encoder_checkpoint,
    decoder_pretrained_model_name_or_path=training_config.decoder_checkpoint,
    # whether to share the encoder weight
    tie_encoder_decoder=training_config.shared_weight,
)

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
All model checkpoint layers were used when initializing TFBertLMHeadModel.

Some layers of TFBertLMHeadModel were not initialized from the model checkpoint at bert

In [23]:
# Write the assembled model to disk and load it straight back as a single
# TFEncoderDecoderModel checkpoint.
# NOTE(review): the reason for this round trip is not stated here - confirm it is still needed.
bert2bert_dir = 'bert2bert'
bert2bert.save_pretrained(bert2bert_dir)
bert2bert = TFEncoderDecoderModel.from_pretrained(bert2bert_dir)

All model checkpoint layers were used when initializing TFEncoderDecoderModel.

All the layers of TFEncoderDecoderModel were initialized from the model checkpoint at bert2bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFEncoderDecoderModel for predictions without further training.


In [24]:
# The special tokens for the decoder should be aligned with the special tokens for the encoder.
# Since we are using the Bert checkpoint for both encoder and decoder,
# the cls and sep tokens in the encoder can be used as the start and end token for the decoder.
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id # 101
bert2bert.config.eos_token_id = tokenizer.sep_token_id # 102
bert2bert.config.pad_token_id = tokenizer.pad_token_id # 0
bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size

# These configurations are for the beam search in decoding process
# max_length follows the summary length used when tokenising the targets
# (30 with the default config) instead of repeating the number here.
bert2bert.config.max_length = training_config.decoder_max_len
bert2bert.config.min_length = 3
bert2bert.config.no_repeat_ngram_size = 2
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

In [37]:
# ROUGE scorer, used by the evaluation cells further down.
rouge = datasets.load_metric('rouge')
tf.keras.backend.clear_session()
trainer = Trainer(model=bert2bert,
                  loss_fn=Seq2SeqLoss(training_config.pad_token_id),
                  optimizer=AdamWeightDecay(
                      learning_rate=training_config.learning_rate,
                      weight_decay_rate=0.005
                  ),
                  metric=None)

# Remember which epoch reached the lowest validation loss so the checkpoint
# to reload can be read from the output instead of guessed.
best_val_loss = float('inf')
best_checkpoint_path = None

# Training Loop
for epoch in range(training_config.nb_epoch):
    print(f'\nEpoch {epoch+1}\n')
    print('Training....')
    for step,batched_input in enumerate(tqdm(train_dataset)):
        loss = trainer.train_step(batched_input)
        if step%200 == 0:
            # loss_tracker reports the mean loss over all steps of this epoch
            # so far. Convert the tensor to a plain Python float so round()
            # and the f-string give a clean 3-decimal value and do not depend
            # on how TensorFlow tensors handle round().
            till_now_loss = float(trainer.loss_tracker.result())
            print(f'Training loss for one batch at step {step}: {round(till_now_loss,3)}')
    trainer.loss_tracker.reset_states()

    print('Validating....')
    for batched_input in tqdm(val_dataset):
        val_loss = trainer.val_step(batched_input)
    # Mean validation loss over the whole validation set, as a Python float.
    till_now_val_loss = float(trainer.loss_tracker.result())
    print(f'Validation loss: {round(till_now_val_loss,3)}')
    # One checkpoint directory per epoch. The name embeds the rounded
    # validation loss, so it differs from run to run; it is printed so the
    # exact name can be copied when reloading a checkpoint.
    checkpoint_path = f'/kaggle/working/bert2bert-Checkpoint-epoch{epoch+1}-loss{round(till_now_val_loss,3)}'
    bert2bert.save_pretrained(checkpoint_path)
    print(f'Saved checkpoint: {checkpoint_path}')
    if till_now_val_loss < best_val_loss:
        best_val_loss = till_now_val_loss
        best_checkpoint_path = checkpoint_path
    trainer.loss_tracker.reset_states()

if best_checkpoint_path is not None:
    print(f'\nLowest validation loss {round(best_val_loss,3)} at: {best_checkpoint_path}')


Epoch 1

Training....


  0%|          | 0/999 [00:00<?, ?it/s]

Training loss for one batch at step 0: 1.6540000438690186
Training loss for one batch at step 200: 1.5099999904632568
Training loss for one batch at step 400: 1.5520000457763672
Training loss for one batch at step 600: 1.5759999752044678
Training loss for one batch at step 800: 1.5950000286102295
Validating....


  0%|          | 0/250 [00:00<?, ?it/s]

Validation loss: 4.560999870300293

Epoch 2

Training....


  0%|          | 0/999 [00:00<?, ?it/s]

Training loss for one batch at step 0: 1.3009999990463257
Training loss for one batch at step 200: 1.090999960899353
Training loss for one batch at step 400: 1.1230000257492065
Training loss for one batch at step 600: 1.1369999647140503
Training loss for one batch at step 800: 1.1510000228881836
Validating....


  0%|          | 0/250 [00:00<?, ?it/s]

Validation loss: 4.710000038146973

Epoch 3

Training....


  0%|          | 0/999 [00:00<?, ?it/s]

Training loss for one batch at step 0: 0.8040000200271606
Training loss for one batch at step 200: 0.7990000247955322
Training loss for one batch at step 400: 0.8019999861717224
Training loss for one batch at step 600: 0.8209999799728394
Training loss for one batch at step 800: 0.8320000171661377
Validating....


  0%|          | 0/250 [00:00<?, ?it/s]

Validation loss: 4.800000190734863

Epoch 4

Training....


  0%|          | 0/999 [00:00<?, ?it/s]

Training loss for one batch at step 0: 0.6850000023841858
Training loss for one batch at step 200: 0.574999988079071
Training loss for one batch at step 400: 0.578000009059906
Training loss for one batch at step 600: 0.593999981880188
Training loss for one batch at step 800: 0.609000027179718
Validating....


  0%|          | 0/250 [00:00<?, ?it/s]

Validation loss: 4.9120001792907715

Epoch 5

Training....


  0%|          | 0/999 [00:00<?, ?it/s]

Training loss for one batch at step 0: 0.4869999885559082
Training loss for one batch at step 200: 0.43299999833106995
Training loss for one batch at step 400: 0.44600000977516174
Training loss for one batch at step 600: 0.4519999921321869
Training loss for one batch at step 800: 0.4620000123977661
Validating....


  0%|          | 0/250 [00:00<?, ?it/s]

Validation loss: 5.019999980926514


In [39]:
# Load the checkpoint selected for evaluation.
# NOTE(review): the directory name is specific to one training run, and its
# loss (4.33) matches none of the validation losses printed above, which rise
# every epoch - confirm that this is really the best checkpoint.
checkpoint_dir = 'bert2bert-Checkpoint-epoch3-loss4.329999923706055'
trained_bert2bert = TFEncoderDecoderModel.from_pretrained(checkpoint_dir)

All model checkpoint layers were used when initializing TFEncoderDecoderModel.

All the layers of TFEncoderDecoderModel were initialized from the model checkpoint at /kaggle/working/bert2bert-Checkpoint-epoch3-loss4.329999923706055.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFEncoderDecoderModel for predictions without further training.


Let's see what our generated output looks like!

In [40]:
# Qualitative check: generate summaries for the first validation batch only,
# report its ROUGE-1 and print each source / reference / generated triple.
for batched_input in tqdm(val_dataset):
    pred_str, gold_str, input_strs = batched_generate_summary(
        trained_bert2bert, tokenizer, batched_input
    )
    rouge_output = rouge.compute(
        predictions=pred_str, references=gold_str, rouge_types=["rouge1"]
    )
    print('Rouge report: ')
    print(rouge_output['rouge1'].mid)
    for generated, reference, source_text in zip(pred_str, gold_str, input_strs):
        print('='*100)
        for label, text in (('Review: ', source_text),
                            ('Summary: ', reference),
                            ('Generated: ', generated)):
            print(label + text)
    # only the first batch is inspected
    break

  0%|          | 0/250 [00:00<?, ?it/s]

Rouge report: 
Score(precision=0.15556908369408368, recall=0.1751488095238095, fmeasure=0.15963069438301636)
Review: a foreign business executive participating in the ongoing # # # # taiwan business alliance conference said monday he hopes the two sides of the taiwan strait can establish direct transportation links as soon as possible.
Summary: foreign executives advocate direct cross - strait transport links
Generated: foreign trade council says taiwan will cooperate with taiwan
Review: two companies listed on the dar es salaam stock exchange - lrb - dse - rrb - have announced their respective plans of dividend payment to be made next month.
Summary: tanzanian listed companies announce dividend payment plans
Generated: sala to issue new subscribe - subsem rates
Review: defense secretary donald h. rumsfeld spent tuesday in a whirlwind trip around iraq that included meetings with american troops outside the capital, meetings with government officials in baghdad, and a final stop here, a

In [49]:
# ROUGE-1 over a fixed 100-row slice (positions 1000-1099) of the cleaned frame.
# NOTE(review): these rows come from df_reviews, the same frame that was split
# into train/validation, so they are not guaranteed to be unseen by the model.
df_reviews_test = df_reviews.copy().iloc[1000:1100, :]
test_reviews, test_sums = (
    df_reviews_test[column].values for column in ('Text', 'Summary')
)

# The DataLoader is used only for its tokenise-and-zip helper; no split is done.
test_dataloader = DataLoader(test_reviews, test_sums, **dataloader_args)
test_dataset = (
    test_dataloader
    .list_to_tensor_dataset(test_reviews, test_sums)
    .batch(training_config.batch_size)
)

# Collect generated and reference texts across all batches, then score once.
pred_strs, gold_strs = [], []
for batched_input in tqdm(test_dataset):
    pred_str, gold_str, _ = batched_generate_summary(
        trained_bert2bert, tokenizer, batched_input
    )
    pred_strs += pred_str
    gold_strs += gold_str

rouge_output = rouge.compute(
    predictions=pred_strs, references=gold_strs, rouge_types=["rouge1"]
)
pprint(rouge_output)

  0%|          | 0/13 [00:00<?, ?it/s]

{'rouge1': AggregateScore(low=Score(precision=0.3303154761904761, recall=0.3006857548701298, fmeasure=0.31052319277686524), mid=Score(precision=0.3735716089466089, recall=0.34221338383838384, fmeasure=0.3499700360011505), high=Score(precision=0.4189004329004327, recall=0.38829684343434345, fmeasure=0.391930746404438))}


In [None]:
# Save the trained_bert2bert model under both export names.
# Each call writes a directory; the zip output further down shows both
# holding the same two files (tf_model.h5 and config.json).
for export_dir in ('trained_bert2bert.h5', 'trained_bert2bert.json'):
    trained_bert2bert.save_pretrained(export_dir)
# Save the tokenizer



In [None]:
# Write the tokenizer files into the relative 'tokenizer/' directory so they
# can be zipped and reloaded together with the model.
tokenizer.save_pretrained('tokenizer/')

In [53]:
!zip -r trained_bert2bert_h5.zip /trained_bert2bert.h5
!zip -r trained_bert2bert_json.zip /trained_bert2bert.json




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: kaggle/working/trained_bert2bert.h5/ (stored 0%)
  adding: kaggle/working/trained_bert2bert.h5/tf_model.h5 (deflated 8%)
  adding: kaggle/working/trained_bert2bert.h5/config.json (deflated 80%)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: kaggle/working/trained_bert2bert.json/ (stored 0%)
  adding: kaggle/working/trained_bert2bert.json/tf_model.h5 (deflated 8%)
  adding: kaggle/working/trained_bert2bert.json/config.json (deflated 80%)
huggingface/tokenize

In [63]:
!zip -r tokenizer.zip /working/tokenizer/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: kaggle/working/tokenizer/ (stored 0%)
  adding: kaggle/working/tokenizer/vocab.txt (deflated 53%)
  adding: kaggle/working/tokenizer/tokenizer_config.json (deflated 39%)
  adding: kaggle/working/tokenizer/special_tokens_map.json (deflated 40%)
  adding: kaggle/working/tokenizer/tokenizer.json (deflated 59%)


In [55]:
from IPython.display import FileLink 
# Show a download link for the archive of the 'trained_bert2bert.h5' export.
FileLink(r'trained_bert2bert_h5.zip')

In [56]:
from IPython.display import FileLink 
# Show a download link for the archive of the 'trained_bert2bert.json' export.
FileLink(r'trained_bert2bert_json.zip')

In [64]:
from IPython.display import FileLink 
# Show a download link for the zipped tokenizer directory.
FileLink(r'tokenizer.zip')

In [58]:
from IPython.display import FileLink 
# NOTE(review): no cell visible in this notebook creates 'tokenizer_h5.zip'
# (only 'tokenizer.zip' is zipped) - confirm whether this link is still needed.
FileLink(r'tokenizer_h5.zip')

In [65]:
from transformers import TFEncoderDecoderModel, BertTokenizerFast

# Reload the exported model and tokenizer from their saved directories to
# check that the exports can be used on their own.
model_dir, tokenizer_dir = "trained_bert2bert.h5", "tokenizer/"

# Load the trained_bert2bert model
loaded_model = TFEncoderDecoderModel.from_pretrained(model_dir)

# Load the tokenizer
loaded_tokenizer = BertTokenizerFast.from_pretrained(tokenizer_dir)


All model checkpoint layers were used when initializing TFEncoderDecoderModel.

All the layers of TFEncoderDecoderModel were initialized from the model checkpoint at /kaggle/working/trained_bert2bert.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFEncoderDecoderModel for predictions without further training.


In [68]:
# Repeat the ROUGE-1 evaluation on the same 100-row slice, this time with the
# model and tokenizer reloaded from disk.
# NOTE(review): the slice is taken from df_reviews, the frame used for the
# train/validation split, so it is not guaranteed to be held-out data.
df_reviews_test = df_reviews.copy().iloc[1000:1100, :]
test_reviews = df_reviews_test['Text'].values
test_sums = df_reviews_test['Summary'].values

# dataloader_args still carries the original tokenizer for encoding the inputs.
test_dataloader = DataLoader(test_reviews, test_sums, **dataloader_args)
test_dataset = test_dataloader.list_to_tensor_dataset(
    test_reviews, test_sums
).batch(training_config.batch_size)

pred_strs, gold_strs = [], []
for batched_input in tqdm(test_dataset):
    batch_preds, batch_golds, _ = batched_generate_summary(
        loaded_model, loaded_tokenizer, batched_input
    )
    pred_str, gold_str = batch_preds, batch_golds
    pred_strs.extend(batch_preds)
    gold_strs.extend(batch_golds)

rouge_output = rouge.compute(
    predictions=pred_strs,
    references=gold_strs,
    rouge_types=["rouge1"],
)
pprint(rouge_output)

  0%|          | 0/13 [00:00<?, ?it/s]

{'rouge1': AggregateScore(low=Score(precision=0.3303154761904761, recall=0.3006857548701298, fmeasure=0.31052319277686524), mid=Score(precision=0.3735716089466089, recall=0.34221338383838384, fmeasure=0.3499700360011505), high=Score(precision=0.4189004329004327, recall=0.38829684343434345, fmeasure=0.391930746404438))}


In [70]:
# Same qualitative check as before, using the reloaded model and tokenizer:
# first validation batch only, ROUGE-1 plus the decoded texts.
separator = '='*100
for batched_input in tqdm(val_dataset):
    pred_str, gold_str, input_strs = batched_generate_summary(
        loaded_model, loaded_tokenizer, batched_input
    )
    rouge_output = rouge.compute(
        predictions=pred_str,
        references=gold_str,
        rouge_types=["rouge1"],
    )
    rouge1_mid = rouge_output['rouge1'].mid
    print('Rouge report: ')
    print(rouge1_mid)
    for generated, reference, source_text in zip(pred_str, gold_str, input_strs):
        print(separator)
        print('Review: ' + source_text)
        print('Summary: ' + reference)
        print('Generated: ' + generated)
    # stop after the first batch
    break

  0%|          | 0/250 [00:00<?, ?it/s]

Rouge report: 
Score(precision=0.15556908369408368, recall=0.1751488095238095, fmeasure=0.15963069438301636)
Review: a foreign business executive participating in the ongoing # # # # taiwan business alliance conference said monday he hopes the two sides of the taiwan strait can establish direct transportation links as soon as possible.
Summary: foreign executives advocate direct cross - strait transport links
Generated: foreign trade council says taiwan will cooperate with taiwan
Review: two companies listed on the dar es salaam stock exchange - lrb - dse - rrb - have announced their respective plans of dividend payment to be made next month.
Summary: tanzanian listed companies announce dividend payment plans
Generated: sala to issue new subscribe - subsem rates
Review: defense secretary donald h. rumsfeld spent tuesday in a whirlwind trip around iraq that included meetings with american troops outside the capital, meetings with government officials in baghdad, and a final stop here, a