# Pegasus 분석


### Setup

In [None]:
!pip install transformers
!pip install sentencepiece


Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 4.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 662 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 89.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 77.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
import os
from google.colab import drive
# Mount Google Drive so the CSV paths under /content/drive/MyDrive used below are readable.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import torch 

from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

In [None]:
from transformers import T5ForConditionalGeneration
from transformers import T5TokenizerFast as T5Tokenizer

### Data

In [None]:
# Locations of the train / test CSV files on the mounted Google Drive.
train_path = '/content/drive/MyDrive/Aiffel_Hackathon/Data/train_data_500.csv'
test_path = '/content/drive/MyDrive/Aiffel_Hackathon/Data/test_data_500.csv'

In [None]:
# Load both splits, using the first CSV column as the DataFrame index.
# NOTE(review): the preview output still shows an "Unnamed: 0" column, so the
# files appear to carry a second saved index column — confirm and drop if unused.
train_df = pd.read_csv(train_path, index_col=0) 
test_df = pd.read_csv(test_path, index_col=0 )
train_df.head()

Unnamed: 0,Original_Filename,Full_Text,FT_Len
0,15783_819108_2,chief executive officer’s statement the berong...,500.0
1,15782_819091_2,chief executive officer’s statement the compan...,500.0
2,15787_819176_2,page 6 toledo mining corporation plc annual re...,500.0
3,15819_820208_2,review 2005 chief executive’s statement tomkin...,500.0
4,15842_821001_2,7 plc annual report and financial statements 2...,500.0


In [None]:
# Preview the test split (chunk columns sen_500 ... sen_4000, Full_Text, Summary).
test_df.head()

Unnamed: 0,Original_Filename,sen_500,sen_1000,sen_1500,sen_2000,sen_2500,sen_3000,sen_3500,sen_4000,Full_Text,Summary
0,30777_904926_2,25695 19 march 2018 3 29 pm proof 7 02 s . c ....,business saw growth of 14 percent with the maj...,. strategic progress the group’s strategy enco...,from contract wins on automotive and aerospace...,,,,,25695 19 march 2018 3 29 pm proof 7 02 s . c ...,
1,30783_905079_2,strategic report chief executive’s statement 1...,to manage and report on these exposures more e...,stage in its journey . your commitment and vis...,,,,,,strategic report chief executive’s statement ...,
2,30785_905133_2,summary our dedication to providing our client...,the last two years . this is due to a number o...,see interest from local authorities to procure...,,,,,,summary our dedication to providing our clien...,
3,30785_905134_2,"q a with ceo , david miles 92 percent of tenan...",an area where we can afford to stand still and...,,,,,,,"q a with ceo , david miles 92 percent of tena...",
4,30813_906032_2,strategic report domino’s pizza group plc annu...,accounts 2017 domino’s pizza group plc 09 whil...,", cooks your pizza fresh in a local store , an...","visibility of the brand , improve customer ser...",expect to complete roll-out by q3 of 2018. a f...,"economy and by our own actions , particularly ...","achieves weekly unit sales of over pound 37,00...",ticket size to absorb the cost of delivery . w...,strategic report domino’s pizza group plc ann...,


### Pretrained 모델 가져오기

In [None]:
# Load the financial-summarization Pegasus checkpoint and its matching tokenizer.
# `tokenizer` and `model` are the Pegasus objects used by the Pegasus labelling cell.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [None]:
# Load the t5-base checkpoint and its fast tokenizer (T5TokenizerFast, imported as T5Tokenizer).
# NOTE: MODEL_NAME (T5) is a different variable from the lowercase `model_name` used for Pegasus.
MODEL_NAME = "t5-base"
T5_tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
T5_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

### 페가수스로 레이블 만들기

In [None]:
# Token limit passed as `max_length` (with truncation) to the Pegasus tokenizer call.
input_max_length = 500

In [None]:
# Pegasus: generate a summary for one sample training document.
# The text is truncated to `input_max_length` tokens before generation.
sam_source = tokenizer(
    # The original cell read from `train_df2`, which is never defined in this
    # notebook; the loaded training frame is `train_df`.
    text=train_df['Full_Text'][4],
    max_length=input_max_length,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

sam_input_ids = sam_source['input_ids']
sam_mask = sam_source['attention_mask']

generated_ids = model.generate(
    input_ids=sam_input_ids,
    attention_mask=sam_mask,
    max_length=150,
    num_beams=5,
    repetition_penalty=2.5,
    length_penalty=1.0,
)

# Build `predictions` from scratch instead of appending to a pre-existing
# variable, so the cell works on a fresh kernel and re-running it does not
# accumulate text from earlier runs.
predictions = ''.join(
    tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for g in generated_ids
)

print(predictions)



In [None]:
# Number of whitespace-separated words in the generated Pegasus summary.
len(predictions.split())

25

In [None]:
# Word count of a source document, for comparison with the summary length.
# `train_df2` is not defined anywhere in this notebook; use the loaded `train_df`.
# NOTE(review): this inspects row 10, while the Pegasus cell summarizes row 4 —
# confirm which row was intended.
len(train_df['Full_Text'][10].split())

303

### T5로 레이블 만들기

In [None]:
# T5
def T5summarize(text):
  text_encoding = T5_tokenizer(
      text,
      max_length=512,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )

  generated_ids = T5_model.generate(
      input_ids=text_encoding["input_ids"],
      attention_mask=text_encoding["attention_mask"],
      max_length=150,
      num_beams=2,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True
  )

  preds = [
           T5_tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
           for gen_id in generated_ids
  ]

  return "".join(preds)

In [None]:
# Renumber the rows 0..n-1 and discard the old index, so the integer labels
# used by later cells match row positions.
test_df = test_df.reset_index(drop=True)

In [None]:
# Replace missing values with empty strings so that absent chunks can be
# detected with a simple length check.
test_df = test_df.fillna('')

In [None]:
# Build a summary label for every test document by summarizing each non-empty
# chunk column with T5 and concatenating the per-chunk summaries in order.
chunk_columns = [f'sen_{size}' for size in range(500, 4001, 500)]  # sen_500 ... sen_4000

for i in range(len(test_df['Full_Text'])):

  row = test_df.loc[i]

  # Empty strings mark chunks that do not exist for this document.
  chunk_summaries = [
      T5summarize(row[col])
      for col in chunk_columns
      if len(row[col]) > 0
  ]

  # Join with a space: plain concatenation fused the last sentence of one
  # chunk summary with the first word of the next ("...million.bakkavor ...").
  sam_summary = ' '.join(chunk_summaries)

  test_df.loc[i, 'Summary'] = sam_summary



In [None]:
# Spot-check the generated summary for the row labelled 100.
test_df.loc[100, 'Summary']

'bakkavor employs around 19,000 people across 39 sites in the uk, us and china. this year we generated revenue of over pound 1.8 billion. operating profit increased by 5.1 percent from pound 91.5 million to pound 96.2 million.bakkavor group plc 2017 annual report innovation is one of our core values and we are extremely proud of our capabilities and expertise. the second half of 2017 saw volume growth impacted as uk consumers reacted to significant inflationary pressure. this trend has continued into 2018 and is likely to remain until inflation eases.uk performance the uk business generated pound 1,636.3 million in revenue from continuing operations in 2017. like-for-like revenue 2 was pound 1,621.3 million, which is 4.9 percent up on 2016. as expected, margins remained under pressure throughout the year due to inflationary impacts.bakkavor group plc 2017 financial highlights pound million 2017 2016 53 weeks change revenue 1,636.3 1,589.9 4.6 percent 1 2.9 percent like-for-like revenue

In [None]:
# Write the test split, including the Summary column, to Google Drive.
# NOTE(review): the DataFrame index is written as well (to_csv default) —
# confirm downstream readers expect that extra leading column.
test_df.to_csv('/content/drive/MyDrive/Aiffel_Hackathon/Data/test_500_labeladded.csv')

### Train 데이터에 T5 요약 레이블 추가

In [None]:
# Renumber the training rows 0..n-1 (old index discarded), then preview.
train_df = train_df.reset_index(drop=True)
train_df.head()

Unnamed: 0,Original_Filename,Full_Text,FT_Len
0,15783_819108_2,chief executive officer’s statement the berong...,500.0
1,15782_819091_2,chief executive officer’s statement the compan...,500.0
2,15787_819176_2,page 6 toledo mining corporation plc annual re...,500.0
3,15819_820208_2,review 2005 chief executive’s statement tomkin...,500.0
4,15842_821001_2,7 plc annual report and financial statements 2...,500.0


In [None]:
# Label the first three training documents with T5 summaries.
# NOTE(review): only rows 0-2 are processed — presumably a smoke test; widen
# the slice to label the full training split.
for i, doc in enumerate(train_df['Full_Text'][:3]): 

  # Named `doc_summary` rather than `sum`, so the builtin sum() is not
  # shadowed for the rest of the kernel session.
  doc_summary = T5summarize(doc)
  train_df.loc[i, 'Summary'] = doc_summary
 

In [None]:
# Show the T5 summary generated for the first training document.
train_df.loc[0, 'Summary']

'the berong nickel mine has been in operation for just over one year and has successfully made shipments of laterite ore to chinese and australian customers. from the commencement of trial metallurgical shipments in january 2007 until the end of march 31 2008, approximately 815,000 tonnes of ore at an average grade of 1.52 percent ni was shipped. the company continues to set new standards in responsible mining in the philippines.'