### T5 Pretrained로 Pegasus에 사용할 레이블 만들기


### Setup

In [None]:
!pip install transformers
!pip install sentencepiece


Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 4.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 22.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 671 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 75.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transfor

In [None]:
import os
from google.colab import drive
# Mount Google Drive so the CSV paths under /content/drive used below resolve.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import torch 


In [None]:
from transformers import T5ForConditionalGeneration
from transformers import T5TokenizerFast as T5Tokenizer

### Data

In [None]:
# Locations of the train / test CSV files on the mounted Drive.
train_path, test_path = (
    '/content/drive/MyDrive/Aiffel_Hackathon/Data/train_data_500.csv',
    '/content/drive/MyDrive/Aiffel_Hackathon/Data/test_data_500.csv',
)

In [None]:
# Read both CSV files, using their first column as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_path, test_path)
)
# Preview the first rows of the training frame.
train_df.head(n=5)

Unnamed: 0,Original_Filename,Full_Text,FT_Len
0,15783_819108_2,chief executive officer’s statement the berong...,500.0
1,15782_819091_2,chief executive officer’s statement the compan...,500.0
2,15787_819176_2,page 6 toledo mining corporation plc annual re...,500.0
3,15819_820208_2,review 2005 chief executive’s statement tomkin...,500.0
4,15842_821001_2,7 plc annual report and financial statements 2...,500.0


In [None]:
# Preview the first rows of the test frame.
test_df.head(n=5)

Unnamed: 0,Original_Filename,sen_500,sen_1000,sen_1500,sen_2000,sen_2500,sen_3000,sen_3500,sen_4000,Full_Text,Summary
0,30777_904926_2,25695 19 march 2018 3 29 pm proof 7 02 s . c ....,business saw growth of 14 percent with the maj...,. strategic progress the group’s strategy enco...,from contract wins on automotive and aerospace...,,,,,25695 19 march 2018 3 29 pm proof 7 02 s . c ...,
1,30783_905079_2,strategic report chief executive’s statement 1...,to manage and report on these exposures more e...,stage in its journey . your commitment and vis...,,,,,,strategic report chief executive’s statement ...,
2,30785_905133_2,summary our dedication to providing our client...,the last two years . this is due to a number o...,see interest from local authorities to procure...,,,,,,summary our dedication to providing our clien...,
3,30785_905134_2,"q a with ceo , david miles 92 percent of tenan...",an area where we can afford to stand still and...,,,,,,,"q a with ceo , david miles 92 percent of tena...",
4,30813_906032_2,strategic report domino’s pizza group plc annu...,accounts 2017 domino’s pizza group plc 09 whil...,", cooks your pizza fresh in a local store , an...","visibility of the brand , improve customer ser...",expect to complete roll-out by q3 of 2018. a f...,"economy and by our own actions , particularly ...","achieves weekly unit sales of over pound 37,00...",ticket size to absorb the cost of delivery . w...,strategic report domino’s pizza group plc ann...,


### T5 Pretrained 모델 가져오기

In [None]:
# Checkpoint shared by the tokenizer and the model.
MODEL_NAME = "t5-base"

# Load the pretrained seq2seq model and its matching fast tokenizer
# (both are downloaded on first use).
T5_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
T5_tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

### T5로 test 데이터 레이블 만들기

In [None]:
# T5
def T5summarize(text, prefix="summarize: ", max_input_length=512,
                max_summary_length=150, num_beams=2):
  """Summarise `text` with the module-level T5 model and return the summary.

  Args:
    text: The string to summarise. A list of strings is also accepted; the
      decoded summaries are then concatenated in order. `None`, NaN (how
      pandas represents a missing cell) and blank strings yield "".
    prefix: Task prefix prepended to every input. T5 checkpoints select the
      task from this prefix; pass "" to feed the raw text unchanged.
    max_input_length: Inputs are padded / truncated to this many tokens.
    max_summary_length: Upper bound on the number of generated tokens.
    num_beams: Beam width used by beam search.

  Returns:
    The generated summary as a single string.
  """
  # Missing values have nothing to summarise; return early instead of letting
  # the tokenizer fail on a non-string input.
  if text is None or (isinstance(text, float) and text != text):
    return ""

  if isinstance(text, str):
    if not text.strip():
      return ""
    model_input = prefix + text
  else:
    model_input = [prefix + str(item) for item in text]

  text_encoding = T5_tokenizer(
      model_input,
      max_length=max_input_length,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )

  # Run on whatever device the model currently lives on, so the function keeps
  # working if the model has been moved to a GPU.
  device = T5_model.device

  generated_ids = T5_model.generate(
      input_ids=text_encoding["input_ids"].to(device),
      attention_mask=text_encoding["attention_mask"].to(device),
      max_length=max_summary_length,
      num_beams=num_beams,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True
  )

  preds = [
      T5_tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for gen_id in generated_ids
  ]

  return "".join(preds)

In [None]:
# Renumber the rows 0..n-1 and discard the previous index.
test_df = test_df.reset_index(drop=True)

In [None]:
# Replace missing cells with empty strings so every cell supports len().
test_df = test_df.fillna('')

In [None]:
# Chunk columns of test_df, in document order: sen_500, sen_1000, ..., sen_4000.
SEGMENT_COLUMNS = ['sen_%d' % n for n in range(500, 4001, 500)]

for i in range(len(test_df)):
  # Summarise each non-empty chunk of this row; empty strings mark chunks
  # the document does not have.
  chunk_summaries = [
      T5summarize(test_df.at[i, column])
      for column in SEGMENT_COLUMNS
      if len(test_df.at[i, column]) > 0
  ]

  # Join with a space so the last word of one chunk summary is not fused to
  # the first word of the next one.
  test_df.loc[i, 'Summary'] = ' '.join(chunk_summaries)



In [None]:
# Summary statistics of the labelled test frame (count / unique / top / freq per column).
test_df.describe()

Unnamed: 0,Original_Filename,sen_500,sen_1000,sen_1500,sen_2000,sen_2500,sen_3000,sen_3500,sen_4000,Full_Text,Summary
count,328,328,328,328.0,328.0,328.0,328.0,328.0,328.0,328,328
unique,328,328,328,297.0,238.0,158.0,99.0,62.0,31.0,328,328
top,33038_146969_2,chief executive’s review 2017 performance reve...,of the power disruption we experienced last oc...,,,,,,,10 enquest plc annual report 2017 chief execu...,despite early signs of recovery in the wider o...
freq,1,1,1,32.0,91.0,171.0,230.0,267.0,298.0,1,1


In [None]:
# Write the labelled test frame to Drive.
test_df.to_csv(
    path_or_buf='/content/drive/MyDrive/Aiffel_Hackathon/Data/test_500_labeladded_final.csv'
)

### T5로 train 데이터 레이블 만들기 

In [None]:
# Renumber the training rows 0..n-1 and discard the previous index.
train_df = train_df.reset_index(drop=True)
train_df.head(n=5)

Unnamed: 0,Original_Filename,Full_Text,FT_Len
0,15783_819108_2,chief executive officer’s statement the berong...,500.0
1,15782_819091_2,chief executive officer’s statement the compan...,500.0
2,15787_819176_2,page 6 toledo mining corporation plc annual re...,500.0
3,15819_820208_2,review 2005 chief executive’s statement tomkin...,500.0
4,15842_821001_2,7 plc annual report and financial statements 2...,500.0


In [None]:
# Summary statistics of the training frame's numeric columns.
train_df.describe()

Unnamed: 0,FT_Len
count,11954.0
mean,444.643299
std,123.898723
min,1.0
25%,500.0
50%,500.0
75%,500.0
max,500.0


In [None]:
# Independent copy of training rows 1851..2999 (positional, end-exclusive slice).
train_df_part2 = train_df.iloc[1851:3000].copy()

In [None]:
# Renumber the slice from 0 so positional counters can be used as row labels.
train_df_part2 = train_df_part2.reset_index(drop=True)
train_df_part2.head(n=5)

Unnamed: 0,Original_Filename,Full_Text,FT_Len
0,5129_814128_2,thomson intermedia plc annual accounts 2007 pa...,500.0
1,5130_814164_2,thomson intermedia plc annual report and accou...,500.0
2,5131_258934_2,ebiquity plc annual report and accounts year e...,500.0
3,5157_259657_2,most resilient form of pci compliance availabl...,500.0
4,5178_261004_2,eco city vehicles plc . reports and accounts 2...,500.0


In [None]:
# Generate a T5 summary for every document and store it in the 'Summary' column.
# The loop variable is named `summary` (not `sum`) so the builtin sum() is not
# shadowed for the rest of the kernel session.
for i, doc in enumerate(train_df_part2['Full_Text']):
  summary = T5summarize(doc)
  train_df_part2.loc[i, 'Summary'] = summary
 

In [None]:
# Write the labelled slice to Drive.
train_df_part2.to_csv(
    path_or_buf='/content/drive/MyDrive/Aiffel_Hackathon/Data/Bo_data/train_500_part2.csv'
)

In [None]:
# Independent copy of the first 1000 training rows, renumbered from 0.
train_df_part1 = train_df.iloc[:1000].copy().reset_index(drop=True)
train_df_part1.head(n=5)

Unnamed: 0,Original_Filename,Full_Text,FT_Len
0,15783_819108_2,chief executive officer’s statement the berong...,500.0
1,15782_819091_2,chief executive officer’s statement the compan...,500.0
2,15787_819176_2,page 6 toledo mining corporation plc annual re...,500.0
3,15819_820208_2,review 2005 chief executive’s statement tomkin...,500.0
4,15842_821001_2,7 plc annual report and financial statements 2...,500.0


In [None]:
# Number of rows in the slice.
train_df_part1.shape[0]

1000

In [None]:
# Generate a T5 summary for every document and store it in the 'Summary' column.
# The loop variable is named `summary` (not `sum`) so the builtin sum() is not
# shadowed for the rest of the kernel session.
for i, doc in enumerate(train_df_part1['Full_Text']):
  summary = T5summarize(doc)
  train_df_part1.loc[i, 'Summary'] = summary
 

In [None]:
# Preview the first rows, now including the generated 'Summary' column.
train_df_part1.head(n=5)

Unnamed: 0,Original_Filename,Full_Text,FT_Len,Summary
0,15783_819108_2,chief executive officer’s statement the berong...,500.0,the berong nickel mine has been in operation f...
1,15782_819091_2,chief executive officer’s statement the compan...,500.0,berong nickel corporation is setting new stand...
2,15787_819176_2,page 6 toledo mining corporation plc annual re...,500.0,toledo mining corporation plc reported a pre-t...
3,15819_820208_2,review 2005 chief executive’s statement tomkin...,500.0,a key component of this is new product develop...
4,15842_821001_2,7 plc annual report and financial statements 2...,500.0,topps has seen its position as the uk’s number...


In [None]:
# Row count after labelling.
train_df_part1.shape[0]

1000

In [None]:
# Write the labelled slice to Drive.
train_df_part1.to_csv(
    path_or_buf='/content/drive/MyDrive/Aiffel_Hackathon/Data/Bo_data/train_0_1000.csv'
)

In [None]:
# Independent copy of training rows 1000..1850, renumbered from 0.
train_df_part1_2 = train_df.iloc[1000:1851].copy().reset_index(drop=True)
train_df_part1_2.head(n=5)

Unnamed: 0,Original_Filename,Full_Text,FT_Len
0,10789_566642_2,strategic report annual report and accounts 20...,500.0
1,10788_566462_2,chief executive’s review delivering growth acr...,500.0
2,10798_380664_2,the rising importance of recruitment companies...,500.0
3,10803_566746_2,nakama group plc annual report 3 chief executi...,500.0
4,10856_568159_2,10 chief executive’s report phil white chief e...,500.0


In [None]:
# Number of rows in the slice.
train_df_part1_2.shape[0]

851

In [None]:
# Generate a T5 summary for every document and store it in the 'Summary' column.
# The loop variable is named `summary` (not `sum`) so the builtin sum() is not
# shadowed for the rest of the kernel session.
for i, doc in enumerate(train_df_part1_2['Full_Text']):
  summary = T5summarize(doc)
  train_df_part1_2.loc[i, 'Summary'] = summary

In [None]:
# Write the labelled slice to Drive.
train_df_part1_2.to_csv(
    path_or_buf='/content/drive/MyDrive/Aiffel_Hackathon/Data/Bo_data/train_1000_1851.csv'
)

In [None]:
# Preview the first rows, now including the generated 'Summary' column.
train_df_part1_2.head(n=5)

Unnamed: 0,Original_Filename,Full_Text,FT_Len,Summary
0,10789_566642_2,strategic report annual report and accounts 20...,500.0,nahl group plc 10 business model in action con...
1,10788_566462_2,chief executive’s review delivering growth acr...,500.0,jd williams delivered strong performance in te...
2,10798_380664_2,the rising importance of recruitment companies...,500.0,highams has hired 15 new people in the last qu...
3,10803_566746_2,nakama group plc annual report 3 chief executi...,500.0,nakama was developed with a specific aim to se...
4,10856_568159_2,10 chief executive’s report phil white chief e...,500.0,phil white chief executive this year the chief...


In [None]:
# Row count after labelling.
train_df_part1_2.shape[0]

851

In [None]:
# Independent copy of training rows 3000..3499, renumbered from 0.
train_3000_3500 = train_df.iloc[3000:3500].copy().reset_index(drop=True)
train_3000_3500.head(n=5)

Unnamed: 0,Original_Filename,Full_Text,FT_Len
0,18619_351674_2,"expand our offerings , and delight our custome...",500.0
1,18623_354413_2,"health , safety and the environment hse contin...",500.0
2,18635_357548_2,since then have received a total annual return...,500.0
3,18634_357529_2,greater critical mass and higher recognition w...,500.0
4,18631_357474_2,ordinary shares will be eligible for capital g...,500.0


In [None]:
# Generate a T5 summary for every document and store it in the 'Summary' column.
# The loop variable is named `summary` (not `sum`) so the builtin sum() is not
# shadowed for the rest of the kernel session.
for i, doc in enumerate(train_3000_3500['Full_Text']):
  summary = T5summarize(doc)
  train_3000_3500.loc[i, 'Summary'] = summary

In [None]:
# Write the labelled slice to Drive.
train_3000_3500.to_csv(
    path_or_buf='/content/drive/MyDrive/Aiffel_Hackathon/Data/Bo_data/train_3000_3500.csv'
)