In [None]:
# Mount Google Drive at /content/drive; the dataset read later in this notebook lives under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip3 install transformers accelerate datasets evaluate
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    algorithm selection.

    Args:
        seed: Integer seed applied to every generator.

    Note:
        ``PYTHONHASHSEED`` is only read at interpreter start-up, so setting it
        here influences subprocesses started afterwards, not string hashing in
        the already-running kernel.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and possibly pick a different
    # algorithm on each run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds once, before any model or data-splitting code runs.
seed_everything(42)

Successfully installed accelerate-0.23.0 datasets-2.14.5 dill-0.3.7 evaluate-0.4.0 huggingface-hub-0.17.2 multiprocess-0.70.15 responses-0.18.0 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.2 xxhash-3.3.0


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Fall back to CPU when no GPU is attached so the cell still runs on a CPU-only runtime.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
# `device` used to be declared but never applied, which left the model on the CPU.
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)


def paraphrase(
    question,
    num_beams=10,
    num_beam_groups=5,
    num_return_sequences=3,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the module-level T5 model.

    The text is prefixed with ``'paraphrase: '``, tokenized (truncated to
    ``max_length`` tokens) and passed to ``model.generate`` using the beam
    settings given below.

    Args:
        question: Text to paraphrase.
        num_beams: Forwarded to ``model.generate``.
        num_beam_groups: Forwarded to ``model.generate``.
        num_return_sequences: Number of decoded strings to return.
        repetition_penalty: Forwarded to ``model.generate``.
        diversity_penalty: Forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
            NOTE(review): sampling is not enabled here, so this presumably has
            no effect on the beam search - confirm against the generation docs.
        max_length: Used both as the tokenizer truncation length and as the
            maximum generated length.

    Returns:
        List of decoded paraphrase strings with special tokens removed.
    """
    # Keep the inputs on the same device as the model, wherever it was placed.
    encoded = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).to(model.device)

    outputs = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Smoke test: paraphrase one sample paragraph and request two candidates.
text = 'UK Secretary of State for Work and Pensions, Thérèse Coffey unveiled plans for new climate risk disclosures from pension schemes. The new plans will initially apply to the 100 largest occupational pension schemes – those with £5 billion or more in assets, and including all authorised master trusts – requiring them to assess and report on the financial risks of climate change within their portfolios by the end of 2022. The following year, the rule will be extended to schemes with £1 billion or more in assets.'
paraphrase(text,num_return_sequences=2)




['Thérèse Coffey, the Secretary of State for Work and Pensions in the UK, revealed proposals for new climate risk disclosures from pension schemes. These plans will initially apply to the 100 largest occupational pension funds with £5 billion or more in assets, including all authorised master trusts, and will require them to assess and report on the financial risks of climate change within their portfolios by the end of 2022. The rule will be extended next year to schemes with £1 billion plus assets.',
 'Thérèse Coffey, the Secretary of State for Work and Pensions in the UK, revealed proposals for new climate risk disclosures from pension schemes. These plans will initially apply to the 100 largest occupational pension funds with £5 billion or more in assets, including all authorised master trusts, and will require them to assess and report on the financial risks of climate change within their portfolios by the end of 2022. The rule will be extended next year to schemes with £1 billion

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the dataset from Drive; the "translated" column is the text used downstream.
eng_df = pd.read_csv("/content/drive/MyDrive/FinNLP/Data/Japanese.csv")

# The 'Sentiment' labels are kept as loaded (no integer mapping is applied).

# 80/20 split with a fixed random_state so the partition is repeatable.
train_eng, test_eng = train_test_split(eng_df, test_size=0.2, random_state=42)
print("Train shape:", train_eng.shape)
print("Test shape:", test_eng.shape)

# Keep only the text column and its label in each split.
column = "translated"
subset_columns = [column, 'Sentiment']
train_eng, test_eng = train_eng[subset_columns], test_eng[subset_columns]

Train shape: (716, 6)
Test shape: (180, 6)


In [None]:
# Label rows without a sentiment as "Not Available".
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and leaves eng_df unchanged
# when pandas Copy-on-Write is active.
eng_df['Sentiment'] = eng_df['Sentiment'].fillna("Not Available")


In [None]:
# Class balance of the labels before augmentation.
eng_df["Sentiment"].value_counts()

Positive         460
Not Available    316
Neutral           71
Negative          49
Name: Sentiment, dtype: int64

In [None]:
# Display the frame for a visual check (the notebook truncates the rendering).
eng_df

Unnamed: 0.1,Unnamed: 0,ID,Sentence,URL,Sentiment,translated
0,0,1,本項においては、将来に関する事項が含まれていますが、当該事項は2020年３月31日現在におい...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,,This section includes matters related to the f...
1,1,2,トヨタは経営の基本方針を「トヨタ基本理念」として掲げており、その実現に向けた努力が、企業価値...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,,Toyota has set the basic management policy as ...
2,2,3,その内容は次のとおりです。,https://disclosure2dl.edinet-fsa.go.jp/searchd...,,The content is as follows.
3,3,11,今後の世界経済は、新型コロナウイルスの影響により、多くの国・地域での急激な落ち込みが懸念されます。,https://disclosure2dl.edinet-fsa.go.jp/searchd...,,"In the future, the world economy is concerned ..."
4,4,12,自動車の生産面、販売面にも既に大きな影響が及んでいます。,https://disclosure2dl.edinet-fsa.go.jp/searchd...,,It has already had a major impact on car produ...
...,...,...,...,...,...,...
891,891,825,現時点においてかかる調査の進展、結果および終結の時期ならびにそのエプソンの経営成績および今後...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Negative,It is difficult to predict the progress of the...
892,892,826,エプソンは、プリンティングソリューションズ事業、ビジュアルコミュニケーション事業およびウエア...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Negative,Epson develops a variety of business activitie...
893,893,832,しかしながら、常に有効な内部統制システムを構築および運用できる保証はなく、また、内部統制シス...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Negative,"However, there is no guarantee that you can al..."
894,894,836,エプソンは、研究開発、調達、製造、物流、販売およびサービスの拠点を世界に展開していますが、こ...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Negative,Epson has developed the bases of research and ...


In [None]:
# Display the frame again for a visual check of the 'Sentiment' column.
eng_df

Unnamed: 0.1,Unnamed: 0,ID,Sentence,URL,Sentiment,translated
0,0,1,本項においては、将来に関する事項が含まれていますが、当該事項は2020年３月31日現在におい...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Not Available,This section includes matters related to the f...
1,1,2,トヨタは経営の基本方針を「トヨタ基本理念」として掲げており、その実現に向けた努力が、企業価値...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Not Available,Toyota has set the basic management policy as ...
2,2,3,その内容は次のとおりです。,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Not Available,The content is as follows.
3,3,11,今後の世界経済は、新型コロナウイルスの影響により、多くの国・地域での急激な落ち込みが懸念されます。,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Not Available,"In the future, the world economy is concerned ..."
4,4,12,自動車の生産面、販売面にも既に大きな影響が及んでいます。,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Not Available,It has already had a major impact on car produ...
...,...,...,...,...,...,...
891,891,825,現時点においてかかる調査の進展、結果および終結の時期ならびにそのエプソンの経営成績および今後...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Negative,It is difficult to predict the progress of the...
892,892,826,エプソンは、プリンティングソリューションズ事業、ビジュアルコミュニケーション事業およびウエア...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Negative,Epson develops a variety of business activitie...
893,893,832,しかしながら、常に有効な内部統制システムを構築および運用できる保証はなく、また、内部統制シス...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Negative,"However, there is no guarantee that you can al..."
894,894,836,エプソンは、研究開発、調達、製造、物流、販売およびサービスの拠点を世界に展開していますが、こ...,https://disclosure2dl.edinet-fsa.go.jp/searchd...,Negative,Epson has developed the bases of research and ...


In [None]:
from tqdm import tqdm
# Build the augmented dataset: rows labelled "Not Available" or "Positive" are
# copied as-is, "Neutral" rows get 3 paraphrases, and every other label gets 4.
# Paraphrases are added before the original sentence, all with the row's label.
# NOTE(review): this augments every row of eng_df, including rows that were
# placed in test_eng by the earlier split - confirm this is intended.
content = eng_df["translated"].tolist()
impact = eng_df["Sentiment"].tolist()
final_content = []
final_impact = []

for sentence, label in tqdm(zip(content, impact), total=len(impact)):
    if label in ("Not Available", "Positive"):
        n_paraphrases = 0
    elif label == "Neutral":
        n_paraphrases = 3
    else:
        n_paraphrases = 4

    if n_paraphrases:
        variants = paraphrase(sentence, num_return_sequences=n_paraphrases)
    else:
        variants = []

    final_content.extend(variants)
    final_content.append(sentence)
    final_impact.extend([label] * (len(variants) + 1))

100%|██████████| 896/896 [18:35<00:00,  1.25s/it]


In [None]:
# NOTE(review): a bare `print` only evaluates to the built-in function object and
# prints nothing. Looks like a leftover - TODO confirm and remove.
print

In [None]:
# Collect the augmented sentences and their labels into a single frame.
# NOTE(review): this rebinds the name `paraphrase`, which until now referred to
# the paraphrasing function; re-running the augmentation loop after this cell
# fails until the cell defining that function is executed again.
paraphrase = pd.DataFrame({"news_content": final_content, "impact_type": final_impact})

In [None]:
# Class balance of the labels after augmentation.
paraphrase["impact_type"].value_counts()

Positive         460
Not Available    316
Neutral          284
Negative         245
Name: impact_type, dtype: int64

In [None]:
# Write the augmented data to the current working directory.
# index=False keeps the RangeIndex out of the file, so reloading it does not
# add another "Unnamed: 0" column like the ones already present in the input CSV.
paraphrase.to_csv("Japanese_Paraphrased.csv", index=False)

In [None]:
# test_eng only keeps the "translated" and "Sentiment" columns, so the previous
# "news_content" lookup raised KeyError. Index label 697 is also not guaranteed
# to be in this split, so show the first test sentence by position instead.
test_eng["translated"].iloc[0]

'UK Secretary of State for Work and Pensions, Thérèse Coffey unveiled plans for new climate risk disclosures from pension schemes. The new plans will initially apply to the 100 largest occupational pension schemes – those with £5 billion or more in assets, and including all authorised master trusts – requiring them to assess and report on the financial risks of climate change within their portfolios by the end of 2022. The following year, the rule will be extended to schemes with £1 billion or more in assets.'

In [None]:
# First row of the held-out split, selected by position.
test_eng.iloc[0]

news_content    UK Secretary of State for Work and Pensions, T...
impact_type                                                     0
Name: 697, dtype: object