In [None]:
import re

import pandas as pd
from tqdm.notebook import tqdm_notebook

# Hook tqdm's notebook progress bar into pandas; the labelling cell further down
# calls `progress_apply` on a DataFrame column and relies on this registration.
tqdm_notebook.pandas()

In [None]:
# Read the input articles from the local parquet file (presumably the
# pre-filtered Reuters news set, judging by the file name).
df = pd.read_parquet(path="temp_reuter_filtered.parquet")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single checkpoint identifier shared by tokenizer and model so the two
# cannot accidentally point at different checkpoints.
MODEL_NAME = "togethercomputer/Pythia-Chat-Base-7B-v0.16"

# Tokenizer first, then the causal LM in 8-bit with automatic device placement.
# NOTE(review): newer transformers releases may prefer
# `quantization_config=BitsAndBytesConfig(load_in_8bit=True)` over the bare
# `load_in_8bit` flag -- confirm against the installed version.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    load_in_8bit=True,
)

In [None]:
# The tokenizer may ship without a dedicated padding token; when that is the
# case, reuse the end-of-sequence token for padding on both the model config
# and the tokenizer.
if tokenizer.pad_token is None:
    model.config.pad_token_id = model.config.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Few-shot prompt prefix for the ESG impact-duration classifier.
# It states the label scheme (0 = below 2 years, 1 = 2 to 5 years,
# 2 = more than 5 years, -1 = not applicable), gives three labelled example
# news items, and ends with "Now predict: " so that the article text can be
# appended directly after it (see `generate_pythia_model`).
# NOTE(review): the wording "between 2 and 5 year" is part of the runtime
# prompt and is intentionally left untouched here -- changing prompt text may
# change model outputs.
template = """ Given the following news, output 0 if the ESG impact duration is below 2 years, 1 if the ESG impact duration is between 2 and 5 year and 2 if the ESG impact duration is more than 5 years.  If it's not applicable, output -1. You only need to output the number, and do not need any further explanation.

Example:
"EU Regulators Welcome, Critique New European Sustainability Reporting Standards || Europe’s three primary financial regulatory agencies, the European Supervisory Authorities (ESAs) each announced the release of their opinions on the first set of draft European Sustainability Reporting Standards, which set out the rules and requirements for companies to report on sustainability-related impacts, opportunities and risks under the EU’s upcoming Corporate Sustainable Reporting Directive (CSRD)."
0
"Red States Sue to Stop Biden Administration Rule Allowing ESG Investing in $12 Trillion of Retirement Plans || In a statement announcing the new action, Texas AG Ken Paxton, who is co-leading the lawsuit, said that the DOL’s rule will prioritize “woke” ESG investing over protecting the retirement savings of workers."
1
"ArcelorMittal Leads $120 Million Capital Raise for Green Steel Startup Boston Metal || Brandon Middaugh, director, Microsoft Climate Innovation Fund, said: “Microsoft’s Climate Innovation Fund was created to accelerate technology development and deployment in areas that will have the most meaningful impact on climate. The technology Boston Metal is developing has the potential to deliver affordable green steel at scale, helping to drive cross-industry decarbonization, which is increasingly critical for companies with carbon reduction targets, such as Microsoft.”"
2

Now predict: """

In [None]:
# Labels the prompt in `template` allows the model to answer with.
_VALID_ESG_LABELS = frozenset({-1, 0, 1, 2})


def _parse_esg_label(text):
  """Return the first integer found in `text` if it is a valid label, else None."""
  match = re.search(r"-?\d+", text)
  if match is None:
    return None
  label = int(match.group())
  return label if label in _VALID_ESG_LABELS else None


def generate_pythia_model(sentence):
  """Classify the ESG impact duration of one news item with the Pythia chat model.

  The article is appended to the few-shot `template`, wrapped in the
  `<human>: ... \\n<bot>:` chat format, and the model generates at most two
  new tokens. Only those newly generated tokens are decoded and parsed.

  Args:
    sentence: News text to classify.

  Returns:
    -1, 0, 1 or 2 when the model's answer starts with one of those labels;
    None when `sentence` is not a non-empty string or when no valid label
    can be parsed from the answer.

  Note:
    Generation samples (`do_sample=True`, temperature 0.2), so repeated calls
    on the same text are not guaranteed to return the same label.
  """
  # Missing / empty articles cannot be classified; skip the model call.
  if not isinstance(sentence, str) or not sentence.strip():
    return None

  prompt = f"<human>: {template}{sentence}\n<bot>:"
  inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
  outputs = model.generate(**inputs,
                           max_new_tokens=2,
                           do_sample=True,
                           temperature=0.2,
                           # Pass the pad token explicitly so generation does
                           # not depend on model-level defaults.
                           pad_token_id=tokenizer.pad_token_id)

  # The returned sequence starts with the prompt tokens. Slice them off so the
  # digits inside the template ("-1", "2 years", ...) or the article itself
  # can never be mistaken for the model's answer.
  prompt_length = inputs["input_ids"].shape[-1]
  bot_response = tokenizer.decode(outputs[0][prompt_length:],
                                  skip_special_tokens=True)

  return _parse_esg_label(bot_response)

In [None]:
# Run the classifier over every article text (with a progress bar) and store
# the resulting labels as a new column.
pythia_labels = df["text"].progress_apply(generate_pythia_model)
df["pythia_label"] = pythia_labels

In [None]:
# Number of rows per predicted label. `dropna=False` keeps the rows for which
# no label could be parsed (None/NaN) as their own group, so failed
# predictions show up in the summary instead of silently disappearing;
# `size()` is used because `count()` would report 0 for that group.
df.groupby("pythia_label", dropna=False).size()

In [None]:
# Persist the frame, including the new `pythia_label` column, for later use.
df.to_parquet(path="reuter_pythia.parquet")