#Clone the repository and move into the Tweetsumm extension directory

In [None]:
import os
!git clone https://github.com/sergiuabed/SICK_Summarization.git
ROOT = "/content/SICK_Summarization/extension_2/Tweetsumm"
%cd $ROOT

#Download Tweetsumm TWCS file

In [None]:
!pip install kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download thoughtvector/customer-support-on-twitter
!unzip customer-support-on-twitter.zip -d data


#Install COMET requirements

In [None]:
!bash Comet/download_model.sh
!pip install -U sentence-transformers

import torch
if torch.cuda.is_available():
  DEVICE = "cuda"
  !pip install -r Comet/requirement-gpu.txt
else:
  DEVICE = "cpu"
  !pip install -r Comet/requirement-cpu.txt


#Define preprocessing function

In [None]:
from tweet_sum_processor import TweetSumProcessor
from Comet.comet import Comet, generate_commonsense
from Comet.sbert import select_best_commonsense
from sentence_transformers import SentenceTransformer, util
from google.colab import files
import json
import warnings
from tqdm.notebook import tqdm

# Location of the TWCS CSV that is handed to TweetSumProcessor. It is expected
# under the `data` directory that the Kaggle archive is unzipped into
# (presumably the archive contains twcs/twcs.csv; verify after download).
TWCS_FILE_PATH = f"{ROOT}/data/twcs/twcs.csv"

def _write_json(path, payload):
  """Write *payload* to *path* as JSON indented by 4 spaces."""
  with open(path, "w") as f:
    json.dump(payload, f, indent=4)


def _generate_commonsense_files(comet, sbert, dialogs, get_texts, comet_path, sbert_path):
  """Run COMET and SBERT selection over every text of every dialog and save both results.

  Args:
    comet: COMET model wrapper passed to ``generate_commonsense``.
    sbert: sentence-transformers model passed to ``select_best_commonsense``.
    dialogs: parsed dialog dicts, each carrying a ``"dialog_id"`` key.
    get_texts: callable returning the sequence of texts to process for a dialog.
    comet_path: output file for the raw COMET inferences.
    sbert_path: output file for the SBERT-selected inferences.
  """
  comet_data = {}
  sbert_data = {}
  for dialog in tqdm(dialogs):
    dialog_id = dialog["dialog_id"]
    comet_data[dialog_id] = {}
    sbert_data[dialog_id] = {}
    for i, text in enumerate(get_texts(dialog)):
      comet_inference = generate_commonsense(comet, text)
      comet_data[dialog_id][i] = comet_inference
      sbert_data[dialog_id][i] = select_best_commonsense(sbert, comet_inference)
  # Write only after all inference succeeded, so a failure part-way through
  # does not leave freshly truncated, empty output files behind.
  _write_json(comet_path, comet_data)
  _write_json(sbert_path, sbert_data)


def tweetsumm_preprocessing(input_data, split, summary=False):
  """Convert one Tweetsumm split into dialogue JSON plus commonsense JSON files.

  Files written, relative to the current working directory:
    * ``./dialogue_data/tweetsumm_{split}.json``: turns and abstractive
      summaries keyed by dialog id.
    * ``./COMET_data/tweetsumm/comet_inference/dialogue/comet_dialogue_{split}.json``
      and ``./COMET_data/tweetsumm/sbert/dialogue/sbert_dialogue_{split}.json``:
      per-turn COMET inferences and their SBERT selection.
    * When ``summary`` is true, the matching ``comet_summary_{split}.json`` and
      ``sbert_summary_{split}.json`` files under the ``summary`` directories.

  Args:
    input_data: path of the Tweetsumm JSONL file for this split.
    split: split name, used only to build the output file names.
    summary: when true, also generate commonsense for each abstractive summary.

  Returns:
    None. All results are written to disk.
  """
  warnings.filterwarnings('ignore')
  processor = TweetSumProcessor(TWCS_FILE_PATH)
  comet = Comet("./comet-atomic_2020_BART")
  comet.model.zero_grad()
  sbert = SentenceTransformer("all-MiniLM-L6-v2").to(DEVICE)

  # Create every leaf directory unconditionally. exist_ok makes this a no-op
  # on later calls and also repairs a partially created tree, which a single
  # check on ./COMET_data alone would leave broken.
  for directory in (
    "./dialogue_data",
    "./COMET_data/tweetsumm/comet_inference/dialogue",
    "./COMET_data/tweetsumm/comet_inference/summary",
    "./COMET_data/tweetsumm/sbert/dialogue",
    "./COMET_data/tweetsumm/sbert/summary",
  ):
    os.makedirs(directory, exist_ok=True)

  # Explicit encoding so reading does not depend on the platform default.
  with open(input_data, "r", encoding="utf-8") as f:
    dialog_with_summaries = processor.get_dialog_with_summaries(f.readlines())

  # Parse each dialog once and reuse the result in every pass below.
  dialogs = [json.loads(d.get_json()) for d in dialog_with_summaries]

  print("Generating dialogue data...")
  data = {}
  for dialog in dialogs:
    data[dialog["dialog_id"]] = {"turns":dialog["turns"], "summaries":dialog["summaries"]["abstractive_summaries"]}
  _write_json(f"./dialogue_data/tweetsumm_{split}.json", data)

  print("Generating commonsense data for dialogues...")
  _generate_commonsense_files(
    comet,
    sbert,
    dialogs,
    lambda dialog: dialog["turns"],
    f"./COMET_data/tweetsumm/comet_inference/dialogue/comet_dialogue_{split}.json",
    f"./COMET_data/tweetsumm/sbert/dialogue/sbert_dialogue_{split}.json",
  )

  if summary:
    print("Generating commonsense data for summaries...")
    _generate_commonsense_files(
      comet,
      sbert,
      dialogs,
      lambda dialog: dialog["summaries"]["abstractive_summaries"],
      f"./COMET_data/tweetsumm/comet_inference/summary/comet_summary_{split}.json",
      f"./COMET_data/tweetsumm/sbert/summary/sbert_summary_{split}.json",
    )

#Apply preprocessing to the data

In [None]:
TRAIN_FILE_PATH = f"{ROOT}/tweet_sum_data_files/final_train_tweetsum.jsonl"
VALID_FILE_PATH = f"{ROOT}/tweet_sum_data_files/final_valid_tweetsum.jsonl"
TEST_FILE_PATH = f"{ROOT}/tweet_sum_data_files/final_test_tweetsum.jsonl"

# One row per split: (progress message, input file, split name, summary flag).
# Only the training split also gets commonsense generated for its summaries.
_SPLIT_JOBS = (
  ("Processing training data", TRAIN_FILE_PATH, "train", True),
  ("Processing validation data", VALID_FILE_PATH, "valid", False),
  ("Processing test data", TEST_FILE_PATH, "test", False),
)

for _message, _jsonl_path, _split_name, _with_summary in _SPLIT_JOBS:
  print(_message)
  tweetsumm_preprocessing(_jsonl_path, _split_name, _with_summary)


#Download the generated dialogue and commonsense data as zip archives

In [None]:
# Archive the dialogue_data directory (the per-split dialogue JSON files) and
# pass the archive to Colab's files.download.
!zip -r tweetsumm_clean.zip dialogue_data
files.download("tweetsumm_clean.zip")
# Same for the COMET_data directory (COMET inferences and SBERT selections).
!zip -r tweetsumm_comet.zip COMET_data
files.download("tweetsumm_comet.zip")