<a href="https://colab.research.google.com/github/s295103/Tweetsumm/blob/main/Tweetsumm_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clone the Tweetsumm and XSummarization (COMET) repositories

In [None]:
import os

In [None]:
# Always work from a fresh checkout: remove a clone left over from an earlier
# run (git refuses to clone into an existing non-empty directory), then clone
# the repository into ./Tweetsumm.
if os.path.isdir("./Tweetsumm"):
  !rm -r Tweetsumm
!git clone https://github.com/s295103/Tweetsumm.git

In [None]:
# Same fresh-checkout pattern for the XSummarization repository, whose
# Comet/ sub-package is imported further down in this notebook.
if os.path.isdir("./XSummarization"):
  !rm -r XSummarization
!git clone https://github.com/s295103/XSummarization.git

# Download the Tweetsumm TWCS file

In [None]:
!pip install kaggle
!mkdir ~/.kaggle
!cp Tweetsumm/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download thoughtvector/customer-support-on-twitter
!unzip customer-support-on-twitter.zip -d Tweetsumm/data


# Install COMET requirements

In [None]:
!bash XSummarization/Comet/download_model.sh
!pip install -U sentence-transformers

import torch
if torch.cuda.is_available():
  device = "cuda"
  !pip install -r XSummarization/Comet/requirement-gpu.txt
else:
  device = "cpu"
  !pip install -r XSummarization/Comet/requirement-cpu.txt


# Define the preprocessing function

In [None]:
from Tweetsumm.tweet_sum_processor import TweetSumProcessor
from XSummarization.Comet.comet import Comet, generate_commonsense
from google.colab import files
import json
import warnings
from tqdm.notebook import tqdm

def _write_commonsense_jsonl(path, dialog_with_summaries, comet, get_texts):
  """Write one commonsense record per dialogue to `path`, one JSON object per line.

  `get_texts` receives a dialogue-with-summaries item and returns the objects
  (turns or summaries) whose `str()` form is passed to `generate_commonsense`.
  """
  with open(path, "w", encoding="utf-8") as f:
    for d in tqdm(dialog_with_summaries):
      texts = [str(t) for t in get_texts(d)]
      record = {
          "dialog_id": d.get_dialog().get_dialog_id(),
          "commonsense": [generate_commonsense(comet, t) for t in texts],
      }
      # Terminate every record with a newline. Without a delimiter the JSON
      # objects are written back to back and the file cannot be read record
      # by record.
      f.write(json.dumps(record) + "\n")


def sick_preprocessing(twcs_file, input_data, output_data, commonsense_dialog, commonsense_summ=None):
  """Convert one Tweetsumm split and generate COMET commonsense for it.

  Args:
    twcs_file: path handed to `TweetSumProcessor`.
    input_data: path of the Tweetsumm split; its lines are passed to
      `processor.get_dialog_with_summaries`.
    output_data: destination for the converted dialogues, one `get_json()`
      result per line.
    commonsense_dialog: destination for the per-turn commonsense records
      (`{"dialog_id": ..., "commonsense": [...]}`, one JSON object per line).
    commonsense_summ: optional destination for the same kind of records built
      from each dialogue's abstractive summaries; skipped when falsy.

  Note:
    Warnings are silenced process-wide and stay silenced after the call.
  """
  warnings.filterwarnings('ignore')
  processor = TweetSumProcessor(twcs_file)
  comet = Comet("./comet-atomic_2020_BART")
  comet.model.zero_grad()

  with open(input_data, "r", encoding="utf-8") as f:
    # Materialise the result: it is iterated up to three times below.
    dialog_with_summaries = list(processor.get_dialog_with_summaries(f.readlines()))

  with open(output_data, "w", encoding="utf-8") as f:
    print("Refactoring dialogue data...")
    f.write("\n".join(d.get_json() for d in dialog_with_summaries))

  print("Generating commonsense data for dialogues...")
  _write_commonsense_jsonl(
      commonsense_dialog,
      dialog_with_summaries,
      comet,
      lambda d: d.get_dialog().get_turns(),
  )

  if commonsense_summ:
    print("Generating commonsense data for summaries...")
    _write_commonsense_jsonl(
        commonsense_summ,
        dialog_with_summaries,
        comet,
        lambda d: d.get_abstractive_summaries(),
    )


# Apply preprocessing to the data

In [None]:
# Inputs: the TWCS csv extracted from the Kaggle archive and the three
# Tweetsumm split files from the cloned repository.
TWCS_FILE_PATH = "/content/Tweetsumm/data/twcs/twcs.csv"
TRAIN_FILE_PATH = "/content/Tweetsumm/tweet_sum_data_files/final_train_tweetsum.jsonl"
VALID_FILE_PATH = "/content/Tweetsumm/tweet_sum_data_files/final_valid_tweetsum.jsonl"
TEST_FILE_PATH = "/content/Tweetsumm/tweet_sum_data_files/final_test_tweetsum.jsonl"

# Outputs: converted dialogues.
CLEAN_TRAIN_FILE_PATH = "/content/clean_data/tweetsumm_train.json"
CLEAN_VALID_FILE_PATH = "/content/clean_data/tweetsumm_validation.json"
CLEAN_TEST_FILE_PATH = "/content/clean_data/tweetsumm_test.json"

# Outputs: COMET commonsense for dialogues (all splits) and summaries (train only).
COMET_DIALOGUE_TRAIN_FILE_PATH = "/content/COMET_data/tweetsumm/dialogue/comet_train.json"
COMET_DIALOGUE_VALID_FILE_PATH = "/content/COMET_data/tweetsumm/dialogue/comet_validation.json"
COMET_DIALOGUE_TEST_FILE_PATH = "/content/COMET_data/tweetsumm/dialogue/comet_test.json"
COMET_SUMMARY_TRAIN_FILE_PATH = "/content/COMET_data/tweetsumm/summary/comet_train.json"

# Create the parent directory of every output file. Deriving the directories
# from the paths that are actually written keeps the two in sync, and
# exist_ok=True makes the cell safe to re-run even when only part of the tree
# already exists.
for out_path in (
    CLEAN_TRAIN_FILE_PATH,
    CLEAN_VALID_FILE_PATH,
    CLEAN_TEST_FILE_PATH,
    COMET_DIALOGUE_TRAIN_FILE_PATH,
    COMET_DIALOGUE_VALID_FILE_PATH,
    COMET_DIALOGUE_TEST_FILE_PATH,
    COMET_SUMMARY_TRAIN_FILE_PATH,
):
  os.makedirs(os.path.dirname(out_path), exist_ok=True)

print("Processing training data")
sick_preprocessing(TWCS_FILE_PATH, TRAIN_FILE_PATH, CLEAN_TRAIN_FILE_PATH, COMET_DIALOGUE_TRAIN_FILE_PATH, COMET_SUMMARY_TRAIN_FILE_PATH)

# Validation and test splits get dialogue commonsense only (no summary file).
print("Processing validation data")
sick_preprocessing(TWCS_FILE_PATH, VALID_FILE_PATH, CLEAN_VALID_FILE_PATH, COMET_DIALOGUE_VALID_FILE_PATH)

print("Processing test data")
sick_preprocessing(TWCS_FILE_PATH, TEST_FILE_PATH, CLEAN_TEST_FILE_PATH, COMET_DIALOGUE_TEST_FILE_PATH)


# Download the data

In [None]:
# Archive each output directory and hand the archive to the browser through
# the Colab `files` helper (imported in the preprocessing-function cell).
!zip -r tweetsumm_clean.zip clean_data
files.download("tweetsumm_clean.zip")
!zip -r tweetsumm_comet.zip COMET_data
files.download("tweetsumm_comet.zip")