In [25]:
# ------------------------- # 
#        SET - UP           # 
# ------------------------- # 

# ---- Requirements ----- # 

#!pip install datasets
#!pip install sentencepiece
#!pip install transformers
#!pip install jsonlines

import csv
import datasets
from google.colab import drive
import huggingface_hub
import jsonlines
import json
import pandas as pd
import re
import sys

# ----- Check if GPU is connected ----- # 
gpu_info = !nvidia-smi -L
gpu_info = "\n".join(gpu_info)
if gpu_info.find("failed") >= 0:
    print("Not connected to a GPU")
else:
    print(gpu_info)

# ----- Mounting Google Drive ----- # 

drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/CIS6930_final')

# ----- Importing TweetSum processing module ----- #
from tweet_sum_processor import TweetSumProcessor

# ----------------------------------------------------------------------

# ------------------------- # 
#  PRE-PROCESSING FUNCTIONS # 
# ------------------------- # 


def get_inputs(json_format):
  '''
  ---------------------
  Input: Dictionary containing the metadata for one tweet conversation.
         Only json_format['dialog']['turns'][k]['sentences'] is read.
  Output: Concatenated string containing the content of one conversation.

  Notes:
          The sentences of each turn are joined with spaces and every turn is
          followed by the marker " <BR>" (transition between speakers).
          Any space-separated token containing "https" becomes "<LINK>".
          Anonymized usernames ("@" followed only by digits) are removed, as
          they do not add value to the text; they are usually just located at
          the beginning of the tweet by default (feature of threads). They are
          removed even when punctuation is attached ("@115712," / ".@115712"),
          so the numeric id never leaks into the text. Usernames containing
          the name of the business are retained for contextual purpose
          (only their "@" is stripped by the character filter below).
          Finally every character other than letters, digits, ",!.?<>" and
          the space is dropped, and runs of the same non-word character
          (e.g. "!!", "...", double spaces) are collapsed to a single one.
  ---------------------
  '''
  turns = json_format['dialog']['turns']
  conversation = ' '.join(' '.join(turn['sentences']) + " <BR>" for turn in turns)

  kept_tokens = []
  for token in conversation.split(' '):
    if "https" in token:
      kept_tokens.append("<LINK>")
      continue
    # Ignore punctuation glued to the handle before deciding whether it is an
    # anonymized (purely numeric) username.
    handle = token.strip('.,!?:;()')
    if handle.startswith('@') and handle[1:].isnumeric():
      # Keep an empty placeholder: the doubled space it produces is collapsed
      # by the repeated-character substitution below.
      kept_tokens.append('')
      continue
    kept_tokens.append(token)

  text = ' '.join(kept_tokens)
  # Whitelist filter: this is also what strips the "@" of business usernames.
  text = re.sub(r'[^a-zA-Z0-9,!.?<> ]', '', text)
  # Remove a non-word character whenever the same character follows it.
  text = re.sub(r'(\W)(?=\1)', '', text)
  return text
  
def get_summary(json_format):
  '''
  ---------------------
  Input: Dictionary containing the metadata for one tweet conversation
  Output: The first abstractive summary of that conversation, with its
          sentences joined by single spaces into one string
  ---------------------
  '''
  # Only the first entry of the abstractive summaries is used.
  first_summary_sentences = json_format['summaries']['abstractive_summaries'][0]
  return ' '.join(first_summary_sentences)

def prepare_data(file_name, processor):
  '''
  Processing the TweetSum dataset so that it can be read as a HuggingFace dataset.
  ---------------------
  Input: Name of a dataset file inside /content/drive/MyDrive/CIS6930_final/
         and the TweetSum processor
  Output: Two lists of equal length: the input texts and the summaries for
          the given data, where inputs[k] belongs with summaries[k]

  Notes:
          A conversation whose processing raises TypeError is skipped
          entirely. Both the input text and the summary are computed before
          either one is stored, so a failure while building the summary can
          no longer leave an input without its matching summary.
  ---------------------
  '''
  inputs = []
  summaries = []
  with open('/content/drive/MyDrive/CIS6930_final/' + file_name) as f:
    dialog_with_summaries = processor.get_dialog_with_summaries(f.readlines())
    for dialog_with_summary in dialog_with_summaries:
      try:
        json_format = json.loads(dialog_with_summary.get_json())
        conversation_text = get_inputs(json_format)
        summary_text = get_summary(json_format)
      except TypeError:
        # Deliberate best-effort skip. Presumably get_json() yields None for
        # conversations the processor cannot rebuild, which makes json.loads
        # raise TypeError -- TODO confirm against tweet_sum_processor.
        continue
      inputs.append(conversation_text)
      summaries.append(summary_text)
  return inputs, summaries

# ----------------------------------------------------------------------

# ------------------------- # 
#           "MAIN"          # 
# ------------------------- # 

# --- "Fake" main function because this is a notebook and not a script :)

# ------ Process data
# One processor is built from the twcs.csv file and reused for all three
# splits. Each prepare_data call returns the (inputs, summaries) lists for
# one split file.

processor = TweetSumProcessor('/content/drive/MyDrive/CIS6930_final/kaggle_files/twcs.csv')
train_inputs, train_summs = prepare_data('final_train_tweetsum.jsonl', processor)
valid_inputs, valid_summs = prepare_data('final_valid_tweetsum.jsonl', processor)
test_inputs, test_summs = prepare_data('final_test_tweetsum.jsonl', processor)

# ----- Save as CSVs
# Each split becomes a two-column table ("inputs", "summaries") written to the
# project folder on Drive; index=False keeps the row index out of the file.

train = pd.DataFrame({"inputs": train_inputs, "summaries": train_summs})
train.to_csv('/content/drive/MyDrive/CIS6930_final/tweetsum_train.csv', index=False)

valid = pd.DataFrame({"inputs": valid_inputs, "summaries": valid_summs})
valid.to_csv('/content/drive/MyDrive/CIS6930_final/tweetsum_valid.csv', index=False)

test = pd.DataFrame({"inputs": test_inputs, "summaries": test_summs})
test.to_csv('/content/drive/MyDrive/CIS6930_final/tweetsum_test.csv', index=False)


Not connected to a GPU
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
