# Preprocessing for Tweetsumm dataset
This notebook is used to obtain structured JSON files that can be used as input for the Comet model.
The output files have the following structure:
- ```train_dialogs```, ```test_dialogs``` and ```val_dialogs``` are dictionaries where each key is the unique ID of a conversation and the corresponding value is a list of the sentences of that conversation (for each sentence we kept track of its original text and the author of that sentence)
- ```train_summaries```, ```test_summaries``` and ```val_summaries``` are dictionaries whose keys are again the IDs of the conversations and whose values each contain one of the given summaries as a list of sentences.

In [1]:
import pandas as pd
import json
import jsonlines
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
tqdm.pandas()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Teresa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# load the dataset containing all the tweets and preview the first rows
tweets_csv_path = 'datasets/all_tweets/twcs.csv'
df = pd.read_csv(tweets_csv_path)
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [3]:
# function to remove URLs from text
def remove_urls(text):
    """Return `text` with every http:// or https:// URL stripped out.

    A URL is matched from its scheme up to the next whitespace character;
    the surrounding whitespace itself is left untouched.
    """
    return re.sub(r'https?://\S+', '', text)

# strip URLs from the 'text' column and tokenize each tweet into sentences.
# Both steps run in a single pass. Rows that already hold a list of sentences
# are left as they are, so re-running this cell does not fail by feeding
# lists to the URL regex.
df["text"] = df["text"].progress_apply(
    lambda text: text if isinstance(text, list) else sent_tokenize(remove_urls(text))
)
df.head()

100%|██████████| 2811774/2811774 [05:33<00:00, 8427.44it/s] 


Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,"[@115712 I understand., I would like to assist...",2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,[@sprintcare and how do you propose we do that],,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,[@sprintcare I have sent several private messa...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,[@115712 Please send us a Private Message so t...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,[@sprintcare I did.],4.0,6.0


In [None]:
# train, test, and validation summaries
def load_jsonl(path):
    """Read every record of the JSON Lines file at `path` into a list."""
    with jsonlines.open(path) as reader:
        return list(reader)


train_tw = load_jsonl('datasets/tweetsumm/train_tweetsum.jsonl')
test_tw = load_jsonl('datasets/tweetsumm/test_tweetsum.jsonl')
val_tw = load_jsonl('datasets/tweetsumm/valid_tweetsum.jsonl')

In [None]:
# report how many dialogues each split contains
for label, split_records in (
    ('number of train dialogues: ', train_tw),
    ('number of test dialogues: ', test_tw),
    ('number of val dialogues: ', val_tw),
):
    print(label, len(split_records))

number of train dialogues:  879
number of test dialogues:  110
number of val dialogues:  110


In [None]:
# each dialogue is a dictionary; show its keys and then every key/value pair
first_dialogue = train_tw[0]
print(first_dialogue.keys())
for key, value in first_dialogue.items():
    print(key, ":", value)

dict_keys(['conversation_id', 'tweet_ids_sentence_offset', 'annotations'])
conversation_id : b065262210783596c1fe79466b8f8985
tweet_ids_sentence_offset : [{'tweet_id': 87076, 'sentence_offsets': ['[0, 140]', '[141, 151]', '[152, 175]']}, {'tweet_id': 87075, 'sentence_offsets': ['[0, 36]']}, {'tweet_id': 87074, 'sentence_offsets': ['[0, 40]', '[41, 139]']}, {'tweet_id': 87073, 'sentence_offsets': ['[0, 61]']}, {'tweet_id': 87072, 'sentence_offsets': ['[0, 18]', '[19, 87]']}, {'tweet_id': 87071, 'sentence_offsets': ['[0, 75]']}, {'tweet_id': 87070, 'sentence_offsets': ['[0, 15]', '[16, 94]', '[95, 162]']}, {'tweet_id': 87069, 'sentence_offsets': ['[0, 72]']}, {'tweet_id': 87068, 'sentence_offsets': ['[0, 55]', '[56, 134]', '[135, 211]', '[212, 235]']}]
annotations : [{'extractive': [{'tweet_id': 87076, 'sentence_offset': '[0, 140]'}, {'tweet_id': 87074, 'sentence_offset': '[41, 139]'}, {'tweet_id': 87073, 'sentence_offset': '[0, 61]'}], 'abstractive': ['Customer enquired about his Iphone

In [82]:
# example:
# retrieve a conversation and its abstractive summaries:
dialogue_number = 0

print('retrieved conversation:')
for dic in train_tw[dialogue_number]['tweet_ids_sentence_offset']:
    # filter the tweet table once per tweet (and do not shadow the built-in `id`)
    tweet_row = df[df['tweet_id'] == dic['tweet_id']]
    print(tweet_row['author_id'].values[0], ':', tweet_row['text'].values[0])

print()
print('abstractive summaries:')
for dic in train_tw[dialogue_number]['annotations']:
    print(dic['abstractive'])

135060 : So neither my iPhone nor my Apple Watch are recording my steps/activity, and Health doesn’t recognise either source anymore for some reason. Any ideas? https://t.co/m9DPQbkftD
135060 : @AppleSupport please read the above.
AppleSupport : @135060 Let’s investigate this together. To start, can you tell us the software versions your iPhone and Apple Watch are running currently?
135060 : @AppleSupport My iPhone is on 11.1.2, and my watch is on 4.1.
AppleSupport : @135060 Thank you. Have you tried restarting both devices since this started happening?
135060 : @AppleSupport I’ve restarted both, also un-paired then re-paired the watch.
AppleSupport : @135060 Got it. When did you first notice that the two devices were not talking to each other. Do the two devices communicate through other apps such as Messages?
135060 : @AppleSupport Yes, everything seems fine, it’s just Health and activity.
AppleSupport : @135060 Let’s move to DM and look into this a bit more. When reaching out in DM,

In [237]:
def retrieve_dialogue(split, dialogue_number):
    """Collect the sentences of one conversation together with their authors.

    Args:
        split: sequence of conversation records; each record has a
            'tweet_ids_sentence_offset' list whose items carry a 'tweet_id'.
        dialogue_number: position of the conversation inside `split`.

    Returns:
        A list of ``{'sentence': ..., 'author_id': ...}`` dicts, in the order
        the tweets are listed in the record and the sentences appear in each
        tweet.

    Raises:
        IndexError: if a referenced tweet id is not present in the global
            tweet table `df`.

    Note:
        Reads the module-level DataFrame `df` and iterates over the value of
        its 'text' column, so that column is expected to hold one list of
        sentences per tweet (i.e. the tokenization step must have been run).
    """
    dialog = []
    for entry in split[dialogue_number]['tweet_ids_sentence_offset']:
        tweet_id = entry['tweet_id']
        # Filter the (large) tweet table once per tweet instead of once per column.
        match = df[df['tweet_id'] == tweet_id]
        if len(match) == 0:
            raise IndexError(f"tweet_id {tweet_id!r} not found in the tweet table")
        author_id = match['author_id'].values[0]
        sentences = match['text'].values[0]
        dialog.extend(
            {'sentence': sentence, 'author_id': author_id} for sentence in sentences
        )
    return dialog

def retrieve_summary(split, dialogue_number, number_of_summaries='all'):
    """Return the abstractive summaries annotated for one conversation.

    Args:
        split: sequence of conversation records, each with an 'annotations'
            list whose items carry an 'abstractive' summary.
        dialogue_number: position of the conversation inside `split`.
        number_of_summaries: stop after this many summaries have been
            collected; with the default 'all' every summary is collected.

    Returns:
        The first summary itself when `number_of_summaries` is 1, otherwise
        the list of collected summaries.
    """
    collected = []
    annotations = split[dialogue_number]['annotations']
    for count, annotation in enumerate(annotations, start=1):
        collected.append(annotation['abstractive'])
        # `count` always equals the number of summaries gathered so far
        if count == number_of_summaries:
            break
    return collected[0] if number_of_summaries == 1 else collected

In [200]:
# map every original conversation_id to a positional key:
# train_0, train_1, ... test_0, test_1, ... val_0, val_1, ...
train_dict_key = {
    record['conversation_id']: 'train_' + str(position)
    for position, record in enumerate(train_tw)
}
test_dict_key = {
    record['conversation_id']: 'test_' + str(position)
    for position, record in enumerate(test_tw)
}
val_dict_key = {
    record['conversation_id']: 'val_' + str(position)
    for position, record in enumerate(val_tw)
}

In [234]:
# function to retrieve all the dialogues and summaries of train/test/val
def retrieve_all_dialogues(split):
    """Build the dialogue and summary dictionaries for a whole split.

    Args:
        split: one of the module-level lists `train_tw`, `test_tw` or
            `val_tw` (matched by equality).

    Returns:
        A ``(dialogs, summaries)`` pair of dicts keyed by the renamed
        conversation id; `dialogs` holds the output of `retrieve_dialogue`
        and `summaries` holds a single summary per conversation.

    Raises:
        ValueError: if `split` equals none of the three known splits.
    """
    # pick the id-renaming table that belongs to the given split
    if split == train_tw:
        key_dict = train_dict_key
    elif split == test_tw:
        key_dict = test_dict_key
    elif split == val_tw:
        key_dict = val_dict_key
    else:
        raise ValueError("split must be train_tw, test_tw or val_tw")

    dialogs, summaries = {}, {}
    for position in tqdm(range(len(split))):
        renamed_id = key_dict[split[position]['conversation_id']]
        dialogs[renamed_id] = retrieve_dialogue(split, position)
        # only one summary per dialogue
        summaries[renamed_id] = retrieve_summary(split, position, 1)
    return dialogs, summaries

In [241]:
# build the (dialogs, summaries) pair of every split, in train/test/val order
(
    (train_dialogs, train_summaries),
    (test_dialogs, test_summaries),
    (val_dialogs, val_summaries),
) = (retrieve_all_dialogues(part) for part in (train_tw, test_tw, val_tw))

100%|██████████| 879/879 [01:48<00:00,  8.10it/s]
100%|██████████| 110/110 [00:12<00:00,  8.68it/s]
100%|██████████| 110/110 [00:12<00:00,  8.73it/s]


In [248]:
# specify the file paths where the JSON data is saved.
# All six files live in the same directory. Building the paths from one base
# directory fixes the two validation paths, which previously pointed at
# misspelled directories.
output_dir = "datasets/processed_tweetsumm_data"
json_file_paths = tuple(
    f"{output_dir}/{file_stem}.json"
    for file_stem in (
        "train_dialogs", "train_summaries",
        "test_dialogs", "test_summaries",
        "val_dialogs", "val_summaries",
    )
)
data_to_save = [train_dialogs, train_summaries, test_dialogs, test_summaries, val_dialogs, val_summaries]

# Open each file in write mode and use json.dump to save the data.
# The loop variable has its own name so that the `data_to_save` list is not
# overwritten and the cell can be re-run.
for json_file_path, split_data in zip(json_file_paths, data_to_save):
    with open(json_file_path, 'w') as json_file:
        json.dump(split_data, json_file)
    print(f'file saved at {json_file_path}')