## Notebook with steps to recreate the dataset

1. Get the comments list from the dataset_comment_ids.txt
2. Download the comments using the YouTube Data API 
3. Use the labels to complete the data preprocessing 

### Step 1

You can authenticate with the Google API in various ways (OAuth, API keys, etc.).
We have provided the code to download the comments; all you need is an API key.
A key can be obtained by creating a project in your Google developer console and
enabling the YouTube Data API for that project.

The comment IDs are stored in comment_ids.txt.
The following cell runs get_comments_from_comment_ids.py and saves the downloaded comments in the folder 'raw_comments/'. Obtain a key and save it in keys.txt before running the cell.

In [None]:
# keys.txt should contain one API key per line
# comment_ids.txt contains the comment IDs
# raw_comments/ is the folder the downloaded comments are saved to

!python get_comments_from_comment_ids.py keys.txt comment_ids.txt raw_comments/ 

### Step 2 

1. Load comments from raw_comments/ 
2. Verify sha1 hash
3. Match with labelling info (labels) 

In [None]:
import pandas as pd 
import numpy as np 
import missingno as msno 
import seaborn as sns
import matplotlib.pyplot as plt 
from pprint import pprint
import random, math
from collections import Counter
from itertools import chain 
from utils import *
import json, os, sys 
from collections import defaultdict
import hashlib 

In [None]:
# Each entry is a file name found in raw_comments/. The verification cell
# below uses that same name as the comment ID when it looks up the comment
# text and its reference hash.
comment_list = os.listdir("raw_comments/")

In [None]:
# Retrieve comments and check sha1 hash value
#
# Every file in raw_comments/ is loaded and the SHA-1 digest of its comment
# text is compared with the digest stored in comment_ids_with_hash.json.
# Only comments whose digest matches end up in `all_comments_retrieved`
# (comment ID -> comment text).

# Text found in a comment file instead of the real comment when the comment
# is unavailable (presumably written by get_comments_from_comment_ids.py).
# It is compared verbatim, typo included, so it must not be "corrected".
REMOVED_PLACEHOLDER = "Comment has been removed by the user. To consruct full dataset, contact authors."

all_comments_retrieved = {}

with open("comment_ids_with_hash.json") as fp:
    doc = json.load(fp)

for comment in comment_list:
    comment_filepath = os.path.join("raw_comments", comment)
    with open(comment_filepath, encoding='utf-8') as fp:
        comment_doc = json.load(fp)

    # The file name doubles as the key of the comment inside its JSON file.
    comment_text = comment_doc[comment]
    if comment_text == REMOVED_PLACEHOLDER:
        continue

    # A file without a reference hash cannot be verified; report it and move
    # on instead of aborting the whole run with a KeyError.
    expected_digest = doc.get(comment)
    if expected_digest is None:
        print(f"No reference hash is recorded for comment {comment}; it cannot be verified and is skipped.")
        continue

    digest = hashlib.sha1(comment_text.encode('utf-8')).hexdigest()
    if digest != expected_digest:
        print(f"The comment {comment} has been removed or modified since the collection of this dataset, contact authors for full data.")
        continue

    all_comments_retrieved[comment] = comment_text

In [None]:
# Number of comments that passed the hash check in the previous cell.
len(all_comments_retrieved)

In [None]:
# Split comments into sentences.
#
# Each verified comment is cleaned by the utils helpers, applied innermost
# first: remove_punc, then remove_emoji, then shrink_delimiters. The result
# is stored per comment ID and is later indexed by sentence position.

all_comments_split = {
    cid: shrink_delimiters(remove_emoji(remove_punc(text)))
    for cid, text in all_comments_retrieved.items()
}

In [None]:
# Load labelling info
# labelling_info.csv is read with the pandas defaults; its
# `actual_comment_ids` column is used further down to attach a sentence to
# every label row.

labels_df = pd.read_csv('labelling_info.csv')

In [None]:
# Preview the ID column and the attribution columns of the labelling info.
labels_df[["actual_comment_ids","attribution","primary_attribution","secondary_attribution","multiple_attribution_1"]].head()

In [None]:
# Add sentences with labelling info to make the dataset
#
# `actual_comment_ids` is read as "<comment id>.<sentence index>": everything
# before the last dot identifies the comment and the trailing integer picks
# one entry of that comment's split sentences.

sentences = []

for sentence_key in labels_df.actual_comment_ids:
    comment_id, _, index_text = sentence_key.rpartition(".")
    sentence_index = int(index_text)

    # Rows whose comment could not be retrieved get NaN so they can be
    # counted and dropped below.
    sentences.append(
        all_comments_split[comment_id][sentence_index]
        if comment_id in all_comments_split
        else np.nan
    )

labels_df["sentence"] = sentences

missing_count = labels_df["sentence"].isna().sum()
print(f"{missing_count} sentences were not found")
labels_df = labels_df.dropna(subset=["sentence"])

In [None]:
# Preview the label rows together with the sentence attached to each of them.
labels_df[["actual_comment_ids","sentence","attribution","primary_attribution","secondary_attribution","multiple_attribution_1"]].head()

In [None]:
# Keep information about all topics in a comment.
#
# For every comment, collect the non-missing values of its attribution
# columns into one set, then map each of the comment's sentences that has
# attribution == 0 to that set. All such sentences of a comment share the
# same set object.

topics_sets = {}

topic_columns = ['primary_attribution', 'secondary_attribution', 'multiple_attribution_1', 'other_attribution']

# One pass over the frame: groupby hands over the rows of each comment
# instead of re-scanning the whole frame for every comment ID. sort=False
# keeps the comments in order of first appearance, so the result is
# deterministic when the same sentence text occurs in several comments.
for _, comment_rows in labels_df.groupby("comment_id", sort=False):
    # Missing cells are float NaN; isinstance also catches numpy float
    # scalars, which an exact `type(x) == float` comparison lets through.
    temp_topic_set = {
        value
        for value in chain.from_iterable(comment_rows[topic_columns].values)
        if not isinstance(value, float)
    }
    for sent in comment_rows.loc[comment_rows.attribution == 0, "sentence"]:
        topics_sets[sent] = temp_topic_set

### Step 3 

1. Chain consecutive sentences of the same comment that have the same attribution factor.
2. Save labels for testing. These labels are later used to measure model performance.

In [None]:
# Build the chains with make_comments (from utils); presumably it merges
# consecutive sentences of a comment that share the same attributed factor
# -- TODO confirm against utils.

chains = make_comments(labels_df)

# Discard chains whose text (first element) has fewer than 4
# whitespace-separated words.

min_words = 4
chains_final = [entry for entry in chains if not len(entry[0].split()) < min_words]

In [None]:
#chains_final

In [None]:
# Create final dataframe, add negative labels, drop dupes
# Each entry of chains_final becomes one row with the three columns named
# below.

chains_df = pd.DataFrame(data=chains_final, columns=['sentences','attribution','attrib_words'])

#chains_df.head()
# replace_no_attrib comes from utils; presumably it fills in the labels of
# chains without an attribution using topics_sets -- TODO confirm.
chains_df_replaced = replace_no_attrib(chains_df, topics_sets)
# keep=False removes every row that has a duplicate, not only the repeats.
chains_df_replaced.drop_duplicates(inplace=True, keep=False)

# NOTE(review): this prints the row count of chains_df, not of
# chains_df_replaced (the frame that is saved later) -- confirm that this is
# the intended number.
print("Total number of chains", chains_df.shape[0])

In [None]:
# print(chains_df[chains_df.attribution == 1].shape) 
# print(chains_df[chains_df.attribution == 0].shape) 

from collections import defaultdict

# For every chain with attribution == 1, gather its attribution words under
# the chain's sentence text; a text that occurs several times accumulates
# one list entry per occurrence.
topic_sanity = defaultdict(list)

positive_chains = chains_df_replaced[chains_df_replaced.attribution == 1]
for sentence_text, words in zip(positive_chains.sentences, positive_chains.attrib_words):
    topic_sanity[sentence_text].append(words)
    

### Step 4 

Save the label info and the final dataset

In [None]:
# Write the final dataset without the index column.
chains_df_replaced.to_csv("final_dataset.csv", index=False)

# Write the sentence -> attribution-words mapping as indented JSON.
serialized_topics = json.dumps(topic_sanity, indent=4)
with open('topic_sanity.json', 'w') as fp:
    fp.write(serialized_topics)