## Import the necessary libraries

In [1]:
import os
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
import re
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load(disable = ["parser", "tagger", "ner"])

import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import statistics

## Helper Functions

### Label mapping

In [12]:
# Integer code assigned to each label string; the code is the label's
# position in the tuple below.
mapping = {
    label_name: label_code
    for label_code, label_name in enumerate(("false", "true", "unverified", "non-rumor"))
}

### Write Data

In [13]:
def write_jsonl(data, path):
    """Write every record of `data` to `path`, one JSON document per line.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content at `path` is replaced. Each record is followed by a newline.
    """
    with open(path, "w", encoding="UTF-8") as out_file:
        for record in data:
            serialized = json.dumps(record)
            out_file.write(serialized)
            out_file.write("\n")

### Clean line

In [15]:
def clean_line(text):
    """Strip list punctuation from a list that is stored as a string.

    Every "[", "]", "'" and newline character is removed, then leading and
    trailing whitespace is trimmed from what remains.
    """
    # One translation table deletes all four unwanted characters in a single pass.
    unwanted_chars = str.maketrans("", "", "[]'\n")

    return text.translate(unwanted_chars).strip()

### Read source tweets

In [18]:
def read_source_tweets(path):
    """Load tweet texts from a JSON-lines file.

    Each non-blank line of `path` is parsed as a JSON object. Its "id_str"
    field becomes the dictionary key and its "text" field the value. If the
    same id appears on several lines, the last one wins.

    Args:
        path: Location of the UTF-8 encoded JSON-lines file.

    Returns:
        dict mapping tweet id ("id_str") to tweet text ("text").

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "id_str" or "text".
    """
    data = {}

    # The context manager guarantees the file handle is closed, even on error.
    with open(path, "r", encoding="UTF-8") as tweets_file:

        for line in tweets_file:

            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            record = json.loads(line)
            data[record["id_str"]] = record["text"]

    return data

### Read labels

In [19]:
def read_labels(path):
    """Load the label of every tweet from a label file.

    Each non-blank line is split on ":"; the first field is the label name
    and the second field is the tweet id. The label name is converted to its
    integer code through the module-level `mapping` dictionary.

    Args:
        path: Location of the UTF-8 encoded label file.

    Returns:
        dict mapping tweet id (str) to integer label code.

    Raises:
        KeyError: If a label name is not a key of `mapping`.
        IndexError: If a non-blank line contains no ":" separator.
    """
    labels_dict = {}

    # The context manager guarantees the file handle is closed, even on error.
    with open(path, "r", encoding="UTF-8") as labels_file:

        for line in labels_file:

            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            line_array = line.split(":")

            tweet_id = line_array[1].strip()
            label = mapping[line_array[0].strip()]

            labels_dict[tweet_id] = label

    return labels_dict

### Bin each record into different time bins

In [20]:
def map_time_bins(time, size, interval):
    """Return the index of the time bin that a time delay falls into.

    `interval` is the width of each bin and `size` is the number of bins.
    The index is the floor of time / interval, clamped so that it is never
    above size - 1 (bins are numbered from 0) and never below 0.
    """
    last_bin = size - 1
    raw_bin = int(time // float(interval))

    # Cap at the last bin first, then floor at 0.
    return max(0, min(raw_bin, last_bin))

### Read data

In [8]:
def read_data(path):
    """Parse one tree file into its source claim and its child records.

    Every non-blank line is cleaned with `clean_line` and split on "->" into
    a parent part and a child part; each part is a comma-separated list. The
    child's first three fields are read as user id, tweet id and time delay.
    A line whose parent user id is "ROOT" describes the source claim; every
    other line is collected as a child record.

    Args:
        path: Location of the UTF-8 encoded tree file.

    Returns:
        A tuple (data_child_array, time_array, source_claim):
          * data_child_array: list of dicts with keys "user_id", "tweet_id"
            and "time_delay" (the delay is stored as the string form of a
            float) for every non-ROOT line.
          * time_array: list of float time delays for every line, including
            the ROOT line.
          * source_claim: dict with the same keys for the ROOT line (the last
            one if there are several), or None when the file has no ROOT line.

    Raises:
        IndexError: If a non-blank line lacks "->" or the child part has
            fewer than three fields.
        ValueError: If the time delay field is not a valid float.
    """
    data_child_array = []
    time_array = []

    # Stays None when no ROOT line exists, instead of leaving the name unbound.
    source_claim = None

    # The context manager guarantees the file handle is closed, even on error.
    with open(path, "r", encoding="UTF-8") as tree_file:

        for line in tree_file:

            line = clean_line(line)

            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line:
                continue

            line_array = line.split("->")

            parent_user_id = line_array[0].split(",")[0]

            child_array = line_array[1].split(",")
            child_user_id = child_array[0]
            child_tweet_id = child_array[1]

            delay = float(child_array[2])
            time_array.append(delay)

            record = {"user_id" : child_user_id, "tweet_id" : child_tweet_id, "time_delay" : str(delay)}

            if parent_user_id == "ROOT":
                source_claim = record
            else:
                data_child_array.append(record)

    return data_child_array, time_array, source_claim


### Processing the tweets

In [21]:
def process_tweets(id_, data_child_array, source_claim, source_tweets, labels_dict):
    """Attach tweet texts to a source claim and its child records.

    Args:
        id_: Key used to look up the label in `labels_dict`.
        data_child_array: Iterable of dicts with keys "user_id", "tweet_id"
            and "time_delay" (a string holding a float).
        source_claim: Dict with at least a "tweet_id" key. It is modified in
            place: a "tweet" key holding the claim's text is added.
        source_tweets: Dict mapping tweet id (str) to tweet text.
        labels_dict: Dict mapping `id_` to its label.

    Returns:
        A 7-tuple (label, source_claim, user_id_array, tweet_id_array,
        tweet_array, time_delay_array, missing_count). The four arrays are
        parallel lists covering only the children whose text was found;
        missing_count is the number of children that were skipped. If the
        text of the source claim itself cannot be resolved, all seven
        elements are None.

    Raises:
        KeyError: If `id_` is not present in `labels_dict`.
    """
    user_id_array = []
    tweet_id_array = []
    tweet_array = []
    time_delay_array = []

    missing_count = 0

    label = labels_dict[id_]

    # Getting the tweet of the source claim.
    # KeyError: missing "tweet_id" key or text not available;
    # TypeError: source_claim is not subscriptable by key (e.g. None).
    try:
        source_tweet_id = str(source_claim["tweet_id"]).strip()
        source_claim["tweet"] = source_tweets[source_tweet_id]

    except (KeyError, TypeError):

        return None, None, None, None, None, None, None

    for item in data_child_array:

        # All lookups/conversions happen before any append, so a failure
        # never leaves the parallel lists with different lengths.
        try:
            user_id = item["user_id"]
            tweet_id = str(item["tweet_id"]).strip()
            time_delay = float(item["time_delay"].strip())
            tweet = source_tweets[tweet_id]

        except (KeyError, TypeError, AttributeError, ValueError):
            # Malformed record or tweet text not available: count and skip.
            missing_count += 1
            continue

        user_id_array.append(user_id)
        tweet_id_array.append(tweet_id)
        tweet_array.append(tweet)
        time_delay_array.append(time_delay)

    # Internal invariant check; the message is a plain string so it shows up
    # in the AssertionError.
    assert len(user_id_array) == len(tweet_id_array) == len(tweet_array) == len(time_delay_array), \
        "Length of arrays DO NOT match"

    return label, source_claim, user_id_array, tweet_id_array, tweet_array, time_delay_array, missing_count
    

## Main

In [None]:
if __name__ == "__main__":

    # Compiles the raw tree files of each dataset into one JSON-lines file:
    # one record per claim that has at least one child tweet with known text.

    # NOTE(review): year_data and negative_delay_records are never filled in
    # this block; kept in case other cells of the notebook reference them.
    year_data = {}

    # The tweet-details path does not depend on the year, so the file is
    # parsed once here instead of once per loop iteration.
    source_tweets_path = "../data/controversy/raw_data/tweet_details.json"
    source_tweets = read_source_tweets(source_tweets_path)

    # To process data for each year (Twitter15, Twitter16)
    for YEAR in [15, 16]:

        missing_claims_count = 0

        negative_delay_records = []

        print("Processing year '{} now".format(YEAR))

        tweets_label_path = "../data/controversy/raw_data/twitter{}/label.txt".format(YEAR)
        raw_tree_folder = "../data/controversy/raw_data/twitter{}/tree".format(YEAR)

        output_file_path = "../data/controversy/processed_data/linear_structure/twitter{}/full_data/compiled_data.json".format(YEAR)

        labels = read_labels(tweets_label_path)

        compiled_data = []
        # Per-claim tweet counts and all time delays of the compiled claims.
        num_tweets = []
        delay_array = []

        for file in tqdm(os.listdir(raw_tree_folder)):

            # The tree file name (without ".txt") is the id used in the label file.
            id_ = file.replace(".txt", "")
            file = os.path.join(raw_tree_folder, file)

            data_child_array, time_array, source_claim = read_data(file)

            label, source_claim, user_id_array, tweet_id_array, tweet_array, time_delay_array, missing_count = process_tweets(id_, data_child_array, source_claim, source_tweets, labels)

            # process_tweets returns all-None when the source claim text is unavailable.
            if label is None:
                missing_claims_count += 1
                continue

            # Claims without any usable child tweet are left out of the output.
            if len(tweet_array) != 0 :

                compiled_data.append({"id_" : id_,
                                      "claim" : source_claim,
                                      "label" : label,
                                      "tweet_ids" : tweet_id_array,
                                      "tweets" : tweet_array,
                                      "time_delay" : time_delay_array})

                num_tweets.append(len(tweet_array))
                delay_array.extend(time_delay_array)

        print("There are {} missing claims in twitter {}".format(missing_claims_count, YEAR))
        print("There are {} claims in twitter {}".format(len(compiled_data), YEAR))

        # open(..., "w") does not create missing directories, so make sure the
        # output folder exists before writing.
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        write_jsonl(compiled_data, output_file_path)
        