## Import libraries

In [1]:
import os
import json
import glob
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import StratifiedShuffleSplit

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

import collections

import operator

## Helper functions

### Read index

In [2]:
def read_index(path):
    """Load an index file into a list with one entry per line.

    Args:
        path: Location of a UTF-8 encoded text file.

    Returns:
        list of str: The lines of the file in order, with every newline
        character removed. Blank lines are kept as empty strings.
    """
    # The context manager closes the handle as soon as reading finishes (or
    # fails); a bare open() in the loop header left that to the garbage collector.
    with open(path, "r", encoding = "UTF-8") as index_file:
        return [line.replace("\n", "") for line in index_file]

### Read data

In [3]:
def read_data(path):
    """Read a JSON-lines file (one JSON document per line).

    Args:
        path: Location of a UTF-8 encoded file.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # "with" guarantees the handle is closed even when a line fails to parse.
    with open(path, "r", encoding = "UTF-8") as data_file:
        # Whitespace-only lines (e.g. a trailing blank line) carry no record
        # and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in data_file if line.strip()]

### Get data

In [4]:
def get_data(full_data, index):
    """Keep the records whose "id_" value appears in ``index``.

    Args:
        full_data: Iterable of dict-like records, each with an "id_" key.
        index: Collection of wanted ids.

    Returns:
        list: The matching records, in the order they occur in ``full_data``.
    """
    # Build the lookup once: a set gives constant-time membership tests instead
    # of rescanning the whole index for every record.
    try:
        wanted_ids = set(index)
    except TypeError:
        # Unhashable index entries: fall back to the container as given.
        wanted_ids = index

    return [item for item in full_data if item["id_"] in wanted_ids]

### Map time delay 

In [5]:
def map_time_bins(time, interval = None, size = None):
    """Map a time delay onto a bin index in the range 0 .. size - 1.

    Args:
        time: The time delay to bin.
        interval: Width of one bin. Defaults to the module-level INTERVAL.
        size: Number of bins. Defaults to the module-level SIZE.

    Returns:
        int: ``time // interval`` clamped to ``[0, size - 1]``, so with
        size = 100 the possible indices are 0 .. 99.
    """
    # Fall back to the notebook-wide settings when no override is given, so
    # existing single-argument calls behave exactly as before.
    if interval is None:
        interval = INTERVAL
    if size is None:
        size = SIZE

    # Upper clamp: everything at or beyond the last bin lands in bin size - 1.
    bin_num = min(int(time // float(interval)), size - 1)

    # Lower clamp: negative delays land in bin 0.
    return max(0, bin_num)

### Get small data

In [6]:
def get_small_data(data, num_records = 32, num_tweets = 5):
    """Build a reduced copy of the data set for quick experiments.

    Args:
        data: List of dict records, each with "tweets" and "time_delay" lists.
        num_records: Maximum number of records to keep (taken from the front).
        num_tweets: Maximum number of entries kept in each record's "tweets"
            and "time_delay" lists.

    Returns:
        list of dict: New records; the records in ``data`` are left untouched.
    """
    small_data = []

    for record in data[:num_records]:

        # Shallow-copy the record before truncating. Assigning into the
        # original dict would also shorten the caller's full-size record,
        # because the sliced list still refers to the same dict objects.
        trimmed = dict(record)
        trimmed["tweets"] = record["tweets"][:num_tweets]
        trimmed["time_delay"] = record["time_delay"][:num_tweets]
        small_data.append(trimmed)

    return small_data

### Get time bins of unique posts in a claim 

In [4]:
def get_unique_posts_time_bin(data):
    """Deduplicate the posts of every record within each time-delay value.

    For each distinct value in a record's "time_delay" list, every tweet id
    seen with that value is emitted once. The text emitted for an id is the
    one stored at the id's first position in the whole record.

    Output order is deterministic: delay values in order of first appearance,
    and ids within a delay value in order of first appearance. Iterating over
    plain sets would make the order vary between interpreter runs for string
    ids. Relies on dicts preserving insertion order (Python 3.7+).

    Args:
        data: Iterable of dict records with the keys "id_", "label",
            "time_delay", "tweet_ids" and "tweets" (parallel lists).

    Returns:
        tuple: ``(new_data, new_data_tweets_length_lst)`` where ``new_data`` is
        a list of dicts with keys "id_", "label", "tweets" and "time_delay",
        and the second element holds the number of tweets kept per record.
    """
    new_data = []
    new_data_tweets_length_lst = []

    for record in data:

        time_delay = record["time_delay"]
        tweet_ids = record["tweet_ids"]
        tweets = record["tweets"]

        # id -> position of its first occurrence in the record
        first_position = {}
        # delay value -> ids seen with that delay (dict used as an ordered set)
        ids_per_delay = {}

        # One pass replaces the per-delay rescans and repeated list.index calls.
        for position in range(len(tweets)):
            id_ = tweet_ids[position]
            first_position.setdefault(id_, position)
            ids_per_delay.setdefault(time_delay[position], {})[id_] = None

        time_delay_new = []
        tweets_new = []

        for delay, ids in ids_per_delay.items():
            for id_ in ids:
                tweets_new.append(tweets[first_position[id_]])
                time_delay_new.append(delay)

        new_data.append({"id_" : record["id_"],
                         "label" : record["label"],
                         "tweets" : tweets_new,
                         "time_delay" : time_delay_new
                        })

        new_data_tweets_length_lst.append(len(tweets_new))

    return new_data, new_data_tweets_length_lst

### Get unique posts

In [5]:
def get_unique_posts(data):
    """Collapse each record to its unique posts, ordered by time and binned.

    For every record:
      * posts whose id equals the stripped source-claim id are dropped,
      * each remaining tweet id is kept once, with the text from its first
        occurrence and the smallest time delay recorded for that id,
      * the kept posts are sorted by that time delay; posts with equal delays
        keep their order of first appearance (deterministic, unlike iterating
        over a set of ids; relies on ordered dicts, Python 3.7+),
      * the source claim is placed first, and all delays are converted to bin
        indices with ``map_time_bins``.

    Args:
        data: Iterable of dict records with the keys "id_", "label", "claim"
            (a dict with "tweet_id", "tweet" and "time_delay"), "time_delay",
            "tweet_ids" and "tweets" (parallel lists).

    Returns:
        tuple: ``(new_data, new_data_tweets_length_lst)`` where ``new_data`` is
        a list of dicts with keys "id_", "label", "tweets" and "time_delay",
        and the second element holds the number of posts per record (source
        claim included).
    """
    new_data = []
    new_data_tweets_length_lst = []

    for record in data:

        source_claim = record["claim"]
        time_delay = record["time_delay"]
        tweet_ids = record["tweet_ids"]
        tweets = record["tweets"]

        source_id = source_claim["tweet_id"].strip()

        # id -> [earliest delay, position of first occurrence]; filled in one
        # pass instead of rescanning the whole record for every unique id.
        earliest = {}

        for position, id_ in enumerate(tweet_ids):

            if id_ == source_id:
                continue

            delay = time_delay[position]

            if id_ not in earliest:
                earliest[id_] = [delay, position]
            elif delay < earliest[id_][0]:
                earliest[id_][0] = delay

        # <--------- Sort according to time delay --------->
        # sorted() is stable, so equal delays keep first-appearance order.
        ordered = sorted(earliest.values(), key=lambda entry: entry[0])

        # <--------- Add in the source post --------->
        raw_delays = [float(source_claim["time_delay"])] + [entry[0] for entry in ordered]
        new_time_delay = [map_time_bins(time) for time in raw_delays]

        new_tweets = [source_claim["tweet"]] + [tweets[entry[1]] for entry in ordered]

        new_data.append({"id_" : record["id_"],
                         "label" : record["label"],
                         "tweets" : new_tweets,
                         "time_delay" : new_time_delay
                        })

        new_data_tweets_length_lst.append(len(new_tweets))

    return new_data, new_data_tweets_length_lst
        

### Change labels (claims that contain only the source post are relabelled as unverified)

In [6]:
def change_labels(data):
    """Relabel records that hold a single time delay as unverified (label 2).

    The records are modified in place and the same list is returned.

    Args:
        data: List of dict records with "time_delay" and "label" keys.

    Returns:
        list: ``data`` itself, after relabelling.
    """
    unverified = 2

    for position, entry in enumerate(data):

        # Skip anything that is not a single-post record, or is already unverified
        if len(entry["time_delay"]) != 1 or entry["label"] == unverified:
            continue

        entry["label"] = unverified
        data[position] = entry

    return data

### Split test data

In [7]:
def split_test_data(data):
    """Split the records into two label-stratified parts.

    Uses a single StratifiedShuffleSplit with ``test_size = 0.5`` and a fixed
    ``random_state`` of 0, so the same input always produces the same split.

    Args:
        data: Sequence of dict records, each with a "label" key.

    Returns:
        tuple: ``(data_1, data_2)``, the records selected by the splitter's
        first and second index arrays respectively.
    """
    positions = list(range(len(data)))
    labels = [record["label"] for record in data]

    sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.5, random_state = 0)

    # n_splits = 1, so take the single pair of index arrays directly rather
    # than returning from inside a loop.
    idx_1, idx_2 = next(sss.split(positions, labels))

    data_1 = [data[i] for i in idx_1]
    data_2 = [data[i] for i in idx_2]

    return data_1, data_2

### Write data

In [8]:
def write_data(data, path):
    """Write the items to ``path`` as JSON lines (one JSON document per line).

    Args:
        data: Iterable of JSON-serialisable items.
        path: Destination file; it is created or overwritten, UTF-8 encoded.
    """
    with open(path, "w", encoding = "UTF-8") as out_file:

        for entry in data:

            # Serialise first, then emit the document and its line terminator
            serialised = json.dumps(entry)
            out_file.write(serialised + "\n")

### main function

In [9]:
def main(data, split_num, year):
    """Build and write the train / test files for one split of one year.

    Steps: read the split's train and test index files, select the matching
    records from ``data``, reduce them to unique posts, relabel single-post
    claims, split the test set in two, and write the full and "small"
    variants of every set as JSON lines.

    Args:
        data: Compiled records for ``year`` (list of dicts).
        split_num: Split number used in the input and output file names.
        year: Twitter data set identifier used in the file names (e.g. 15).

    Raises:
        IndexError: If no index file matches the glob pattern for this split.
    """
    # Input file paths (the first glob match is used)
    train_index_path = glob.glob("../data/controversy/raw_data/split/*trainSet_Twitter{}_{}*".format(year, split_num))[0]
    test_index_path = glob.glob("../data/controversy/raw_data/split/*testSet_Twitter{}_{}*".format(year, split_num))[0]

    # All outputs of one split live in the same directory
    output_dir = "../data/controversy/processed_data/linear_structure/twitter{}/split_data/split_{}".format(year, split_num)

    # Output file paths
    train_data_output_path = output_dir + "/train_unique.json"
    test_data_1_output_path = output_dir + "/test_1_unique.json"
    test_data_2_output_path = output_dir + "/test_2_unique.json"

    # Output file paths (small)
    train_data_output_small_path = output_dir + "/train_small_unique.json"
    test_data_1_output_small_path = output_dir + "/test_1_small_unique.json"
    test_data_2_output_small_path = output_dir + "/test_2_small_unique.json"

    # Read data
    train_index = read_index(train_index_path)
    test_index = read_index(test_index_path)

    # get_data
    train_data = get_data(data, train_index)
    test_data = get_data(data, test_index)

    # Get unique posts
    train_data_unique, train_length_lst = get_unique_posts(train_data)
    test_data_unique, test_length_lst = get_unique_posts(test_data)

    # Each line reports the same statistic for both sets, matching its labels.
    print("Max training is: {}, Max testing is: {}".format(max(train_length_lst), max(test_length_lst)))
    print("Min training is: {}, Min testing is: {}".format(min(train_length_lst), min(test_length_lst)))
    print()

    # Changing the labels of the data
    train_data_unique = change_labels(train_data_unique)
    test_data_unique = change_labels(test_data_unique)

    print("Training labels: {}".format(collections.Counter([item["label"] for item in train_data_unique])))
    print("Testing labels: {}".format(collections.Counter([item["label"] for item in test_data_unique])))
    print()

    # Split test_data into 2 sets
    test_data_1, test_data_2 = split_test_data(test_data_unique)

    # write data
    write_data(train_data_unique, train_data_output_path)
    write_data(test_data_1, test_data_1_output_path)
    write_data(test_data_2, test_data_2_output_path)

    # write data (small) -- kept after the full-size writes on purpose
    write_data(get_small_data(train_data_unique), train_data_output_small_path)
    write_data(get_small_data(test_data_1), test_data_1_output_small_path)
    write_data(get_small_data(test_data_2), test_data_2_output_small_path)
    


## Main

In [13]:
if __name__ == "__main__":

    # Settings (SIZE and INTERVAL are read as globals by map_time_bins)
    SIZE = 100      # number of time bins; bin indices run from 0 to SIZE - 1
    INTERVAL = 10   # width of one time bin
    years = [15, 16]

    for year in tqdm(years):

        # The compiled records are loaded once per year and shared by all splits
        compiled_data_path = "../data/controversy/processed_data/linear_structure/twitter{}/full_data/compiled_data.json".format(year)
        compiled_data = read_data(compiled_data_path)

        # Splits are numbered 0 .. 4
        for split_num in tqdm(range(5)):
            print("Doing split {} for year {}".format(split_num, year))
            main(compiled_data, split_num, year)

            # Separator line followed by one empty line
            print("*" * 100, end = "\n\n")


