In [14]:
import bz2, os, sys, glob
import json, csv, re, datetime
import pickle
from joblib import Parallel, delayed
from tqdm import tqdm

### Dump daily tweet data into pickle files: vid -> tweetCounts

In [2]:
def bz2_csv_rows(fp):
    """Lazily yield the comma-separated fields of every line in a bz2 text file.

    Args:
        fp: path (or file object) accepted by ``bz2.open``.

    Yields:
        A list of strings per line, produced by a plain ``str.split(',')``.
        Lines are not stripped, so the last field keeps the line terminator,
        and quoted commas are not treated specially.
    """
    with bz2.open(fp, mode='rt') as handle:
        # tqdm only wraps the iteration to show progress on large dumps.
        progress = tqdm(handle, desc="hugefile")
        yield from (text.split(',') for text in progress)
            
# time: 3945000it [02:47, 23483.87it/s]
# Size: 17.4MB
def read_write_file(infile, outfile, max_rows=None):
    """Count, per video id, how many times it is referenced in one daily dump.

    The resulting mapping has the form::

        {
            vid_1: tweetCounts,
            vid_2: tweetCounts,
            ...
        }

    Args:
        infile: path of the bz2-compressed, comma-separated dump; it is read
            through ``bz2_csv_rows``.
        outfile: path of the pickle to write the mapping to, or ``None`` to
            skip writing.
        max_rows: optional cap on the number of input rows to read, meant for
            quick trial runs. ``None`` (the default) reads the whole file.

    Returns:
        None. The mapping is only persisted to ``outfile``.
    """
    map_vid_tweetCounts = {}

    for row_index, row in enumerate(bz2_csv_rows(infile)):
        # Truncation is strictly opt-in so a real run never drops data.
        if max_rows is not None and row_index >= max_rows:
            break

        # ignore rate message rows
        if len(row) < 10:
            continue

        # Columns 7, 8 and 9 each hold a ';'-separated list of video ids
        # (presumably original / retweeted / quoted tweets -- confirm against
        # the dump schema); the literal 'N' marks "no ids" for that column.
        for field in row[7:10]:
            field = field.strip()
            if field == 'N':
                continue
            for vid in field.split(";"):
                # A stray or trailing ';' yields an empty token, which is not
                # a video id and must not be counted.
                if not vid:
                    continue
                map_vid_tweetCounts[vid] = map_vid_tweetCounts.get(vid, 0) + 1

    if outfile is not None:
        # Context manager guarantees the pickle is flushed and closed.
        with open(outfile, "wb") as out_fp:
            pickle.dump(map_vid_tweetCounts, out_fp)

In [3]:
def dirtodir_read_write(indir, outdir, date_range, n_jobs=5):
    """Convert every daily ``*.bz2`` dump in a date range into a pickle.

    Each input file is expected to be named ``YYYY-MM-DD.<ext>.bz2``; the part
    of the file name before the first dot is parsed as the date. Matching
    files are handed to ``read_write_file`` in parallel and written to
    ``<outdir>/<YYYY-MM-DD>.pik``.

    Args:
        indir: directory that is searched (non-recursively) for ``*.bz2``.
        outdir: directory receiving the ``.pik`` files; created if missing.
        date_range: ``(start, end)`` pair of ``YYYY-MM-DD`` strings, both
            inclusive.
        n_jobs: number of joblib workers to use.

    Raises:
        ValueError: if either bound of ``date_range`` is not ``YYYY-MM-DD``.
    """
    date_start = datetime.datetime.strptime(date_range[0], "%Y-%m-%d")
    date_end = datetime.datetime.strptime(date_range[1], "%Y-%m-%d")

    jobs = []
    # glob returns files in arbitrary order; sort so runs are reproducible.
    for infile in sorted(glob.glob(os.path.join(indir, "*.bz2"))):
        date_str = os.path.basename(infile).split(".")[0]
        try:
            date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            # Not a daily dump (name is not a date): leave it alone instead
            # of aborting the whole batch.
            continue
        if date_start <= date <= date_end:
            jobs.append((infile, os.path.join(outdir, date_str + ".pik")))

    if not jobs:
        return

    # An empty outdir means "current directory", which needs no creation.
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    # Every file in the range is processed, not just the first few.
    Parallel(n_jobs=n_jobs)(
        delayed(read_write_file)(infile, outfile) for infile, outfile in jobs
    )

In [4]:
# Input directory searched for the daily *.bz2 dumps, and the directory the
# per-day pickles are written to.
tweets_proc_dir = "/data4/u5941758/yt_tweets_2015_2019/tweet_stats"
output_dir = "./output"

# One input/output pair for trying a single file by hand.
# NOTE(review): the example pickle is named one day earlier than the input
# file -- confirm this offset is intended.
example_in_path, example_out_path = (
    os.path.join(tweets_proc_dir, "2016-07-01.bz2"),
    os.path.join(output_dir, "2016-06-30.pik"),
)

In [7]:
# Single-file trial run, kept for reference:
# read_write_file(example_in_path, example_out_path)

# Batch run over the dumps dated 2016-07-02 through 2016-09-01 (inclusive).
dirtodir_read_write(
    indir=tweets_proc_dir,
    outdir=output_dir,
    date_range=("2016-07-02", "2016-09-01"),
)

### Read the dumped daily data and merge it into the final output
#### Final output
```
# category.pik/json
{
    vid_1: {
        day_zero: 2016-06-30,
        days: [],
        tweets: []
    },
    vid_2: {
        day_zero: 2016-06-30,
        days: [],
        tweets: []
    }...
}
```

In [16]:
def read_vids(inpath):
    """Load a category -> video-id-set mapping from a CSV file.

    Each non-empty row is read as ``category, vid, vid, ...``: the first cell
    is the key and the remaining cells become a ``set``. Empty rows are
    skipped. If a category appears on several rows, the last row wins.

    Args:
        inpath: path of the UTF-8 encoded CSV file.

    Returns:
        dict mapping each category name to the set of ids on its row.
    """
    with open(inpath, 'r', encoding="utf-8") as handle:
        return {
            cells[0]: set(cells[1:])
            for cells in csv.reader(handle)
            if cells
        }

In [18]:
# CSV read by read_vids: each row is a category followed by its video ids.
engage_vids_path = "../data/engage16/filtered/vids_filter(all).csv"

# category -> set of video ids, loaded once for the cells below.
map_category_vids = read_vids(inpath=engage_vids_path)

In [None]:
def find_category(vid, map_category_vids):
    """Return the first category whose id collection contains ``vid``.

    Categories are checked in the mapping's iteration order.

    Args:
        vid: the video id to look up.
        map_category_vids: mapping of category -> collection of video ids.

    Returns:
        The matching category key, or ``None`` when no category holds ``vid``.
    """
    matches = (
        name
        for name, members in map_category_vids.items()
        if vid in members
    )
    return next(matches, None)

def build_final(indir, vids_path):
    dataset = dict()
    
    # engage vids
    map_category_vids = read_vids(vids_path)
    
    for infile in glob.glob(indir + "/*.bz2"):
        date_str = os.path.split(infile)[1].split(".")[0]
        date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        date_aligned = date - datetime.timedelta(days=1)
        
        daily_tweets = pickle.load(open(infile, "rb"))
        for vid, tweetCounts in daily_tweets.items():
            cat = find_category(vid, map_category_vids) 
            if cat is None: continue
            if cat not in dataset:
                dataset[cat] = dict()
            else:
                # TODO