In [None]:
# Install RAPIDS (takes ~10 min): clone the rapidsai-csp-utils helper repository
# and run its Colab install script for version 0.18.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh 0.18

import sys, os

# Insert the Python 3.7 site-packages directory into sys.path right before
# dist-packages, so modules found in site-packages take precedence.
# NOTE(review): .index() raises ValueError if the dist-packages entry is missing from sys.path.
dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
sys.path
# Run the helper script from the cloned repository inside this notebook's global namespace.
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

# Reference (NVTabular RecSys2020 example notebook):
# https://github.com/NVIDIA/NVTabular/blob/main/examples/winning-solution-recsys2020-twitter/01-02-04-Download-Convert-ETL-with-NVTabular-Training-with-XGBoost.ipynb

In [None]:
# Needed to fix conda and install nvtabular: install the conda 4.9.2 (py37) package
# from its tarball URL, then install NVTabular from the main branch of its GitHub repo.
!conda install https://repo.anaconda.com/pkgs/main/linux-64/conda-4.9.2-py37h06a4308_0.tar.bz2
!pip install git+https://github.com/NVIDIA/NVTabular.git@main

In [None]:
# For rapidsai 0.19 ONLY, not working.
# The shell commands below are wrapped in a string literal, so running this cell
# executes none of them; they are kept for reference only.
"""
!sudo add-apt-repository ppa:ubuntu-toolchain-r/test

!sudo apt-get update

!sudo apt-get install gcc-4.9

!sudo apt-get upgrade libstdc++6
!sudo apt-get dist-upgrade
!strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX"""

In [None]:
# External Dependencies
import time
import glob
import gc

import cupy as cp          # CuPy is an implementation of NumPy-compatible multi-dimensional array on GPU
import cudf                # cuDF is an implementation of Pandas-like Dataframe on GPU
import rmm                 # library for pre-allocating memory on GPU
import dask                # dask is an open-source library to natively scale Python on multiple workers/nodes
import dask_cudf           # dask_cudf uses dask to scale cuDF dataframes on multiple workers/nodes

import numpy as np
# NVTabular is the core library we use here for feature engineering/preprocessing on GPU
import nvtabular as nvt
import xgboost as xgb

# More dask / dask_cluster related libraries to scale NVTabular
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from dask.distributed import wait
from dask.utils import parse_bytes
from dask.delayed import delayed
from nvtabular.utils import device_mem_size
from nvtabular.column_group import ColumnGroup

In [None]:
# Print the GPU status table reported by the NVIDIA driver for this runtime.
!nvidia-smi

Sat Apr 10 17:48:25 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    28W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive under /content/drive; the dataset is assumed to live in MyDrive/RecSys2021.
from google.colab import drive
drive.mount('/content/drive')

# Directory from which the dataset part files are read.
BASE_DIR = '/content/drive/MyDrive/RecSys2021/original_dataset'

Mounted at /content/drive


In [None]:
# One BIG loop

import sys
import json
import shutil

gc.collect()
# Remove the local scratch directories left behind by a previous run, if any.
for stale_dir in ('preprocess_out_1/', 'preprocess_out_2/'):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

# Column names of the dataset files, in file order (passed as `names=` when the
# headerless part files are loaded).
features = [
    # --- Tweet features ---
    'text_tokens',
    'hashtags',
    'tweet_id',
    'media',
    'links',
    'domains',
    'tweet_type',
    'language',
    'timestamp',
    # --- Engaged-with user features ---
    'a_user_id',
    'a_follower_count',
    'a_following_count',
    'a_is_verified',
    'a_account_creation',
    # --- Engaging user features ---
    'b_user_id',
    'b_follower_count',
    'b_following_count',
    'b_is_verified',
    'b_account_creation',
    # --- Engagement features ---
    'b_follows_a',
    'reply',          # Target: reply
    'retweet',        # Target: retweet
    'retweet_comment',# Target: retweet with comment
    'like',           # Target: like
]


# One lookup table per categorical feature. Besides the raw-value -> integer-code
# entries, each table carries a 'd_size' entry that starts at 0.
tweet_id_dict = {'d_size': 0}
media_dict = {'d_size': 0}
hashtag_dict = {'d_size': 0}
link_dict = {'d_size': 0}
domain_dict = {'d_size': 0}
lang_dict = {'d_size': 0}
tweet_type_dict = {'d_size': 0}
user_dict = {'d_size': 0}


# Check if value already mapped, if not map it
def _assign_code(el, mapping):
    """Return the integer code for ``el``, registering it in ``mapping`` when needed.

    Args:
        el: Raw string value. A value made of exactly two pieces separated by
            ``'$$$$'`` carries an already-assigned code in its second piece.
        mapping: Lookup table to update; its ``'d_size'`` entry holds the next
            code to hand out.

    Returns:
        The code embedded in ``el`` if it is tagged, otherwise the code newly
        assigned to ``el`` (``mapping`` is updated in place in that case).

    Raises:
        ValueError: if ``el`` is tagged but its second piece is not an integer.
    """
    pieces = el.split('$$$$')
    if len(pieces) == 2:
        # Tagged value: reuse the embedded code and leave the table untouched.
        return int(pieces[1])
    code = mapping['d_size']
    mapping[el] = code
    mapping['d_size'] = code + 1
    return code


def add_dict_media(el):
    """Assign/return the code for a media value using ``media_dict``."""
    return _assign_code(el, media_dict)


def add_dict_hashtag(el):
    """Assign/return the code for a hashtag value using ``hashtag_dict``."""
    return _assign_code(el, hashtag_dict)


def add_dict_link(el):
    """Assign/return the code for a link value using ``link_dict``."""
    return _assign_code(el, link_dict)


def add_dict_domain(el):
    """Assign/return the code for a domain value using ``domain_dict``."""
    return _assign_code(el, domain_dict)


def add_dict_lang(el):
    """Assign/return the code for a language value using ``lang_dict``."""
    return _assign_code(el, lang_dict)


def add_dict_tweet_type(el):
    """Assign/return the code for a tweet type value using ``tweet_type_dict``."""
    return _assign_code(el, tweet_type_dict)


def add_dict_tweet_id(el):
    """Assign/return the code for a tweet id using ``tweet_id_dict``."""
    return _assign_code(el, tweet_id_dict)


def add_dict_user(el):
    """Assign/return the code for a user id using ``user_dict``."""
    return _assign_code(el, user_dict)

# Change this function to map other features.
# Note: only tweet_id needs the else branch; the other features can be mapped in one shot.
def simple_map(df):
    """Encode the ``tweet_id`` column of one partition and return the partition.

    Reads the module-level ``j`` and ``end_dict``:

    * ``j < end_dict``: every id is replaced by its code from ``tweet_id_dict``;
      ids not in the table go through ``add_dict_tweet_id``.
    * otherwise: ids found in the table are rewritten as ``'value$$$$<code>'``
      and all other ids are left as they are (the table is not modified).
    """
    def _encode(raw_id):
        if raw_id in tweet_id_dict:
            return tweet_id_dict[raw_id]
        return add_dict_tweet_id(raw_id)

    def _tag_if_known(raw_id):
        # When dictionary is full, don't add mappings, just replace if label already mapped
        if raw_id in tweet_id_dict:
            return str('value$$$$' + str(tweet_id_dict[raw_id]))
        return raw_id

    mapper = _encode if (j < end_dict) else _tag_if_known
    df['tweet_id'] = df['tweet_id'].map(mapper)
    #df['media'] = df['media'].map(lambda x: media_dict[x] if x in media_dict else add_dict_media(x))
    #df['hashtags'] = df['hashtags'].map(lambda x: hashtag_dict[x] if x in hashtag_dict else add_dict_hashtag(x))
    #df['links'] = df['links'].map(lambda x:link_dict[x] if x in link_dict else add_dict_link(x))
    #df['domains'] = df['domains'].map(lambda x: domain_dict[x] if x in domain_dict else add_dict_domain(x))
    #df['language'] = df['language'].map(lambda x: lang_dict[x] if x in lang_dict else add_dict_lang(x))
    #df['tweet_type'] = df['tweet_type'].map(lambda x: tweet_type_dict[x] if x in tweet_type_dict else add_dict_tweet_type(x))
    #df['a_user_id'] = df['a_user_id'].map(lambda x: user_dict[x] if x in user_dict else add_dict_user(x))
    #df['b_user_id'] = df['b_user_id'].map(lambda x: user_dict[x] if x in user_dict else add_dict_user(x))
    return df


# GPU pipeline
# Splits the entries in media by \t and keeps only the first two values (if available)
def splitmedia(col):
    """Join the first two tab-separated entries of each value with ``'_'``.

    Args:
        col: String column exposing the pandas-style ``.str`` accessor.

    Returns:
        ``col`` unchanged when it has no rows; otherwise a column of
        ``'<first>_<second>'`` strings, where a missing first or second entry
        contributes an empty string.
    """
    if col.shape[0] == 0:
        return col
    # Split once and reuse the result for both pieces.
    pieces = col.str.split('\t', expand=True)
    first = pieces[0].fillna('')
    # If no row contains a tab, the expanded frame has only column 0 and
    # indexing column 1 would raise KeyError; treat the second entry as empty.
    second = pieces[1].fillna('') if 1 in pieces.columns else ''
    return first + '_' + second

# Counts the number of tokens in each value of a column (e.g. how many hashtags are in a tweet)
def count_token(col,token):
    """Count separator-delimited entries per row; null rows count as 0.

    For a non-null value the result is the number of ``token`` matches plus one.
    """
    present = col.notnull()
    entries = col.str.count(token) + 1
    # Multiplying by the boolean mask leaves null rows null; they are then filled with 0.
    return (entries * present).fillna(0)

# >> is an overloaded operator: it pipes a group of columns through a function or
# NVTabular op, producing new columns.

# Token counts for the four tab-separated columns, emitted with a '_count_t' postfix.
count_features = (
    nvt.ColumnGroup(['hashtags', 'domains', 'links', 'media']) >> (lambda col: count_token(col,'\t')) >> nvt.ops.Rename(postfix = '_count_t')
)

# Disabled: media split into its first two entries (see splitmedia).
#split_media = nvt.ColumnGroup(['media']) >> (lambda col: splitmedia(col))

# Dataset does not fit in memory, cannot use handy Categorify :(
# The Categorify-based graph is therefore kept only as a disabled string literal.
multihot_filled = ['media', 'hashtags', 'domains', 'links'] >> nvt.ops.FillMissing()
"""cat_features = (
    split_media + multihot_filled + ['language', 'tweet_type', 'tweet_id', 'a_user_id', 'b_user_id'] >> 
    nvt.ops.Categorify()
)"""

# Target columns: missing values are filled, then each is also turned into an
# int8 flag (value > 0) emitted with an '_engagement' postfix.
LABEL_COLUMNS = ['reply', 'retweet', 'retweet_comment', 'like'] 
label_name_feature = LABEL_COLUMNS >> nvt.ops.FillMissing()
labels = label_name_feature >> (lambda col: (col>0).astype('int8')) >> nvt.ops.Rename(postfix = '_engagement')

# Weekday of the timestamp (interpreted as seconds), emitted as 'timestamp_wd'.
weekday = (
    nvt.ColumnGroup(['timestamp']) >> 
    (lambda col: cudf.to_datetime(col, unit='s').dt.weekday) >> 
    nvt.ops.Rename(postfix = '_wd')
)

# Hour / minute / second of the timestamp (cast to int32, interpreted as seconds).
# NOTE(review): the name `datetime` would shadow the stdlib module if it were imported.
datetime = nvt.ColumnGroup(['timestamp']) >> (lambda col: cudf.to_datetime(col.astype('int32'), unit='s'))
hour = datetime >> (lambda col: col.dt.hour) >> nvt.ops.Rename(postfix = '_hour')
minute = datetime >> (lambda col: col.dt.minute) >> nvt.ops.Rename(postfix = '_minute')
seconds = datetime >> (lambda col: col.dt.second) >> nvt.ops.Rename(postfix = '_second')

# Everything the workflow computes or passes through explicitly.
output = ['language', 'tweet_type', 'tweet_id', 'a_user_id', 'b_user_id']+multihot_filled+count_features+label_name_feature+weekday+labels+hour+minute+seconds

# Input columns not already listed in `output`; 'text_tokens' is left out.
remaining_columns = [x for x in features if x not in (output.columns+['text_tokens'])]

# Workflow that is fitted and applied to the raw part files.
proc = nvt.Workflow(output+remaining_columns)


# Multi-pass encoding of tweet_id. Each entry of `parts` is the file index at which
# a pass stops adding new ids to tweet_id_dict; the dictionary is saved to Drive and
# reset after every pass, and partially mapped files are handed to the next pass.
# For all features except tweet_id, use parts = [253] (one shot)
BASE_DIR = '/content/drive/MyDrive/RecSys2021/original_dataset'
parts = [54, 122, 189, 253]
#parts = [253]
first_run = True
old_s = 0
for pt in parts:
    # Log messages are in Italian: "Starting pass from <old_s>" / "New BASE_DIR: <dir>".
    print('Inizio passaggio da ' + str(old_s))
    print('Nuova BASE_DIR: ' + BASE_DIR)
    # dir1 receives fully mapped files; dir2 receives partially mapped files and
    # becomes BASE_DIR for the next pass.
    dir1 = 'preprocess_split_final'
    dir2 = 'preprocess_split_final_tmp_' + str(pt)
    ends = []
    i = old_s
    end_dict = pt
    end = parts[-1]
    while (i < end):
        # `j` and `end_dict` are module-level names that simple_map reads.
        j = i
        data_parts = []
        ends = []
        # Select one part: build the zero-padded suffix of the file with index j
        for j in range(j,j+1):
            if (j<10):
                ends.append('0000' + str(j))
            elif (j<100):
                ends.append('000' + str(j))
            else:
                ends.append('00' + str(j))
        print(ends)
        for file in os.listdir(BASE_DIR):
            if file.endswith(tuple(ends)):
                data_parts.append(os.path.join(BASE_DIR, file))

        # Preprocessing defined before only at first run on first feature
        # (i.e. during the whole first pass, while the input is still the raw files)
        if (old_s == 0 and first_run):
            trains_itrs = nvt.Dataset(data_parts, 
                                      header=None, 
                                      names=features, 
                                      engine='csv', 
                                      sep='\x01', 
                                      part_size='2GB')

            time_preproc_start = time.time()
            proc.fit(trains_itrs)
            time_preproc = time.time()-time_preproc_start
            time_preproc

            # We define the output datatypes for continuous columns to save memory. We can define the output datatypes as a dict and parse it to the to_parquet function
            dict_dtypes = {}
            for col in LABEL_COLUMNS + ['timestamp', 'a_follower_count', 
                                        'a_following_count', 'a_account_creation',
                                        'b_follower_count', 'b_following_count', 'b_account_creation']:
                dict_dtypes[col] = np.uint32

            time_preproc_start = time.time()
            proc.transform(trains_itrs).to_parquet(output_path='preprocess_out_1/', dtypes=dict_dtypes)
            time_preproc += time.time()-time_preproc_start
            time_preproc


        # Apply mappings with CPU: first pass reads the freshly preprocessed output,
        # later passes read the partially mapped files directly.
        if (old_s == 0 and first_run):
            df = dask.dataframe.read_parquet('preprocess_out_1/*.parquet')
        else:
            df = dask.dataframe.read_parquet(data_parts)
        if 'text_tokens' in list(df.columns):
            df = df.drop('text_tokens', axis=1)

        df2 = df.map_partitions(simple_map, meta=df)
        df2.to_parquet('preprocess_out_2/')


        # Final encoding in parquet using GPU, saves more space
        # Read the mapped output once only to learn its column names.
        df_tmp = cudf.read_parquet('preprocess_out_2/*.parquet')
        all_input_columns = df_tmp.columns
        #print(all_input_columns)
        del df_tmp
        gc.collect()

        datetime = nvt.ColumnGroup(['timestamp']) >> (lambda col: cudf.to_datetime(col.astype('int32'), unit='s'))
        hour = datetime >> (lambda col: col.dt.hour) >> nvt.ops.Rename(postfix = '_hour')
        minute = datetime >> (lambda col: col.dt.minute) >> nvt.ops.Rename(postfix = '_minute')
        seconds = datetime >> (lambda col: col.dt.second) >> nvt.ops.Rename(postfix = '_second')
        output = hour+minute+seconds
        (output).graph
        remaining_columns = [x for x in all_input_columns if x not in (output.columns+['text_tokens'])]

        dict_dtypes = {}
        for col in LABEL_COLUMNS + ['tweet_id', 'timestamp', 'a_follower_count', 
                                'a_following_count', 'a_account_creation',
                                'b_follower_count', 'b_following_count', 'b_account_creation']:
            dict_dtypes[col] = np.uint32
        for col in ['reply_engagement', 'retweet_engagement', 'retweet_comment_engagement', 'like_engagement']:
            dict_dtypes[col] = np.uint8

        # We initialize our NVTabular workflow and add the "remaining" columns to it
        proc2 = nvt.Workflow(output+remaining_columns)

        train_dataset = nvt.Dataset(glob.glob('preprocess_out_2/*.parquet'), 
                                    engine='parquet', 
                                    part_size="2GB")

        proc2.fit(train_dataset)
        # If the dictionary is too big for RAM, more passes are needed, so partially mapped files are saved in temporary folders
        if (j<end_dict):
            proc2.transform(train_dataset).to_parquet(output_path='/content/drive/MyDrive/RecSys2021/' + dir1, dtypes=dict_dtypes)
        else:
            # No dtypes here: tweet_id may still hold unmapped or tagged string values.
            proc2.transform(train_dataset).to_parquet(output_path='/content/drive/MyDrive/RecSys2021/' + dir2)

        if (os.path.isdir('preprocess_out_1/')):
            shutil.rmtree('preprocess_out_1/')
        if (os.path.isdir('preprocess_out_2/')): 
            shutil.rmtree('preprocess_out_2/')

        # Fix: the original referenced `gc.collect` without calling it, so no
        # collection was ever triggered at the end of an iteration.
        gc.collect()
        i = i + 1

    # Italian: "Finished pass from <old_s>".
    print('Finito passaggio da ' + str(old_s))
    # Save dictionary (streamed chunk by chunk instead of building one big string)
    with open('/content/drive/MyDrive/RecSys2021/dictionaries/tweet_id_dict_' + str(pt), 'w') as f:
        for chunk in json.JSONEncoder().iterencode(tweet_id_dict):
            f.write(chunk)
    # Reset dictionary, keeping the code counter so later passes keep issuing new codes
    # NOTE(review): `old_size + 1` skips one code at every pass boundary — confirm this is intended.
    old_size = tweet_id_dict['d_size']
    tweet_id_dict = {}
    tweet_id_dict['d_size'] = old_size + 1

    # If needed, initialize next run
    old_s = end_dict
    BASE_DIR = '/content/drive/MyDrive/RecSys2021/' + dir2
    j = end_dict
    # Rename files in temporary folder sequentially (extension dropped), starting at
    # index end_dict, so the next pass can select them by their zero-padded suffix
    if (end_dict != parts[-1]):
        for file in os.listdir(BASE_DIR):
            if file.endswith('.parquet'):
                if (j<10):
                    new_t = '0000' + str(j)
                    new = os.path.join(BASE_DIR, new_t)
                elif (j<100):
                    new_t = '000' + str(j)
                    new = os.path.join(BASE_DIR, new_t)
                else:
                    new_t = '00' + str(j)
                    new = os.path.join(BASE_DIR, new_t)
                old = os.path.join(BASE_DIR, file)
                os.rename(old, new)
                j = j + 1

Inizio passaggio da 0
Nuova BASE_DIR: /content/drive/MyDrive/RecSys2021/original_dataset
['00000']
['00001']
['00002']
['00003']
['00004']
['00005']
['00006']
['00007']
['00008']
['00009']
['00010']
['00011']
['00012']
['00013']
['00014']
['00015']
['00016']
['00017']
['00018']
['00019']
['00020']
['00021']
['00022']
['00023']
['00024']
['00025']
['00026']
['00027']
['00028']
['00029']
['00030']
['00031']
['00032']
['00033']
['00034']
['00035']
['00036']
['00037']
['00038']
['00039']
['00040']
['00041']
['00042']
['00043']
['00044']
['00045']
['00046']
['00047']
['00048']
['00049']
['00050']
['00051']
['00052']
['00053']
['00054']
['00055']
['00056']
['00057']
['00058']
['00059']
['00060']
['00061']
['00062']
['00063']
['00064']
['00065']
['00066']
['00067']
['00068']
['00069']
['00070']
['00071']
['00072']
['00073']
['00074']
['00075']
['00076']
['00077']
['00078']
['00079']
['00080']
['00081']
['00082']
['00083']
['00084']
['00085']
['00086']
['00087']
['00088']
['00089']
['00090']
[

In [None]:
# Check: open one of the final parquet files and report how many tweet-id codes were issued
import pandas as pd
pd.set_option('display.max_rows', 1000)
df = dask.dataframe.read_parquet(
    '/content/drive/MyDrive/RecSys2021/preprocess_split_final/0.735b0fb3001b48018cb6c91437893999.parquet'
)
#df.head(1000)
print(tweet_id_dict['d_size'])


301421584


In [None]:
# Get real size of objects
def get_size(obj, seen=None):
    """Return the ``sys.getsizeof`` total of ``obj`` and everything reachable from it.

    ``seen`` is the set of object ids already counted; an object met again
    contributes 0, so shared and self-referential structures are counted once.
    Dicts contribute their values and keys, objects with a ``__dict__``
    contribute that dict, and other iterables (except str/bytes/bytearray)
    contribute their items.
    """
    if seen is None:
        seen = set()
    marker = id(obj)
    if marker in seen:
        return 0
    # Register the object before descending so cycles terminate.
    seen.add(marker)
    total = sys.getsizeof(obj)
    if isinstance(obj, dict):
        for value in obj.values():
            total += get_size(value, seen)
        for key in obj.keys():
            total += get_size(key, seen)
    elif hasattr(obj, '__dict__'):
        total += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        for item in obj:
            total += get_size(item, seen)
    return total

In [None]:
# To avoid OOM, stream the dictionary to file chunk by chunk instead of
# serialising it into a single string first
import json

with open('/content/drive/MyDrive/RecSys2021/dictionaries/tweet_id_dict_1', 'w') as f:
    f.writelines(json.JSONEncoder().iterencode(tweet_id_dict))

In [None]:
# To avoid OOM, read dictionary from file in parts.
# ijson is installed here and used below to parse the JSON file incrementally.
!pip install ijson

import ijson

def parse_json(json_filename, dic):
    """Fill ``dic`` from a JSON file parsed incrementally, and return it.

    Every parser event with a non-empty prefix is stored as
    ``dic[prefix] = value``; events with an empty prefix are ignored.
    """
    with open(json_filename, 'rb') as input_file:
        # load json iteratively
        for prefix, _event, value in ijson.parse(input_file):
            if not prefix:
                continue
            dic[prefix] = value
        return dic

# Rebuild the tweet-id lookup table from its saved JSON file, starting from an empty dict.
tweet_id_dict = parse_json('/content/drive/MyDrive/RecSys2021/dictionaries/tweet_id_dict_1', {})



In [None]:
# Rename the .parquet files in a folder to sequential, zero-padded names
# (00000.parquet, 00001.parquet, ...), in the order os.listdir returns them.
import os

# Fix: the original listed BASE_DIR + '_preprocess_split_final_pt2/' but built the
# old/new paths from BASE_DIR_2, which is never defined (NameError). The listed
# directory is now used for both.
# NOTE(review): confirm this is the folder that should be renamed.
BASE_DIR_2 = BASE_DIR + '_preprocess_split_final_pt2/'

j = 0
for file in os.listdir(BASE_DIR_2):
    if file.endswith('.parquet'):
        # Same padding scheme as the part-file suffixes used elsewhere in this notebook.
        if (j<10):
            new_t = '0000' + str(j) + '.parquet'
        elif (j<100):
            new_t = '000' + str(j) + '.parquet'
        else:
            new_t = '00' + str(j) + '.parquet'
        new = os.path.join(BASE_DIR_2, new_t)
        old = os.path.join(BASE_DIR_2, file)
        os.rename(old, new)
        j = j + 1