In [None]:
# This notebook requires NVIDIA RAPIDS to run.
# A container for running this notebook can be created using Docker on a Linux
# machine with the following command:
# docker run --gpus all -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --mount type=bind,source=/home/ubuntu/DATA_FOLDER,target=/rapids/notebooks/host     rapidsai/rapidsai:0.19-cuda11.0-runtime-ubuntu18.04-py3.7
# where DATA_FOLDER contains the folder structure created with the previous scripts.
# Jupyter is then running in the container on port 8888, and the notebook can be executed from there;
# DATA_FOLDER is bind-mounted to the ./host directory in the container.

import os, time
import json
from numba import cuda 
# Set according to the available GPUs (at least one)
# NOTE(review): this is assigned before cudf/cupy/dask_cuda are imported and
# before the cluster is created; presumably that ordering is required for the
# setting to take effect -- confirm before reordering this cell.
os.environ["CUDA_VISIBLE_DEVICES"]= "0,1,2,3"
# Wall-clock reference points taken at notebook start; neither is read again
# in the cells shown here.
start = time.time()
very_start = time.time()

#import pandas as pd, 
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.max_rows', 500)
import cudf, cupy, time, rmm

import dask as dask, dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
import subprocess

# Start a local dask_cuda cluster with default settings and attach a
# distributed client to it; creating the Client registers it as the scheduler
# used by the dask_cudf computations later in the notebook.
cluster = LocalCUDACluster()
client = Client(cluster)
#client = Client(processes=False)

In [None]:
!nvidia-smi

Sun Jun 20 08:28:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.03   Driver Version: 450.119.03   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:1B.0 Off |                    0 |
| N/A   39C    P0    26W /  70W |    201MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            On   | 00000000:00:1C.0 Off |                    0 |
| N/A   38C    P0    26W /  70W |    102MiB / 15109MiB |      0%      Default |
|       

In [None]:
class MTE_one_shot:
    """Smoothed mean target encoder computed with out-of-fold statistics.

    ``fit_transform`` encodes each training row using the target sum/count of
    its category taken from all *other* folds, smoothed towards the global
    target mean:

        TE = (sum_other_folds + smooth * mean) / (count_other_folds + smooth)

    ``transform`` encodes unseen data with the statistics of the whole
    training set (kept in ``self.agg_all``).

    Attributes set by ``fit_transform``: ``x_col``, ``y_col``, ``out_col``,
    ``mean``, ``agg_all`` (per-category encoding over all folds) and
    ``agg_each_fold`` (per-fold, per-category out-of-fold encoding).
    """

    def __init__(self, folds, smooth, seed=42, mode='gpu'):
        """Store the configuration and select the array/dataframe backend.

        Args:
            folds: number of folds used for the out-of-fold statistics.
            smooth: smoothing weight given to the global target mean.
            seed: value passed to the backend's ``random.seed`` on fit.
            mode: ``'gpu'`` selects cupy/cudf; any other value selects
                numpy/pandas.
        """
        self.folds = folds
        self.seed = seed
        self.smooth = smooth
        if mode=='gpu':
            self.np = cupy
            self.df = cudf
        else:
            self.np = np
            self.df = pd
        self.mode = mode

    def fit_transform(self, train, x_col, y_col, y_mean=None, out_col = None, out_dtype=None):
        """Fit the encoder on ``train`` and return ``train`` with the encoding added.

        Args:
            train: training frame (pandas, cudf or dask_cudf). If it has no
                ``'fold'`` column one is added to it in place.
            x_col: column name, or list of column names, to encode.
            y_col: target column name.
            y_mean: prior used for smoothing; defaults to ``train[y_col].mean()``.
            out_col: name of the encoded column; defaults to
                ``TE_<x_col joined by '_'>_<y_col>``.
            out_dtype: optional dtype the encoded column is cast to.

        Returns:
            The result of left-merging the out-of-fold encoding onto ``train``.
        """
        self.x_col = x_col
        self.y_col = y_col
        # The seed is set for reproducibility, although fold assignment below
        # is deterministic and does not draw random numbers.
        self.np.random.seed(self.seed)

        if 'fold' not in train.columns:
            # At least 1, so frames with fewer rows than folds do not trigger
            # a floor-division by zero.
            fsize = max(1, len(train)//self.folds)
            if isinstance(train,dask_cudf.core.DataFrame):
                # Running row counter -> contiguous blocks of `fsize` rows.
                train['fold'] = 1
                train['fold'] = train['fold'].cumsum()
                train['fold'] = train['fold']//fsize
                train['fold'] = train['fold']%self.folds
            else:
                # Fold ids are derived from the index values.
                train['fold'] = (train.index.values//fsize)%self.folds

        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        # Remembered so transform() can find the encoded column in agg_all.
        self.out_col = out_col

        if y_mean is None:
            y_mean = train[y_col].mean()
        self.mean = y_mean

        key_cols = [x_col] if isinstance(x_col,str) else list(x_col)
        fold_cols = ['fold'] + key_cols

        # Target count/sum per (fold, category).
        agg_each_fold = train.groupby(fold_cols).agg({y_col:['count','sum']}).reset_index()
        agg_each_fold.columns = fold_cols + ['count_y','sum_y']

        # Target count/sum per category over all folds.
        agg_all = agg_each_fold.groupby(x_col).agg({'count_y':'sum','sum_y':'sum'}).reset_index()
        agg_all.columns = key_cols + ['count_y_all','sum_y_all']

        # Out-of-fold statistics = totals minus the fold's own contribution.
        agg_each_fold = agg_each_fold.merge(agg_all,on=x_col,how='left')
        agg_each_fold['count_y_all'] = agg_each_fold['count_y_all'] - agg_each_fold['count_y']
        agg_each_fold['sum_y_all'] = agg_each_fold['sum_y_all'] - agg_each_fold['sum_y']
        agg_each_fold[out_col] = (agg_each_fold['sum_y_all']+self.smooth*self.mean)/(agg_each_fold['count_y_all']+self.smooth)
        agg_each_fold = agg_each_fold.drop(['count_y_all','count_y','sum_y_all','sum_y'],axis=1)

        agg_all[out_col] = (agg_all['sum_y_all']+self.smooth*self.mean)/(agg_all['count_y_all']+self.smooth)
        agg_all = agg_all.drop(['count_y_all','sum_y_all'],axis=1)
        self.agg_all = agg_all

        train = train.merge(agg_each_fold,on=fold_cols,how='left')
        self.agg_each_fold = agg_each_fold
        if self.mode=='gpu':
            # Turn NaNs into nulls so the fillna below replaces them as well.
            if isinstance(train,dask_cudf.core.DataFrame):
                train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
            else:
                train[out_col] = train[out_col].nans_to_nulls()
        train[out_col] = train[out_col].fillna(self.mean)

        if out_dtype is not None:
            train[out_col] = train[out_col].astype(out_dtype)

        return train

    def transform(self, test, x_col, out_col = None, out_dtype=None):
        """Encode ``test`` with the full-training-set statistics.

        Args:
            test: frame to encode; must contain ``x_col``.
            x_col: column name, or list of column names, used as merge key.
            out_col: name of the encoded column in the result; defaults to the
                name used by ``fit_transform``.
            out_dtype: optional dtype the encoded column is cast to.

        Returns:
            ``test`` left-merged with the encoding; categories not seen during
            fitting receive the prior ``self.mean``.
        """
        fitted_col = self.out_col
        if out_col is None:
            out_col = fitted_col
        agg_all = self.agg_all
        if out_col != fitted_col:
            # agg_all stores the encoding under the name chosen at fit time;
            # expose it under the requested name instead of failing later
            # with a KeyError.
            agg_all = agg_all.rename(columns={fitted_col: out_col})
        test = test.merge(agg_all,on=x_col,how='left')
        test[out_col] = test[out_col].fillna(self.mean)
        if out_dtype is not None:
            test[out_col] = test[out_col].astype(out_dtype)
        return test

In [None]:
import gc

def addMeta(meta, feature, n):
    """Register ``n`` numbered ``int32`` columns for ``feature`` in ``meta``.

    Adds the keys ``<feature>_1`` ... ``<feature>_<n>`` (each mapped to the
    string ``'int32'``) to ``meta`` in place and returns the same mapping.
    """
    meta.update(
        (feature + '_' + str(position), 'int32')
        for position in range(1, n + 1)
    )
    return meta

def splitListFeature(df, columns, max):
    """Expand list-valued columns into ``max`` fixed integer columns each.

    For every name in ``columns`` the lists stored in ``df[col]`` are truncated
    to at most ``max`` items (the truncated lists are written back to
    ``df[col]``) and spread over the new columns ``<col>_1`` ... ``<col>_<max>``
    as ``int32``; positions a list does not reach are filled with 0.

    Args:
        df: pandas DataFrame, modified in place.
        columns: iterable of list-valued column names.
        max: number of output columns per feature (the name shadows the
            builtin but is kept for backward compatibility).

    Returns:
        The same ``df`` object.
    """
    for col in columns:
        # Series.map instead of applymap (pandas Series has no applymap), and
        # keep `max` items rather than `max - 1` so every output column can be
        # populated.
        truncated = df[col].map(lambda x: x[:max])
        names = [col + '_' + str(i) for i in range(1, max + 1)]
        expanded = pd.DataFrame(
            truncated.tolist(),
            index=truncated.index, dtype=object
        )
        # Guarantee exactly `max` columns even when every list is shorter.
        expanded = expanded.reindex(columns=range(max)).fillna(0).astype('int32')
        expanded.columns = names
        df[col] = truncated
        df[names] = expanded
    return df

def convertTarget(df, t):
    """Binarise column ``t`` of ``df`` in place and return ``df``.

    Each element is mapped through ``applymap`` to 1 when it is greater than
    zero and to 0 otherwise.
    """
    def _is_positive(value):
        # Element-wise indicator: 1 for strictly positive values, else 0.
        if value > 0:
            return 1
        return 0

    df[t] = df[t].applymap(_is_positive)
    return df

# All the folder structure created by previous scripts is bind mounted in the folder ./host
BASE_DIR = './host/Preprocessed/Train/FeatureExtraction/Temp/Columns/Split_cols'
DICT_DIR_ALL = './host/Preprocessed/Train/FeatureExtraction/Dictionary/'
# os.listdir returns entries in arbitrary order; sort so that the composition
# of every chunk (and therefore the resulting dictionaries) is reproducible.
paths = sorted(os.path.join(BASE_DIR, f) for f in os.listdir(BASE_DIR) if 'parquet' in f)
dic_paths = dict(zip(range(1, len(paths) + 1), paths))
if not paths:
    # Without input there would be zero passes and the averaging below would
    # fail with an opaque ZeroDivisionError.
    raise FileNotFoundError('No parquet files found in ' + BASE_DIR)
# In case of out of memory errors, try to reduce n_chunks_one_shot
n_chunks_one_shot = 150

# For every target and every feature (or feature combination) a target-encoding
# dictionary is built: the encoding is computed independently on chunks of
# n_chunks_one_shot parquet files, the per-key encodings are averaged over the
# chunks in which the key appears, and the result is written as JSON to
# DICT_DIR_ALL/<out_col>. The key '$mean' holds the average of the chunk means.
for t in ['engagement_reply', 'engagement_retweet', 'engagement_comment', 'engagement_like']:
    # Modify here to create different TE dictionaries. Single feature or list of features are supported
    for c in [['mapped_engager_id','text_kpop_bool'], 'mapped_engager_id', ['mapped_engager_id','mapped_tweet_links_id_1'], 'number_of_photo', 'number_of_gif', 'number_of_video', 'mapped_tweet_type', 'mapped_language_id',
              'mapped_creator_id', 'mapped_tweet_links_id_1', 'mapped_tweet_links_id_2',
              'mapped_tweet_hashtags_id_1', 'mapped_tweet_hashtags_id_2', 'mapped_domains_id_1', 'mapped_domains_id_2',
              ['mapped_domains_id_1', 'mapped_language_id', 'engagement_creator_follows_engager', 'mapped_tweet_type', 'number_of_photo', 'creator_is_verified'],
              'tweet_links_count', 'tweet_domains_count', 'tweet_hashtags_count', 'tweet_hashtags_unique_count',
              ['mapped_engager_id','text_is_reply'], ['mapped_engager_id','text_nsfw_bool'],
              ['mapped_engager_id','text_covid_bool'], ['mapped_engager_id','text_sports_bool'],
              ]:
        out_col = f'TE_{c}_{t}'.replace('[', '(').replace(']', ')')
        flag = isinstance(c, list)
        # Built once per feature. Extending this list inside the chunk loop
        # would request the feature columns again on every pass.
        cols_to_load = [t + '_timestamp', 'tweet_timestamp'] + (c if flag else [c])
        n_passes = 0
        mean = 0
        dic_count = {}
        dic_all = {}
        for chunk_start in range(0, len(paths), n_chunks_one_shot):
            tmp_paths = paths[chunk_start:chunk_start + n_chunks_one_shot]
            # 1-based index of the last file in this chunk (used only for logging).
            i = chunk_start + len(tmp_paths)
            print('Feature:', out_col + '_' + str(i))
            start_chunk = time.time()
            n_passes = n_passes + 1
            train = dask_cudf.read_parquet(tmp_paths, columns=cols_to_load)

            # The target is derived from the engagement timestamp column.
            train = train.rename(columns={t + '_timestamp': t})
            meta = {k: train.dtypes[k] for k in train}
            train = train.map_partitions(convertTarget, t, meta=meta)

            encoder = MTE_one_shot(folds=5,smooth=20,mode='gpu')
            train = encoder.fit_transform(train, c, t, out_col=out_col, out_dtype='float32')

            # Only the per-key encoding over the whole chunk is materialised.
            dft = encoder.agg_all.compute().to_pandas()
            dft = dft.rename(columns={out_col: 'TE'})
            dft = dft.set_index(encoder.x_col, drop=True)

            o = 0  # keys already seen in a previous chunk
            l = 0  # keys seen for the first time
            # Index entries are tuples for multi-column features and scalars
            # for single-column ones; tolist() yields plain Python objects.
            for key, v in zip(dft.index.tolist(), dft['TE'].tolist()):
                if flag:
                    k = '_'.join(str(z) for z in list(key))
                else:
                    k = int(key)
                if k in dic_all:
                    o = o + 1
                    dic_all[k] = dic_all[k] + v
                    dic_count[k] = dic_count[k] + 1
                else:
                    l = l + 1
                    dic_all[k] = v
                    dic_count[k] = 1

            c_mean = encoder.mean.compute().astype('float32')
            # Accumulate as a Python float so the final value is always
            # JSON-serialisable, whatever NumPy's scalar promotion rules are.
            mean = mean + float(c_mean)
            print('Seen: ' + str(o) + ', New: ' + str(l))
            print('Total dict size:', len(dic_all))
            print('Average mean:', mean / n_passes)
            print('Chunk mean:', c_mean)
            print('Chunk time:', time.time() - start_chunk)
            print('##########')
            del dft
            del encoder
            del train
        # Average each key over the chunks in which it appeared.
        dic_all = {k: total / dic_count[k] for k, total in dic_all.items()}
        dic_all['$mean'] = mean / n_passes
        with open(DICT_DIR_ALL + out_col, 'w') as f:
            # json.dump streams the encoding to the file chunk by chunk.
            json.dump(dic_all, f)

Feature: TE_('mapped_engager_id', 'text_kpop_bool')_engagement_reply_150




Seen: 0, New: 29281190
Total dict size: 29281190
Average mean: 0.02672302909195423
Chunk mean: 0.02672303
Chunk time: 134.06127429008484
##########
Feature: TE_('mapped_engager_id', 'text_kpop_bool')_engagement_reply_300
Seen: 22547316, New: 5964335
Total dict size: 35245525
Average mean: 0.026832688599824905
Chunk mean: 0.026942348
Chunk time: 131.3266396522522
##########
Feature: TE_mapped_engager_id_engagement_reply_150
Seen: 0, New: 27806304
Total dict size: 27806304
Average mean: 0.02672302909195423
Chunk mean: 0.02672303
Chunk time: 61.31066846847534
##########
Feature: TE_mapped_engager_id_engagement_reply_300
Seen: 22007093, New: 5121960
Total dict size: 32928264
Average mean: 0.026832688599824905
Chunk mean: 0.026942348
Chunk time: 65.83862543106079
##########
Feature: TE_('mapped_engager_id', 'mapped_tweet_links_id_1')_engagement_reply_150
Seen: 0, New: 58675647
Total dict size: 58675647
Average mean: 0.02672302909195423
Chunk mean: 0.02672303
Chunk time: 233.9912188053131
##



Seen: 22733587, New: 6632789
Total dict size: 36833821
Average mean: 0.026832688599824905
Chunk mean: 0.026942348
Chunk time: 136.5160310268402
##########
Feature: TE_('mapped_engager_id', 'text_nsfw_bool')_engagement_reply_150
Seen: 0, New: 30472068
Total dict size: 30472068
Average mean: 0.02672302909195423
Chunk mean: 0.02672303
Chunk time: 134.37308526039124
##########
Feature: TE_('mapped_engager_id', 'text_nsfw_bool')_engagement_reply_300
Seen: 22800323, New: 6811648
Total dict size: 37283716
Average mean: 0.026832688599824905
Chunk mean: 0.026942348
Chunk time: 135.33019733428955
##########
Feature: TE_('mapped_engager_id', 'text_covid_bool')_engagement_reply_150
Seen: 0, New: 30439236
Total dict size: 30439236
Average mean: 0.02672302909195423
Chunk mean: 0.02672303
Chunk time: 134.2287561893463
##########
Feature: TE_('mapped_engager_id', 'text_covid_bool')_engagement_reply_300
Seen: 22816314, New: 6784907
Total dict size: 37224143
Average mean: 0.026832688599824905
Chunk mean



Seen: 20824889, New: 35104286
Total dict size: 93779933
Average mean: 0.08650819957256317
Chunk mean: 0.086608686
Chunk time: 248.33688378334045
##########
Feature: TE_number_of_photo_engagement_retweet_150
Seen: 0, New: 5
Total dict size: 5
Average mean: 0.08640771359205246
Chunk mean: 0.08640771
Chunk time: 19.21326780319214
##########
Feature: TE_number_of_photo_engagement_retweet_300
Seen: 5, New: 0
Total dict size: 5
Average mean: 0.08650819957256317
Chunk mean: 0.086608686
Chunk time: 11.267470359802246
##########
Feature: TE_number_of_gif_engagement_retweet_150
Seen: 0, New: 4
Total dict size: 4
Average mean: 0.08640771359205246
Chunk mean: 0.08640771
Chunk time: 10.815315246582031
##########
Feature: TE_number_of_gif_engagement_retweet_300
Seen: 3, New: 0
Total dict size: 4
Average mean: 0.08650819957256317
Chunk mean: 0.086608686
Chunk time: 10.674477577209473
##########
Feature: TE_number_of_video_engagement_retweet_150
Seen: 0, New: 5
Total dict size: 5
Average mean: 0.08640