Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline  

import pandas as pd
import numpy as np
import gc
from tqdm.notebook import tqdm

# Import only the class this notebook needs; a wildcard import would flood the
# namespace and hide where BertTokenizer comes from.
from transformers import BertTokenizer

# Multilingual cased BERT vocabulary; do_lower_case=False so the tokenizer does
# not lowercase text.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [2]:
%%time
df = pd.read_csv('../input/training.tsv', sep='\x01', header=None, usecols=[0] )
df.columns = ['text_tokens']
df['id']   = np.arange( df.shape[0] )
df['id']   = df['id'].astype(np.uint32)
gc.collect()
df.head()

CPU times: user 7min 35s, sys: 39.7 s, total: 8min 15s
Wall time: 9min 36s


Unnamed: 0,text_tokens,id
0,101\t6417\t3410\t3398\t3184\t1909\t56910\t1683...,0
1,101\t14120\t131\t120\t120\t188\t119\t11170\t12...,1
2,101\t62342\t10858\t54439\t19571\t22480\t7831\t...,2
3,101\t58955\t10898\t103305\t1901\t16181\t7168\t...,3
4,101\t2435\t5656\t2594\t8279\t8623\t1925\t64126...,4


In [3]:
%%time
dv = pd.read_csv('../input/val.tsv', sep='\x01', header=None, usecols=[0] )
dv.columns = ['text_tokens']
dv['id']   = np.arange( df.shape[0] , df.shape[0]+dv.shape[0] )
dv['id']   = dv['id'].astype(np.uint32)
gc.collect()
dv.head()

CPU times: user 47.5 s, sys: 4.09 s, total: 51.6 s
Wall time: 57.5 s


Unnamed: 0,text_tokens,id
0,101\t47185\t10157\t100986\t10343\t55422\t119\t...,148075238
1,101\t6006\t5086\t1939\t7418\t3601\t6406\t1913\...,148075239
2,101\t56898\t137\t44851\t10317\t11490\t10112\t1...,148075240
3,101\t13497\t10437\t94005\t11161\t73632\t11067\...,148075241
4,101\t24781\t10152\t42041\t38268\t10301\t10798\...,148075242


In [4]:
%%time
dt = pd.read_csv('../input/competition_test.tsv', sep='\x01', header=None, usecols=[0] )
dt.columns = ['text_tokens']
dt['id']   = np.arange( df.shape[0]+dv.shape[0], df.shape[0]+dv.shape[0]+dt.shape[0] )
dt['id']   = dt['id'].astype(np.uint32)
gc.collect()
dt.head()

CPU times: user 36.6 s, sys: 2.46 s, total: 39 s
Wall time: 35.2 s


Unnamed: 0,text_tokens,id
0,101\t3100\t5477\t3028\t4348\t1924\t111806\t186...,163202922
1,101\t56898\t137\t36110\t10400\t168\t64062\t131...,163202923
2,101\t56898\t137\t179\t36816\t10775\t40546\t513...,163202924
3,101\t56898\t137\t22038\t40663\t12892\t45389\t1...,163202925
4,101\t56898\t137\t11699\t10174\t10738\t37816\t1...,163202926


In [10]:
%%time
df['tweet'] = [ tokenizer.decode( [ int(n) for n in t.split('\t') ] ) for t in tqdm(df.text_tokens.values) ] 
gc.collect()

HBox(children=(FloatProgress(value=0.0, max=148075238.0), HTML(value='')))


CPU times: user 1h 46min 30s, sys: 1min 9s, total: 1h 47min 40s
Wall time: 1h 46min 36s


63

In [5]:
%%time
dv['tweet'] = [ tokenizer.decode( [ int(n) for n in t.split('\t') ] ) for t in tqdm(dv.text_tokens.values) ] 
gc.collect()

HBox(children=(FloatProgress(value=0.0, max=15127684.0), HTML(value='')))


CPU times: user 10min 47s, sys: 6.55 s, total: 10min 54s
Wall time: 10min 47s


3

In [6]:
%%time
dt['tweet'] = [ tokenizer.decode( [ int(n) for n in t.split('\t') ] ) for t in tqdm(dt.text_tokens.values) ] 
gc.collect()

HBox(children=(FloatProgress(value=0.0, max=12434838.0), HTML(value='')))


CPU times: user 8min 56s, sys: 5.64 s, total: 9min 2s
Wall time: 8min 56s


23

In [11]:
%%time

df['tweet'] = df['tweet'].apply( lambda x: x.replace('https : / / t. co / ', 'https://t.co/') )
df['tweet'] = df['tweet'].apply( lambda x: x.replace('@ ', '@') )
df.head()

CPU times: user 4min 17s, sys: 7.84 s, total: 4min 25s
Wall time: 4min 24s


Unnamed: 0,text_tokens,id,tweet
0,101\t6417\t3410\t3398\t3184\t1909\t56910\t1683...,0,[CLS] 美 容 室 変 えどきかな 〜 [UNK] [SEP]
1,101\t14120\t131\t120\t120\t188\t119\t11170\t12...,1,[CLS] https://t.co/jbcBe1B5lP [SEP]
2,101\t62342\t10858\t54439\t19571\t22480\t7831\t...,2,[CLS] SNCタルコフ 部 門 企 業 説 明 会 始 めます. https://t.c...
3,101\t58955\t10898\t103305\t1901\t16181\t7168\t...,3,[CLS] ありがとう 〜 でも 言 いたい 事 を 言 いたい 時 に 言 っているだけな...
4,101\t2435\t5656\t2594\t8279\t8623\t1925\t64126...,4,[CLS] 免 疫 力 雑 魚 すぎるから 一 週 間 絶 対 外 でない [SEP]


In [12]:
%%time
dv['tweet'] = dv['tweet'].apply( lambda x: x.replace('https : / / t. co / ', 'https://t.co/') )
dv['tweet'] = dv['tweet'].apply( lambda x: x.replace('@ ', '@') )
dv.head()

CPU times: user 23.5 s, sys: 344 ms, total: 23.9 s
Wall time: 23.9 s


Unnamed: 0,text_tokens,id,tweet
0,101\t47185\t10157\t100986\t10343\t55422\t119\t...,148075238,[CLS] Funky techno Witch. https://t.co/YdfhIt7...
1,101\t6006\t5086\t1939\t7418\t3601\t6406\t1913\...,148075239,[CLS] 空 港 で 財 布 置 き 忘 れたら 偶 然 隣 座 ってた 方 がフォロワー...
2,101\t56898\t137\t44851\t10317\t11490\t10112\t1...,148075240,[CLS] RT @LegadoDeKonoha : Eis o verdadeiro si...
3,101\t13497\t10437\t94005\t11161\t73632\t11067\...,148075241,"[CLS] Para uma criança pequenina, que verá um ..."
4,101\t24781\t10152\t42041\t38268\t10301\t10798\...,148075242,[CLS] Why lesbian couples are more likely to d...


In [13]:
%%time
dt['tweet'] = dt['tweet'].apply( lambda x: x.replace('https : / / t. co / ', 'https://t.co/') )
dt['tweet'] = dt['tweet'].apply( lambda x: x.replace('@ ', '@') )

dt.head()

CPU times: user 21 s, sys: 276 ms, total: 21.2 s
Wall time: 21.2 s


Unnamed: 0,text_tokens,id,tweet
0,101\t3100\t5477\t3028\t4348\t1924\t111806\t186...,163202922,[CLS] 埼 玉 土 日 じゃん ！. 行 けないこともないか... ？ [UNK]. え...
1,101\t56898\t137\t36110\t10400\t168\t64062\t131...,163202923,[CLS] RT @meanie _ ark : いいね ・ RTでそれぞれポイントになるみ...
2,101\t56898\t137\t179\t36816\t10775\t40546\t513...,163202924,[CLS] RT @kwonjiyongbabe : this vip be hanging...
3,101\t56898\t137\t22038\t40663\t12892\t45389\t1...,163202925,[CLS] RT @massudessu13 : 너무좋아 [UNK].. # 박유천 ht...
4,101\t56898\t137\t11699\t10174\t10738\t37816\t1...,163202926,[CLS] RT @AtkArena : Welcome to our new CS : G...


In [14]:
# Write each split to its own parquet file: training -> train, validation ->
# test0, competition test -> test1.
for frame, out_path in (
    (df, 'train-tweet-1.parquet'),
    (dv, 'test0-tweet-1.parquet'),
    (dt, 'test1-tweet-1.parquet'),
):
    frame.to_parquet(out_path)