In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')  # required for word_tokenize
import tqdm
from scml.nlp import to_ascii_str, expand_contractions, strip_punctuation, count_digit
import mylib

[nltk_data] Downloading package punkt to /home/pankun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Notebook-wide pandas configuration, applied in one call: set_option takes
# alternating (option pattern, value) arguments and sets them left to right.
# NOTE(review): "use_inf_as_na" is flagged as deprecated in recent pandas
# releases — confirm against the pinned pandas version before upgrading.
pd.set_option(
    "use_inf_as_na", True,
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    'max_colwidth', 1000,
)

In [3]:
# Read the training table from disk with the C parser; low_memory=False makes
# pandas process the file in one pass instead of in internal chunks.
train = pd.read_csv(
    filepath_or_buffer="input/train.csv",
    low_memory=False,
    engine="c",
)
# Print the column / dtype / non-null summary as a quick sanity check.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


In [4]:
# Concatenate every product title into one whitespace-separated corpus string.
#
# The string is assembled with a single str.join instead of re-creating the
# accumulated f-string on every row: repeated concatenation copies the whole
# corpus each iteration, which is quadratic in the total text length.
#
# Each title is emitted as " <title>", so the result is character-for-character
# what the row-by-row loop produced (leading space included, "" for an empty
# frame), and f-string formatting keeps the same str() coercion for any
# non-string value in the column.
text = "".join(f" {title}" for title in train["title"])

In [5]:
%%time
# Normalise the corpus string, innermost call first: lower-case, then the
# scml.nlp helpers to_ascii_str -> expand_contractions -> strip_punctuation
# (what each helper does exactly is defined in scml — names suggest ASCII
# folding, contraction expansion and punctuation removal; confirm there).
text = strip_punctuation(expand_contractions(to_ascii_str(text.lower())))

# From here on `text` is the token sequence returned by nltk.word_tokenize,
# no longer a string. Gotcha: re-running this cell without first re-running
# the cell that builds the string fails at `.lower()`.
text = nltk.word_tokenize(text)

# Keep only tokens for which count_digit reports zero (presumably tokens
# containing no digit characters — verify against scml.nlp.count_digit).
text = [token for token in text if count_digit(token) == 0]

CPU times: user 3.45 s, sys: 15.6 ms, total: 3.47 s
Wall time: 3.49 s


In [6]:
# Show the first 100 tokens as a quick look at the cleaned corpus.
print("text={}".format(text[:100]))

text=['paper', 'bag', 'victoria', 'secret', 'double', 'tape', 'vhb', 'mm', 'x', 'm', 'original', 'double', 'foam', 'tape', 'maling', 'tts', 'canned', 'pork', 'luncheon', 'meat', 'gr', 'daster', 'batik', 'lengan', 'pendek', 'motif', 'acak', 'campur', 'leher', 'kancing', 'batik', 'karakter', 'alhadi', 'nescafe', 'latte', 'celana', 'wanita', 'bb', 'kgharem', 'wanita', 'bisa', 'cod', 'jubah', 'anak', 'size', 'thn', 'kulot', 'plisket', 'salur', 'candy', 'plisket', 'wish', 'kulot', 'premium', 'kulot', 'pelangi', 'premiumhieka', 'kulot', 'logu', 'tempelan', 'kulkas', 'magnet', 'angka', 'tempelan', 'angka', 'magnet', 'big', 'sale', 'sepatu', 'pantofel', 'kulit', 'keren', 'kerja', 'kantor', 'laki', 'pria', 'cowok', 'dinas', 'resmi', 'formal', 'pesta', 'kickers', 'atasan', 'rajut', 'wanita', 'lisdia', 'sweater', 'pashmina', 'kusut', 'rawis', 'polos', 'crinkle', 'shawl', 'murah', 'banget', 'pashmina', 'kusut', 'rawis', 'polos', 'crinkle']


In [7]:
# Make sure the four NLTK corpora used as the English reference vocabulary
# are available locally (downloads are skipped when already up to date).
for corpus_name in ('brown', 'reuters', 'gutenberg', 'webtext'):
    nltk.download(corpus_name)
from nltk.corpus import brown, reuters, gutenberg, webtext

# English reference set: the lower-cased union of every word in the corpora.
en_words = {
    word.lower()
    for corpus in (brown, reuters, gutenberg, webtext)
    for word in corpus.words()
}

# Distinct tokens seen in the product titles.
words = set(text)

# Title tokens that never occur in the English reference set. The `id_`
# prefix presumably stands for Indonesian; note this is only "not found in the
# reference corpora", so brand names and abbreviations also end up in here.
id_words = words - en_words
print(f"len(id_words)={len(id_words)}")

[nltk_data] Downloading package brown to /home/pankun/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package reuters to /home/pankun/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package gutenberg to /home/pankun/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package webtext to /home/pankun/nltk_data...
[nltk_data]   Package webtext is already up-to-date!


len(id_words)=14955


In [8]:
# Keep every occurrence (duplicates included) of tokens that belong to
# id_words, in their original order, so that frequencies can be counted.
tmp = list(filter(id_words.__contains__, text))

# Count occurrences per token and print the distribution's summary line.
fd = nltk.FreqDist(tmp)
print(fd)

<FreqDist with 14955 samples and 140662 outcomes>


In [9]:
# Full list of (word, count) pairs from the frequency distribution, most
# frequent first; preview the first 100 entries.
mc = fd.most_common()
print(str(mc[:100]))

[('wanita', 1844), ('murah', 1418), ('tas', 1238), ('pria', 1139), ('bayi', 1107), ('untuk', 1075), ('isi', 875), ('tangan', 864), ('kaos', 823), ('warna', 821), ('baju', 807), ('sepatu', 802), ('celana', 751), ('alat', 717), ('bahan', 701), ('polos', 683), ('rambut', 668), ('termurah', 582), ('bisa', 546), ('lampu', 534), ('mainan', 527), ('bpom', 501), ('hijab', 486), ('panjang', 478), ('sarung', 468), ('usb', 465), ('kain', 461), ('karakter', 454), ('sabun', 440), ('paket', 439), ('plastik', 433), ('hitam', 431), ('tempat', 422), ('jumbo', 410), ('gamis', 407), ('botol', 404), ('putih', 402), ('dengan', 401), ('kaki', 385), ('wajah', 384), ('jilbab', 383), ('grosir', 381), ('dompet', 373), ('tali', 364), ('pendek', 363), ('serbaguna', 348), ('kabel', 341), ('iphone', 338), ('buku', 335), ('tahun', 328), ('kotak', 323), ('pembersih', 311), ('lipat', 310), ('minyak', 309), ('bunga', 302), ('selempang', 299), ('kulit', 296), ('oppo', 292), ('mukena', 287), ('asli', 286), ('katun', 284)

In [10]:
# Export the ranked (word, count) pairs as tab-separated lines, one pair per
# line, in the order held by `mc`. Assumes the "output" directory exists.
with open("output/idwords.tsv", "w") as f:
    f.writelines(f"{word}\t{count}\n" for word, count in mc)

In [11]:
# Same frequency export, this time for the tokens that ARE in the English
# reference set. Gotcha: tmp, fd and mc are rebound here, so after this cell
# they no longer hold the non-English results computed earlier.
tmp = list(filter(en_words.__contains__, text))
fd = nltk.FreqDist(tmp)
mc = fd.most_common()

# One "<word>\t<count>" line per entry, most frequent first.
# Assumes the "output" directory exists.
with open("output/enwords.tsv", "w") as f:
    f.writelines(f"{word}\t{count}\n" for word, count in mc)