In [104]:
import numpy as np
import pandas as pd
from langid.langid import LanguageIdentifier, model
from typing import List
import mylib

In [118]:
# Display settings: effectively remove the column/row truncation limits and
# widen text cells so long titles are shown (up to 400 characters).
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 400)

# Treat +/-inf as missing values. `mode.use_inf_as_na` is deprecated in recent
# pandas and is not registered at all in newer releases, where setting it
# raises OptionError (a KeyError subclass). Guard it so the notebook still
# survives "Restart & Run All", and say so explicitly instead of failing
# silently.
try:
    pd.set_option("mode.use_inf_as_na", True)
except KeyError:
    print(
        "pandas option 'mode.use_inf_as_na' is unavailable in this pandas "
        "version; inf values will NOT be treated as NA."
    )

In [106]:
# Shared language identifier built from langid's bundled model string.
# norm_probs=True is requested so that classify() reports a normalised
# confidence (the outputs below are all within [0, 1]).
ld = LanguageIdentifier.from_modelstring(
    model,
    norm_probs=True,
)

In [107]:
def detect_lang(row) -> str:
    """Classify the language of a row's title.

    Looks up ``row["title"]``, passes it to the notebook-level ``ld``
    classifier, and returns the first two items of the classifier's result
    joined by a single space (``"<item0> <item1>"``).
    """
    prediction = ld.classify(row["title"])
    code, prob = prediction[0], prediction[1]
    return "{} {}".format(code, prob)

In [108]:
%%time
train = pd.read_csv("input/train.csv", engine="c", low_memory=False)
train["lang"] = train.apply(detect_lang, axis=1)
train[["lang", "lang_prob"]] = train["lang"].str.split(expand=True)
train["lang_prob"] = train["lang_prob"].astype(np.float32)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   posting_id   34250 non-null  object 
 1   image        34250 non-null  object 
 2   image_phash  34250 non-null  object 
 3   title        34250 non-null  object 
 4   label_group  34250 non-null  int64  
 5   lang         34250 non-null  object 
 6   lang_prob    34250 non-null  float32
dtypes: float32(1), int64(1), object(5)
memory usage: 1.7+ MB
CPU times: user 2min 1s, sys: 1min 1s, total: 3min 3s
Wall time: 32.8 s


In [109]:
# Columns used for the language-detection previews in this notebook.
cols = [
    "lang",
    "lang_prob",
    "title",
]
# Preview the first rows of the detection result next to the source title.
train.loc[:, cols].head()

Unnamed: 0,lang,lang_prob,title
0,en,0.263607,Paper Bag Victoria Secret
1,mt,0.286158,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO..."
2,en,0.924006,Maling TTS Canned Pork Luncheon Meat 397 gr
3,id,0.999651,Daster Batik Lengan pendek - Motif Acak / Camp...
4,fr,0.925362,Nescafe \xc3\x89clair Latte 220ml


In [110]:
# Rows whose detected language is not "en", highest detection score first.
nen = (
    train
    .loc[train["lang"].ne("en")]
    .sort_values(["lang_prob"], ascending=False)
)

In [119]:
# Top of the non-"en" rows (already sorted by descending lang_prob).
nen.loc[:, cols].head()

Unnamed: 0,lang,lang_prob,title
25139,ms,1.0,[ROUND] Taplak meja BULAT [152] /taplak meja alas penutup cover meja anti air anti panas anti sobek
33971,it,1.0,Keju Mozarella Perfetto KEMASAN ASLI 250gr Higienis / Mozzarella Cheese Perfetto
20253,de,1.0,Little Palmerhaus - Mittens & Booties - Sarung Tangan & Kaki Bayi
26496,sv,1.0,Lampu Led Basic Hannochs 3w/5 Watt 7w/7Watt 9w/9Watt 11w/11 Watt 14w/14 Watt 17w /17Watt Bolam
23955,it,1.0,Keju Mozarella Perfetto 250gr / Keju Leleh / Keju Lumer / Keju Pizza


In [112]:
# Spot-check: classify one full title on its own.
probe_title = "Tempelan kulkas magnet angka, tempelan angka magnet"
ld.classify(probe_title)

('tl', 0.9979885430902933)

In [113]:
# Spot-check: the same leading words as the previous probe, truncated to two
# words, to compare the label and score on a shorter input.
probe_title = "Tempelan kulkas"
ld.classify(probe_title)

('ms', 0.6127106422219508)

In [114]:
# Spot-check: an all-uppercase title.
probe_title = "PASHMINA KUSUT RAWIS POLOS CRINKLE SHAWL MURAH BANGET"
ld.classify(probe_title)

('es', 0.40662769962234463)

In [115]:
# Spot-check: a title that mixes words with sizes and a model code.
probe_title = "Kangaroo Teflon / Allu Fry Pan 18 cm - KG652"
ld.classify(probe_title)

('id', 0.3904426069276855)

In [116]:
# Spot-check: a long, repetitive title.
probe_title = (
    "CHOCO BALL LAGIE COKLAT Lagie Grosir Cokelat Lagie Murah Chocoball Kiloan Chocobal Coklat Kiloan 1Kg"
)
ld.classify(probe_title)

('en', 0.994568885141504)

In [117]:
# Spot-check: a single word taken from the previous title.
probe_title = "Grosir"  # wholesaler
ld.classify(probe_title)

('fr', 0.24941384800986108)