# About This Notebook

## Preprocess

In this notebook, I tried TF-IDF with text preprocessing. <br> 

- remove numbers 
- remove stopwords 
- remove words shorter than 3 characters 
- translate Indonesian words to English with the dictionary [from this discussion](https://www.kaggle.com/c/shopee-product-matching/discussion/228358) 

## How to split CV 

My CV split comes from [this notebook](https://www.kaggle.com/tmhrkt/shopee-cv-splitting-way). 

## CV score 
Preprocessing improved the CV score. 

|fold|f1score in cv|
|:--:|:--:|
|0|0.7426|
|1|0.73746|
|2|0.74517|
|3|0.72959|
|4|0.74731| 

## LB score 
|th|lb|
|:--:|:--:|
|0.60||
|0.65|0.602|
|0.70|0.596|
|0.75|0.585|
|0.80|0.568|

# Configuration

In [None]:
# When True, the dataset is down-sampled to 5 rows and the cleaning
# functions print every word they process.
DEBUG = False 
# NOTE(review): INPUT_DIR / OUT_DIR are not referenced by the cells visible
# in this notebook; the read/write paths below are hard-coded instead.
INPUT_DIR = "../input/shopee-product-matching/" 
OUT_DIR = "./"

# Notebook id and version; both are embedded in the names of the pickle dumps.
NB = "11"
VERSION = 2
# Number of row chunks used when computing the similarity matrix.
N_BATCH = 10  
# "GPU" selects the cudf / cuml / cupy code path; any other value uses
# scikit-learn and numpy.
TYPE = "GPU"
# True: read test.csv and write submission.csv.
# False: read the fold file, compute the CV score and dump artifacts.
SUBMIT = True 

class CFG:
    # Passed to TfidfVectorizer(max_features=...).
    max_features = 25000
    # Two titles match when their similarity is strictly greater than this.
    th = 0.60      
    
    # Toggles for the three optional title-cleaning steps.
    remove_stopword = True
    translate = True 
    stemmer = False 
    # Fold to evaluate when SUBMIT is False; a negative value keeps all rows.
    fold = -1

# Library

In [None]:
import numpy as np
import pandas as pd
import pickle,gc
import time 
import cv2, matplotlib.pyplot as plt

import nltk

if TYPE == "GPU":
    import cudf, cuml, cupy
    from cuml.feature_extraction.text import TfidfVectorizer
else:
    from sklearn.feature_extraction.text import TfidfVectorizer

# Utils

In [None]:
class Timer():
    """Tiny stopwatch that prints elapsed wall-clock time for numbered steps.

    Requirements: ``import time``.

    ``cnt`` is the index of the current measurement; it is printed by both
    ``start`` and ``stop`` and incremented once per ``stop`` call.
    """

    def __init__(self):
        self.cnt = 0

    def start(self):
        """Record the current time and announce the start of measurement ``cnt``."""
        self.start_time = time.time()
        print(f"Time{self.cnt} START ")

    def stop(self):
        """Print the whole seconds elapsed since ``start`` as h/m/s and bump ``cnt``."""
        elapsed = int(time.time() - self.start_time)
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        print(f"Time{self.cnt} : {hours}h {minutes}m {seconds}s")
        self.cnt += 1

class Logger():
    """Minimal helper for persisting Python objects with pickle.

    Requirements: ``import pickle``.
    """

    def __init__(self):
        pass

    def dump(self, obj, dir_name):
        """Pickle ``obj`` to the file at ``dir_name``.

        Despite its name, ``dir_name`` is the path of the output *file*; it is
        opened in binary write mode, so an existing file is overwritten.
        """
        # The context manager guarantees the handle is flushed and closed even
        # if pickling fails (the previous code referenced ``f.close`` without
        # calling it, so the file was never explicitly closed).
        with open(dir_name, "wb") as f:
            pickle.dump(obj, f)

# Dataset

In [None]:
# Load the frame to work on: the competition test set when submitting,
# otherwise the pre-split training folds (optionally a single fold).
if SUBMIT:
    df = pd.read_csv("../input/shopee-product-matching/test.csv")
else:
    train = pd.read_csv("../input/shopee-fold/train_folds.csv")
    # A negative fold id keeps every row; otherwise keep only that fold.
    df = train if CFG.fold < 0 else train[train["fold"] == CFG.fold].reset_index(drop=True)
    # Ground truth per row: every posting_id that shares the row's label_group.
    tmp = df.groupby("label_group")["posting_id"].agg("unique").to_dict()
    df["target"] = df["label_group"].map(tmp)

if DEBUG:
    # Tiny sample so the per-word debug prints stay readable.
    df = df.sample(n=5).reset_index(drop=True)

print('df shape is', df.shape)
df.head(5)

# Metric

In [None]:
def row_wise_f1_score(labels,preds):
    """Compute the F1 score of each row and the mean over all rows.

    For every (label, pred) pair the score is
    ``2 * |label ∩ pred| / (|label| + |pred|)``.

    Args:
        labels: iterable of array-likes with the true ids of each row.
        preds: iterable of array-likes with the predicted ids of each row.

    Returns:
        A tuple ``(scores, mean_score)`` where ``scores`` is the list of
        per-row scores and ``mean_score`` is ``np.mean(scores)``.
    """
    per_row = [
        2 * len(np.intersect1d(truth, guess)) / (len(truth) + len(guess))
        for truth, guess in zip(labels, preds)
    ]
    return per_row, np.mean(per_row)

# Preprocess

## Stopword

In [None]:
# from nltk.corpus import stopwords
# stopwords = stopwords.words("english")
# + 
# [ '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/',  '\\', ':', ';', '<', '=', '>','?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '\t','\n',"'",",",'~' , '—']
# + 
# ["she's", 'at', 'further', "should've", 'can', 'it', 'theirs', 'me', 'not', 'that', 'any', 'itself', 'did', 'such', 'as', 'x8f', 'both', 'having', 'under', "shan't", 'should', "mustn't", 'with', 'x87', 'whom', 'couldn', 'be', "weren't", 'myself', 'd', "didn't", 've', 'have', 'up', 'same', 'above', 'all', 'after', 'so', "aren't", 'his', 'of', 'between', 'll', 'what', 'those', "wouldn't", 'your', 'when', 'haven', "won't", 'below', 'her', 'she', 'until', 'why', 'its', 'down', 'the', 'here', 'where', 'has', 'own', 'wouldn', 'aren', "don't", 'doing', 'xb8', 'themselves', 'this', 'there', 'how', 's', 'don', 'ain', 'x9f', 'to', 'xc3', 'now', 't', 'again', 'is', "you've", "doesn't", "wasn't", 'during', 'x9d', 'xa2', "isn't", 'very', 'x90', 'through', 'from', 'ourselves', 'in', 'out', 'on', 'are', "couldn't", 'didn', 'an', 'ma', 'do', 'been', 'they', "it's", 'mightn', 'y', "you're", "haven't", 'each', 'because', "hadn't", 'other', 'their', 'my', 'off', "needn't", 'was', 'hers', 'some', 'weren', 'xa4', 'were', 'or', 'shan', 'hasn', 'a', 'he', 'no', 'over', 'xe2', 'xef', 'before', 'by', 'will', 'i', 'about', 'am', 'our', 'shouldn', 'just', 'xf0', 're', 'had', 'who', 'm', 'hadn', 'you', "shouldn't", 'won', 'while', 'yourselves', 'but', 'needn', 'against', 'too', "you'll", 'yours', 'being', 'does', 'xad', 'x80', "that'll", 'few', 'we', 'herself', 'him', 'then', 'himself', 'isn', 'than', "mightn't", 'for', 'once', 'them', 'these', 'more', 'and', 'doesn', 'mustn', 'most', "hasn't", 'xbf', 'wasn', "you'd", 'into', 'which', 'nor', 'if', 'ours', 'o', 'yourself', 'only', 'x89']
# + 
# unit 
# + 
# single character 

# English stopwords (the commented-out note above suggests they were copied
# from nltk.corpus.stopwords.words("english") -- TODO confirm).
a = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])
# Single characters (punctuation, tab, newline, em dash) that remove_stopwords
# strips out of every word. Unlike the commented-out list above, '#' is not
# included here.
b = set([ '!', '"', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/',  '\\', ':', ';', '<', '=', '>','?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '\t','\n',"'",",",'~' , '—'])
# Second stopword list. Besides English words it contains tokens such as
# 'x8f' or 'xc3' (presumably residue of escaped bytes in titles -- verify).
c = set(["she's", 'at', 'further', "should've", 'can', 'it', 'theirs', 'me', 'not', 'that', 'any', 'itself', 'did', 'such', 'as', 'x8f', 'both', 'having', 'under', "shan't", 'should', "mustn't", 'with', 'x87', 'whom', 'couldn', 'be', "weren't", 'myself', 'd', "didn't", 've', 'have', 'up', 'same', 'above', 'all', 'after', 'so', "aren't", 'his', 'of', 'between', 'll', 'what', 'those', "wouldn't", 'your', 'when', 'haven', "won't", 'below', 'her', 'she', 'until', 'why', 'its', 'down', 'the', 'here', 'where', 'has', 'own', 'wouldn', 'aren', "don't", 'doing', 'xb8', 'themselves', 'this', 'there', 'how', 's', 'don', 'ain', 'x9f', 'to', 'xc3', 'now', 't', 'again', 'is', "you've", "doesn't", "wasn't", 'during', 'x9d', 'xa2', "isn't", 'very', 'x90', 'through', 'from', 'ourselves', 'in', 'out', 'on', 'are', "couldn't", 'didn', 'an', 'ma', 'do', 'been', 'they', "it's", 'mightn', 'y', "you're", "haven't", 'each', 'because', "hadn't", 'other', 'their', 'my', 'off', "needn't", 'was', 'hers', 'some', 'weren', 'xa4', 'were', 'or', 'shan', 'hasn', 'a', 'he', 'no', 'over', 'xe2', 'xef', 'before', 'by', 'will', 'i', 'about', 'am', 'our', 'shouldn', 'just', 'xf0', 're', 'had', 'who', 'm', 'hadn', 'you', "shouldn't", 'won', 'while', 'yourselves', 'but', 'needn', 'against', 'too', "you'll", 'yours', 'being', 'does', 'xad', 'x80', "that'll", 'few', 'we', 'herself', 'him', 'then', 'himself', 'isn', 'than', "mightn't", 'for', 'once', 'them', 'these', 'more', 'and', 'doesn', 'mustn', 'most', "hasn't", 'xbf', 'wasn', "you'd", 'into', 'which', 'nor', 'if', 'ours', 'o', 'yourself', 'only', 'x89'])
# Measurement-unit abbreviations.
# NOTE(review): `unit` is not referenced by any cell visible in this notebook.
unit = set(["cm","gr","mm","m","kg","ml","g","mg","l"]) 

# Characters deleted from inside words, and the union of both stopword lists;
# both names are read by remove_stopwords and pickled later when not submitting.
remove_word = b
stopwords = a|c 

def remove_stopwords(text):
    """Return ``text`` lower-cased with noise words removed.

    Each whitespace-separated word is lower-cased and stripped of every
    character listed in ``remove_word``. The word is then dropped when it is
    purely numeric, is contained in ``stopwords``, or is shorter than three
    characters. The surviving words are joined with single spaces.

    When ``DEBUG`` is set, every lower-cased word is printed before cleaning.
    """
    kept = []
    for raw in text.split():
        token = raw.lower()
        if DEBUG:
            print(token)
        # Delete unwanted characters from inside the word.
        token = "".join(ch for ch in token if ch not in remove_word)
        if token.isnumeric() or token in stopwords or len(token) < 3:
            continue
        kept.append(token)
    return " ".join(kept)

## translate

In [None]:
# Indonesian -> English word map used by translate().
# The original literal was two concatenated lists containing 44 repeated keys;
# Python keeps the LAST value for a repeated key, so three conflicting entries
# were silently resolved as: "bahan" -> "ingredient" (not "material"),
# "wanita" -> "women" (not "woman") and "sarung" -> "scabbard" (not "holster").
# Each key is now written once with that effective value, in the same
# insertion order, so the resulting mapping is unchanged.
dictionary = {
    "wanita": "women",
    "anak": "child",
    "bayi": "baby",
    "tas": "bag",
    "masker": "face mask",
    "pria": "men",
    "murah": "cheap",
    "tangan": "hand",
    "alat": "tool",
    "motif": "motive",
    "warna": "color",
    "bahan": "ingredient",
    "celana": "pants",
    "baju": "clothes",
    "kaos": "t-shirt",
    "sepatu": "shoes",
    "rambut": "hair",
    "mainan": "toy",
    "sarung": "scabbard",
    "polos": "plain",
    "rak": "rack",
    "botol": "bottle",
    "sabun": "soap",
    "kain": "fabric",
    "panjang": "long",
    "kabel": "cable",
    "buku": "book",
    "plastik": "plastic",
    "mobil": "car",
    "hitam": "black",
    "karakter": "character",
    "putih": "white",
    "dompet": "purse",
    "kaki": "feet",
    "pembersih": "cleaners",
    "lipat": "folding",
    "silikon": "silicone",
    "minyak": "oil",
    "isi": "contents",
    "paket": "package",
    "susu": "milk",
    "gamis": "robe",
    "mandi": "bath",
    "madu": "honey",
    "kulit": "skin",
    "serbaguna": "multipurpose",
    "bisa": "can",
    "kacamata": "spectacles",
    "pendek": "short",
    "tali": "rope",
    "selempang": "sash",
    "topi": "hat",
    "obat": "drug",
    "gantungan": "hanger",
    "tahun": "year",
    "jilbab": "hijab",
    "dapur": "kitchen",
    "dinding": "wall",
    "kuas": "brush",
    "perempuan": "woman",
    "katun": "cotton",
    "sepeda": "bike",
    "lucu": "funny",
    "lengan": "arm",
    "kaca": "glass",
    "garansi": "warranty",
    "bunga": "flower",
    "handuk": "towel",
    "dewasa": "adult",
    "elektrik": "electric",
    "timbangan": "balance",
    "besar": "big",
    "ransel": "backpack",
    "kertas": "paper",
    "untuk": "to",
    "grosir": "wholesaler",
    "lampu": "light",
    "dan": "and",
    "tempat": "the place",
    "wajah": "face",
    "termurah": "cheapest",
    "dengan": "with",
}
def translate(text):
    """Replace every word of ``text`` that has an entry in ``dictionary``.

    Words are split on whitespace and lower-cased; a word found in
    ``dictionary`` is replaced by its mapped value (which may itself contain
    several words), any other word is kept in its lower-cased form. The
    result is joined with single spaces.

    When ``DEBUG`` is set, every lower-cased word is printed.
    """
    translated = []
    for token in text.split():
        key = token.lower()
        if DEBUG:
            print(key)
        translated.append(dictionary.get(key, key))
    return " ".join(translated)

## stemmer

In [None]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the result.

    ``text`` is handed to the lemmatizer as a single token, exactly as given.

    The original body referenced ``stemmer`` and ``WordNetLemmatizer`` although
    neither is defined or imported in the visible notebook, so enabling
    ``CFG.stemmer`` raised ``NameError``. Both are now resolved here.
    """
    # Local import: the notebook's import cell only does ``import nltk``.
    from nltk.stem import SnowballStemmer, WordNetLemmatizer

    try:
        # Honour a module-level ``stemmer`` if one is defined elsewhere.
        active_stemmer = stemmer
    except NameError:
        # NOTE(review): the intended stemmer is not shown anywhere in this
        # notebook; the English Snowball stemmer is an assumption -- confirm.
        active_stemmer = SnowballStemmer("english")
    return active_stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

## data cleansing

In [None]:
timer = Timer()
logger = Logger()


def _apply_title_step(message, transform):
    """Run one cleaning step over every title of ``df`` and time it.

    ``message`` is printed after the timer starts; ``transform`` maps one
    title string to its cleaned form. The whole ``title`` column is replaced
    in a single assignment instead of one ``df.loc`` write per row, which is
    much faster and does not depend on the index being 0..n-1.
    """
    timer.start()
    print(message)
    cleaned = []
    for i, text in enumerate(df.title.values):
        if DEBUG:
            print("-"*100)
            print(f"[{i}]")
        cleaned.append(transform(text))
    df["title"] = cleaned
    timer.stop()


# The steps run in this fixed order; each one sees the output of the previous.
if CFG.remove_stopword:
    _apply_title_step("Remove stopword...", remove_stopwords)

if CFG.translate:
    _apply_title_step("Translate...", translate)

if CFG.stemmer:
    _apply_title_step("Stemming...", lemmatize_stemming)

if not SUBMIT:
    # Persist the vocabulary artifacts used by this run.
    logger.dump(remove_word,f"remove_word_nb{NB}_ver{VERSION}.pkl")
    logger.dump(stopwords,f"stopword_nb{NB}_ver{VERSION}.pkl")
    logger.dump(dictionary,f"dictionary_nb{NB}_ver{VERSION}.pkl")

df.title.head()

# TF-IDF

In [None]:
# On the GPU path, build a cuDF copy of the frame; its `title` column is what
# the cuML TfidfVectorizer is fitted on in the next cell.
if TYPE == "GPU":
    df_gf = cudf.DataFrame(df) 

In [None]:
# Fit a binary TF-IDF model on the cleaned titles and densify the result.
# `TfidfVectorizer` is the cuML or the scikit-learn class depending on TYPE
# (see the import cell); the constructor arguments are the same for both.
print('Computing text embeddings...')
model = TfidfVectorizer(stop_words='english', binary=True, max_features=CFG.max_features)
titles = df_gf.title if TYPE == "GPU" else df.title
text_embeddings = model.fit_transform(titles).toarray()
print('text embeddings shape',text_embeddings.shape)

In [None]:
# For every row, collect the posting_ids whose title embedding has a dot
# product with the row's embedding greater than CFG.th. The rows are processed
# in N_BATCH chunks so the similarity matrix is never materialised at once.
# NOTE(review): treating the dot product as cosine similarity assumes the
# TF-IDF rows are L2-normalised (the vectorizer's default) -- confirm.
preds = []
n = df.shape[0]
bs = n//N_BATCH
print('Finding similar titles...')

# Array module matching where the embeddings live; `cupy` is only looked up
# on the GPU path, where it has been imported.
xp = cupy if TYPE == "GPU" else np
# Hoisted out of the loops: one positional lookup table for all rows.
posting_ids = df.posting_id.values

for i in range(N_BATCH):
    left = bs*i
    # The last chunk absorbs the remainder rows.
    right = n if i == N_BATCH - 1 else bs*(i + 1)
    print('chunk',left,'to',right)

    # COSINE SIMILARITY DISTANCE
    cts = xp.matmul(text_embeddings[left:right], text_embeddings.T)

    for k in range(right-left):
        matched = cts[k,] > CFG.th
        # A posting always matches itself. Without this, a title whose
        # embedding is all zeros (e.g. emptied by preprocessing) would get an
        # empty prediction and an empty `matches` entry in the submission.
        matched[left + k] = True
        IDX = xp.where(matched)[0]
        if TYPE == "GPU":
            IDX = cupy.asnumpy(IDX)
        o = posting_ids[IDX]
        preds.append(o)

In [None]:
# Outside of submission runs, pickle the embedding matrix; the file name
# carries the notebook id, version and fold.
if not SUBMIT:
    logger.dump(text_embeddings,f"embed_tfidf_nb{NB}_ver{VERSION}_fold{CFG.fold}.pkl")

In [None]:
# Drop the references to the vectorizer and the dense embedding matrix, then
# trigger a garbage-collection pass (presumably to release memory early).
del model, text_embeddings
_ = gc.collect()

In [None]:
# Keep the raw id arrays and a space-separated string version (the format
# written to the submission file) side by side.
df['preds_list'] = preds
df['preds'] = [" ".join(matched_ids) for matched_ids in df["preds_list"]]
df.head()

# Compute CV

In [None]:
if not SUBMIT:
    # Local validation: per-row F1 against the label_group ground truth.
    scores,score = row_wise_f1_score(df.target,preds)
    df["score"] = scores
    print(f"CV : f1score = {score}")
else:
    # Build the submission from `df` itself so every posting_id is paired with
    # the predictions computed for that very row. The previous version took
    # the ids from sample_submission.csv and aligned df["preds"] to them by
    # row position, which silently mis-pairs (or yields NaN) whenever that
    # file differs from test.csv in order or length.
    submission = df[["posting_id", "preds"]].rename(columns={"preds": "matches"})
    submission.to_csv("submission.csv",index=False)
    print(submission.head())