# Summary

When you use a CSR matrix to calculate the cosine similarity, you get the results roughly 10x faster than with a dense NumPy matrix.
- CSR (from scipy): around 2min
- Numpy: around 15min

In [None]:
import os
import gc
import string
import re
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GroupKFold

import nltk

In [None]:
### CONFIGURATION ###
# Settings for the text pipeline, grouped under the 'text' key.
cfg = {
    'text': {
        # Vectorizer kind; note that no visible code reads this entry
        # (vectorize_text relies on its own `how` default instead).
        'how': 'tfidf',
        # Similarity cut-off handed to main_cos_infer as `threshold`.
        'threshold': 0.6,
        # Forwarded to TfidfVectorizer(binary=...) in vectorize_text.
        'binary': True,
    },
}

In [None]:
### CONTROL ###
# DEBUG: when true, the frame is cut down to one GroupKFold validation fold.
DEBUG = False
# INFERENCE: when true, get_df reads the test CSV / test images instead of train.
INFERENCE = False

### CONSTANTS ###
# NOTE(review): SEED is not referenced by any code visible in this file.
SEED = 157

### PATHS ###
# Every dataset path shares the same competition input directory.
_DATA_DIR = '../input/shopee-product-matching'

TRAIN = _DATA_DIR + '/train.csv'
TEST = _DATA_DIR + '/test.csv'
SUB = _DATA_DIR + '/sample_submission.csv'

IMG_TRAIN = _DATA_DIR + '/train_images'
IMG_TEST = _DATA_DIR + '/test_images'

### SUB-FUNCTIONS ###
def get_df():
    """Load the competition table and attach an image path to every row.

    Reads the test CSV and test image directory when ``INFERENCE`` is truthy,
    otherwise the train CSV and train image directory.

    Returns:
        The loaded DataFrame with an extra ``img_path`` column made of the
        image directory, a ``/`` separator and the row's ``image`` value.
    """
    if INFERENCE:
        csv_path, image_dir = TEST, IMG_TEST
    else:
        csv_path, image_dir = TRAIN, IMG_TRAIN

    frame = pd.read_csv(csv_path)
    frame['img_path'] = image_dir + '/' + frame['image']
    return frame

def vectorize_text(text, how='tfidf'):
    """Fit a bag-of-words vectorizer on ``text`` and return a dense matrix.

    Args:
        text: Iterable of documents passed to ``fit_transform``.
        how: ``'tfidf'`` selects a TfidfVectorizer (English stop words, at
            most 25,000 features, ``binary`` taken from ``cfg['text']``);
            any other value selects a CountVectorizer with English stop words.

    Returns:
        The fitted document-term matrix converted with ``.toarray()``.
    """
    if how == 'tfidf':
        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=25_000,
            binary=cfg['text']['binary'],
        )
    else:
        vectorizer = CountVectorizer(stop_words='english')

    return vectorizer.fit_transform(text).toarray()
    
def f1_score(y_true, y_pred):
    """Compute a row-wise F1 score between two Series of id strings.

    Each element of both Series is a whitespace-separated string; it is split
    into a set of tokens before comparison.

    Args:
        y_true: pandas Series of ground-truth strings.
        y_pred: pandas Series of predicted strings, aligned with ``y_true``.

    Returns:
        A NumPy array holding, per row, ``2 * |true & pred| / (|true| + |pred|)``.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))

    # Size of the overlap between truth and prediction, one entry per row.
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])

    n_pred = pred_sets.apply(len).values
    n_true = true_sets.apply(len).values

    return 2 * overlap / (n_pred + n_true)

In [None]:
### FUNCTION TO BE IMPLEMENTED ###
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # A frozenset gives O(1) membership tests; caching avoids asking the
        # corpus reader for the word list again on every call.
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess_text(text, flg_stemm=False, flg_lemm=True):
    """Normalise one piece of text into a cleaned, space-joined token string.

    Steps, in order: lowercase and strip the input, drop every character
    that is neither a word character nor whitespace, split on whitespace,
    remove English stop words, then optionally stem and/or lemmatise.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        flg_stemm: When truthy, apply NLTK's PorterStemmer to every token.
        flg_lemm: When truthy, apply NLTK's WordNetLemmatizer to every token.

    Returns:
        The remaining tokens joined with single spaces.
    """
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list) and remove Stopwords
    stopwords = _english_stopwords()
    lst_text = [word for word in text.split() if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [None]:
class TextClassify():
    """Group rows whose title vectors are similar enough to count as matches.

    The frame passed to the constructor is kept by reference and modified in
    place: its ``matches`` column ends up holding, per row, the
    space-separated ``posting_id`` values of every row whose similarity to
    that row exceeds the threshold.
    """

    def __init__(self, df):
        """Store ``df`` and vectorize its ``title`` column.

        Args:
            df: Frame with ``posting_id`` and ``title`` columns. It gains a
                ``matches`` column initialised to ``posting_id``.
        """
        self.df = df
        self.df['matches'] = self.df['posting_id']

        self.X_train = vectorize_text(self.df['title'])

    def find_similar_titles(self, threshold_cosine=0.7, use_csr=True):
        """Fill ``self.df['matches']`` from thresholded pairwise dot products.

        Rows are processed in blocks of 4096 so that only a
        ``block x num_rows`` similarity slab is materialised at a time.

        NOTE(review): the dot product equals the cosine similarity only if
        the rows of ``self.X_train`` are L2-normalised -- presumably ensured
        by the vectorizer; confirm.

        Args:
            threshold_cosine: Two rows match when their dot product is
                strictly greater than this value.
            use_csr: When true, convert ``self.X_train`` to a scipy
                ``csr_matrix`` before multiplying; otherwise multiply the
                array as stored.
        """
        X = csr_matrix(self.X_train) if use_csr else self.X_train  # <- CSR or not

        num_rows = X.shape[0]
        chunk = 1024 * 4

        # Work positionally on a plain array of ids so the result does not
        # depend on the frame's index labels.
        posting_ids = self.df['posting_id'].to_numpy()
        matches = []

        # Stepping by `chunk` never yields an empty trailing block, even when
        # num_rows is an exact multiple of the block size.
        for row_start in range(0, num_rows, chunk):
            row_end = min(row_start + chunk, num_rows)
            print('chunk: ', row_start, '-> ', row_end)

            X_chunk_T = X[row_start:row_end].transpose()
            # (num_rows x block) product, transposed to (block x num_rows).
            cos_sim = (X @ X_chunk_T).transpose()
            if use_csr:
                cos_sim = cos_sim.toarray()
            cos_sim = cos_sim > threshold_cosine

            matches.extend(' '.join(posting_ids[row_mask]) for row_mask in cos_sim)

        # Single positional assignment; a per-row `.loc[row_num, ...]` write
        # would append new rows whenever row_num is not an existing label.
        self.df['matches'] = matches

In [None]:
### GET DATAFRAME ###
# Load the table, then clean every title before vectorization.
df = get_df()
df['title'] = df['title'].apply(preprocess_text)

if DEBUG:
    # Shrink the frame to the first validation fold only, keeping whole
    # label groups together and renumbering the rows from zero.
    gkf = GroupKFold(n_splits=10)
    train_idx, val_idx = next(
        gkf.split(df['posting_id'], df['label_group'], groups=df['label_group'])
    )
    df = df.iloc[val_idx].reset_index(drop=True)

In [None]:
def main_cos_infer(df, threshold, use_csr=True):
    """Build a TextClassify model on ``df`` and run its title matching.

    Args:
        df: Frame handed to ``TextClassify``.
        threshold: Passed on as ``threshold_cosine``.
        use_csr: Passed on unchanged to ``find_similar_titles``.

    Returns:
        None; the model object is not kept.
    """
    TextClassify(df).find_similar_titles(
        threshold_cosine=threshold,
        use_csr=use_csr,
    )

## Using csr_matrix

In [None]:
%%time
# Timed run on the sparse path: builds the model (which vectorizes the
# titles) and thresholds the chunked products computed on a CSR matrix.
main_cos_infer(df, threshold=cfg['text']['threshold'], use_csr=True)

## Using numpy matrix

In [None]:
%%time
# Timed run on the dense path: same pipeline and threshold, but the products
# are computed on the array as returned by the vectorizer (no CSR conversion).
main_cos_infer(df, threshold=cfg['text']['threshold'], use_csr=False)