In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import gc
import torch
# Seed torch's global RNG so that any torch-side randomness is repeatable.
torch.manual_seed(0)
# cuDNN flags: allow non-deterministic algorithms and enable the autotuner.
# NOTE(review): the cells visible here only use torch for a plain matmul, so
# these cuDNN settings may have no effect on this notebook — confirm.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

In [None]:
# Decide which file is scored. When test.csv holds at most 3 rows the notebook
# runs on train.csv instead; otherwise test.csv itself is scored.
# NOTE(review): presumably the 3-row test.csv is a placeholder that is swapped
# for the real test set at submission time — confirm.
# The scored frame is always called `train` in the rest of the notebook.
test=pd.read_csv(r'../input/shopee-product-matching/test.csv')
if len(test)<=3:
    train=pd.read_csv(r'../input/shopee-product-matching/train.csv')
else:
    # Reuse the frame that is already in memory instead of parsing test.csv
    # a second time; `train` keeps the data alive even after `test` is deleted.
    train=test

train.shape

In [None]:
# `test` is not referenced by any later cell: drop the name and run a
# collection pass. As the last expression of the cell, gc.collect() displays
# the number of unreachable objects it found.
del test
gc.collect()

In [None]:
import re
def preprocess(description):
    """Normalise a title string before it is vectorised.

    Steps, applied in this order:
      1. lowercase the text;
      2. turn every run of hyphens, newlines and tabs into one space;
      3. expand English contractions ("won't" -> "will not", "'s" -> " is", ...);
      4. replace every run of characters outside ``a-z0-9`` with one space;
      5. collapse whitespace runs and strip both ends.

    The contraction step is mostly a safeguard: the inputs are titles, which
    rarely contain conversational words.

    Args:
        description: The raw text. Must be a ``str``; anything without a
            ``.lower()`` method raises ``AttributeError``.

    Returns:
        The cleaned string, made only of ``a-z``, ``0-9`` and single spaces.
    """
    # Order matters: the specific forms ("won't", "can't") must be rewritten
    # before the generic "n't", and "n't" before the bare "'t".
    contractions=(
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )

    description=description.lower()
    description=re.sub('[-\n\t]+',' ',description)
    for pattern,replacement in contractions:
        description=re.sub(pattern,replacement,description)
    description=re.sub('[^a-z0-9]+',' ',description)
    # Raw string: a plain '\s+' is an invalid escape sequence and triggers a
    # warning at compile time on current Python versions.
    description=re.sub(r'\s+',' ',description)
    return description.strip()

In [None]:
# Run every title through preprocess(), keeping the original row order.
clensed_train=[]
for raw_title in tqdm(train.title.values):
    clensed_train.append(preprocess(raw_title))

In [None]:
# Fit a default TF-IDF model on the cleaned titles, then densify the result
# so it can be handed to torch in the next cell.
tfidf=TfidfVectorizer()
tfidf_sparse=tfidf.fit_transform(clensed_train)
embedded_train=tfidf_sparse.toarray()
embedded_train.shape

In [None]:
# Turn the dense TF-IDF matrix into a torch tensor and move it to the GPU
# when one is usable.
# torch.as_tensor accepts both a NumPy array and an existing tensor, so
# re-running this cell is harmless (torch.from_numpy would raise on a tensor).
embedded_train=torch.as_tensor(embedded_train)
if torch.cuda.is_available():
    try:
        embedded_train=embedded_train.cuda()
    except RuntimeError as err:
        # Best-effort fallback (e.g. CUDA out of memory): keep the tensor on
        # the CPU, but say why instead of hiding the error.
        print(f'Exception: could not move embeddings to the GPU, staying on CPU ({err})')
else:
    print('CUDA is not available, similarities will be computed on the CPU')

In [None]:
# Build, for every posting, the space-separated list of posting_ids whose
# TF-IDF vector is similar enough to its own.
# TfidfVectorizer L2-normalises rows by default, so the dot product computed
# below is the cosine similarity.
matches=[]
chunksize=512      # postings scored per matmul, bounds the size of `cossim`
threshold=0.55     # minimum similarity for two postings to count as a match
max_matches=50     # hard cap on ids per posting
                   # NOTE(review): presumably the submission limit — confirm
posting_ids=train.posting_id.values   # positional lookup, hoisted out of the loop
chunks=(len(train)+chunksize-1)//chunksize   # ceil(len(train)/chunksize)
for chunk in tqdm(range(chunks)):
    start=chunk*chunksize
    end=min(len(train),start+chunksize)
    # (N, D) @ (D, c) -> (N, c), transposed so each row is one posting of the chunk.
    cossim=torch.matmul(embedded_train,embedded_train[start:end].T).T
    cossim=cossim.detach().cpu().numpy()
    for per_posting in cossim:
        indices=np.where(per_posting>=threshold)[0]
        if len(indices)>max_matches:
            # Keep exactly the `max_matches` most similar candidates. Comparing
            # against the 50th-largest value with `>=` could let extra ids
            # through when several postings tie at the cut-off.
            strongest=np.argsort(-per_posting[indices],kind='stable')[:max_matches]
            # Restore ascending row order, as in the uncapped case.
            indices=np.sort(indices[strongest])
        matches.append(' '.join(posting_ids[indices].tolist()))

In [None]:
# Store the match strings as a new column; `matches` holds one entry per row
# of `train`, appended in row order by the chunked loop above.
train['matches']=matches
train.head()

In [None]:
# Keep only the two columns that go into the output file.
submission_columns=['posting_id','matches']
submission=train[submission_columns]
submission.head()

In [None]:
# Only `submission` is used from here on: drop the embedding matrix and the
# source frame, then run a collection pass (the cell displays the number of
# unreachable objects gc.collect() found).
del embedded_train,train
gc.collect()

In [None]:
# Write the result to submission.csv in the working directory; index=False
# leaves out the DataFrame index so the file has only the selected columns.
submission.to_csv('submission.csv',index=False)