In [4]:
import gc, os, warnings, re, json, pickle
import torch
import torch.nn as nn

import pandas as pd
import matplotlib.pyplot as plt

from datasets import get_dataset_config_names, load_dataset
from transformers import AutoTokenizer, AutoConfig, AutoModel
from transformers import AutoModelForQuestionAnswering,BitsAndBytesConfig
from peft import PeftType, TaskType
from peft import get_peft_config, get_peft_model, LoraConfig, replace_lora_weights_loftq
from peft import PromptEncoderConfig, PromptEncoder

import streamlit
from torch.utils.data import Dataset
from torch import Tensor
from elasticsearch import Elasticsearch

from typing import Any, Dict
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["LRU_CACHE_CAPACITY"] = "4096"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.8, max_split_size_mb:32"

In [5]:
""" set default device to mps """

device = torch.device("cuda" if torch.cuda.is_available() else "mps")
device

device(type='mps')

In [6]:
""" Connect to Elastic Search Server """

try:
    es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", "IJ_+q4tOV1vCl-Yt51U1"),
    ca_certs="/Users/qcqced/Desktop/ElasticSearch/elasticsearch-8.12.2/config/certs/http_ca.crt"
    )
    print(es.ping())

except ConnectionError as e:
    print("Connection Error:", e)

False


In [10]:
""" helper function for loading the dataset  """

def load_pkl(filepath: str) -> Any:
    """ Load pickle file

    Examples:
        filepath = './dataset_class/data_folder/train.pkl'
    """
    with open(f'{filepath}', 'rb') as file:
        output = pickle.load(file)
    return output

def load_json(filepath: str) -> Any:
    """ Load json file

    Examples:
        filepath = './dataset_class/data_folder/train.json'
    """
    with open(f'{filepath}', 'r') as file:
        output = json.load(file)
    return output


def load_parquet(filepath: str) -> Dict:
    """ Load parquet file

    Examples:
        filepath = './dataset_class/data_folder/train.parquet'
    """
    output = pd.read_parquet(filepath).to_dict()
    return output


def load_csv(filepath: str) -> pd.DataFrame:
    """ Load csv file

    Examples:
        filepath = './dataset_class/data_folder/train.csv'
    """
    output = pd.read_csv(filepath).to_dict()
    return output


def load_all_types_dataset(path: str) -> Dict:
    """ Load all pickle files from folder

    Args:
        path: path in your local directory

    Examples:
        load_all_types_dataset('./data_folder/squad2/train.json')
        load_all_types_dataset('./data_folder/yahoo_qa/test.csv')
        load_all_types_dataset('./data_folder/yelp_review/train_0.parquet')

    All of file types are supported: json, csv, parquet, pkl
    And Then, they are converted to dict type in python
    """
    file_types = path.split('.')[-1]
    if file_types == 'pkl': output = load_pkl(path)
    elif file_types == 'json': output = load_json(path)
    elif file_types == 'parquet': output = load_parquet(path)
    elif file_types == 'csv': output = load_csv(path)
    return output


In [11]:
""" Helper function for cleansing & normalizing the text """

def no_char(text):
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)
    text = re.sub(r"\^[a-zA-Z]\s+", " ", text)
    text = re.sub(r"\s+[a-zA-Z]$", " ", text)
    return text


def no_multi_spaces(text):
    return re.sub(r"\s+", " ", text, flags=re.I)


def underscore_to_space(text: str):
    text = text.replace("_", " ")
    text = text.replace("-", " ")
    return text


def preprocess_text(source):
    """ Remove all the special characters
    """
    source = re.sub(r'\W', ' ', str(source))
    source = re.sub(r'^b\s+', '', source)
    source = source.lower()
    return source


def cleaning_words(text: str) -> str:
    """ Apply all of cleaning process to text data
    """
    tmp_text = underscore_to_space(text)
    tmp_text = no_char(tmp_text)
    tmp_text = preprocess_text(tmp_text)
    tmp_text = no_multi_spaces(tmp_text)
    return tmp_text

In [15]:
""" Merge Two DataFrames """

df1 = pd.DataFrame.from_dict(load_all_types_dataset('./product_search_tutorial/myntra1.parquet'))
df2 = pd.DataFrame.from_dict(load_all_types_dataset('./product_search_tutorial/myntra2.parquet'))

df = pd.concat([df1, df2], axis=0)
df

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,image
0,16149,Women,Apparel,Topwear,Tshirts,White,Fall,2011.0,Casual,Tokyo Talkies Women Printed White T-shirt,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...
1,9484,Unisex,Accessories,Bags,Backpacks,Black,Fall,2011.0,Casual,Puma Unisex Ferrari Replica Black Backpacks,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...
2,19101,Women,Apparel,Topwear,Kurtas,Blue,Fall,2011.0,Ethnic,Mother Earth Women Solid Sea Blue Kurta,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...
3,8556,Men,Apparel,Topwear,Tshirts,Orange,Fall,2011.0,Casual,Probase Men Speed Ski the Best performer Orang...,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...
4,16975,Men,Footwear,Sandal,Sandals,Grey,Fall,2011.0,Casual,Puma Men Rover Grey Sandal,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...
...,...,...,...,...,...,...,...,...,...,...,...
22031,19333,Women,Apparel,Topwear,Jackets,Navy Blue,Fall,2011.0,Casual,United Colors of Benetton Women Solid Navy Blu...,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...
22032,28646,Women,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Sports,Nike Women Bleach Grey T-shirt,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...
22033,2850,Men,Apparel,Topwear,Tshirts,Grey,Summer,2011.0,Casual,Mr.Men Men's Mr.Funny Grey T-shirt,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...
22034,34219,Men,Apparel,Topwear,Tshirts,Sea Green,Summer,2013.0,Casual,Proline Sea Green Polo T-shirt,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...


In [16]:
""" Dict Object for Index Mapping in elasticsearch """
indexName = "all_products"

indexMapping = {
    "properties":{
        "ProductID":{
            "type":"long"
        },
        "ProductName":{
            "type":"text"
        },
        "ProductBrand":{
            "type":"text"
        },
        "Gender":{
            "type":"text"
        },
        "Price (INR)":{
            "type":"long"
        },
        "NumImages":{
            "type":"long"
        },
        "Description":{
            "type":"text"
        },
        "PrimaryColor":{
            "type":"text"
        },
        "DescriptionVector":{
            "type":"dense_vector",
            "dims": 768,
            "index":True,
            "similarity": "cosine"
        }

    }
}

In [8]:
""" Fine-Tune with product data
1) load pretrained tokenizer, model
"""
max_len = 512
model_name = 'sentence-transformers/paraphrase-MiniLM-L6-v2'   # UPKLAB 거기꺼 안쓰네
tokenizer = AutoTokenizer.from_pretrained(model_name)

model_cfg = AutoConfig.from_pretrained(model_name)
model = AutoModel.from_pretrained(
    model_name,
    config=model_cfg
)

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [ ]:
def tokenizing(text: str, padding: bool or str = 'max_length') -> Any:
    """ Preprocess text for LLM Input, for common batch system

    Args:
        cfg: configuration.CFG, needed to load tokenizer from Huggingface AutoTokenizer
        text: text from dataframe or any other dataset, please pass str type
        padding: padding options, default 'max_length', if you want use smart batching, init this param to False
    """
    inputs = tokenizer.encode_plus(
        text,
        max_length=max_len,
        padding=padding,
        truncation=True,
        return_tensors=None,
        add_special_tokens=False,  # later, we will add ourselves
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v)
    return inputs

class ProductSearchDataset(Dataset):
    """ Custom Dataset for Pretraining Task in NLP, such as MLM, CLM, ... etc
    """
    def __init__(self, inputs: pd.DataFrame) -> None:
        self.inputs = inputs
        self.ids = self.inputs.id

    def __len__(self) -> int:
        return len(self.inputs)

    def __getitem__(self, item: int) -> Dict[str, Tensor]:
        prompts = '' + tokenizer.cls_token
        for col in self.inputs.columns:
            prompts += f"{self.inputs[col][item]}" + tokenizer.sep_token
        
        batch_prompt = tokenizing(prompts, padding=False)
        for k, v in batch_prompt.items():
            batch_prompt[k] = torch.as_tensor(v)
        return batch_prompt

In [12]:
""" Model class for Fine-Tuning Pretrained Model for Semantic Search """


class MeanPooling(nn.Module):
    """ Module for pure mean pooling """
    def __init__(self, auto_cfg):
        super(MeanPooling, self).__init__()

    @staticmethod
    def forward(last_hidden_state: Tensor, attention_mask: Tensor) -> Tensor:
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)  # if lower than threshold, replace value to threshold (parameter min)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings


class SemanticSearchModel(nn.Module):
    """ Model for Semantic Search
    """
    def __init__(self):
        super(SemanticSearchModel, self).__init__()
        self.model = model
        self.mean_pooling = MeanPooling(model_cfg)
    
    def forward(self, inputs: Dict[str, Tensor]) -> Tensor:
        hidden_states = self.model(**inputs).last_hidden_state
        h = self.mean_pooling(hidden_states, inputs['attention_mask'])
        return h

In [14]:
""" Just project the text to model hidden state dimension """

def search(input_query: str):
    h = model.encode(input_query)
    query = {
        "field": "DescriptionVector",
        "query_vector": h,
        "k": 30,
        "num_candidates": 500
    }
    
    candidates = es.knn_search(
        index="all_products",
        knn=query,
        source=['"ProductName', 'Description'] 
    )
    results = candidates['hits']['hits']
    return results

def main():
    streamlit.title("Search Fashion Products")
    query = streamlit.text_input("Enter your query here")
    if streamlit.button("Search"):
        if query:
            results = search(query)
            streamlit.subheader("Search Results")
            for result in results:
                with streamlit.container():
                    if '_source' in result:
                        try:
                            streamlit.header(f"{result['_source']['ProductName']}")
                        except Exception as e:
                            print(e)
                        try:
                            streamlit.write(result['_source']['Description'])
                        except Exception as e:
                            print(e)
                    
                    streamlit.divider()
main()