I am a newbie to Image Classification with ML. This analysis will help me figure out which model architecture should be picked.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from PIL import Image
import subprocess

import plotly.express as px
import random
import matplotlib.pyplot as plt

In [None]:
# Load the training metadata CSV. Later cells read its "image", "title" and
# "label_group" columns.
train = pd.read_csv("../input/shopee-product-matching/train.csv")
train.head()

In [None]:
# adapted from https://www.kaggle.com/isaienkov/shopee-data-understanding-and-analysis

In [None]:
def plot_images(images_number):
    """Plot a random sample of training images in a near-square grid.

    Draws ``images_number`` file names from ``train['image']``, loads each
    file from the competition's ``train_images`` folder and shows it with the
    file name as its title.

    Args:
        images_number: How many images to sample and display.
    """
    image_dir = Path('../input/shopee-product-matching/train_images/')
    plot_list = train['image'].sample(n=images_number).tolist()

    # Side of the smallest square grid that fits every image. It has to be a
    # plain int: np.sqrt returns a float (4.0 for 16 images) and recent
    # Matplotlib versions reject float row/column counts in plt.subplot.
    size = int(np.ceil(np.sqrt(images_number)))

    plt.figure(figsize=(20, 20))

    for ind, image_id in enumerate(plot_list, start=1):
        plt.subplot(size, size, ind)
        # Load with Pillow, which this notebook already imports (cv2 is never
        # imported). Pillow yields RGB directly, so no BGR->RGB swap is needed.
        with Image.open(image_dir / image_id) as img:
            image = img.convert("RGB")

        plt.imshow(image)
        plt.title(image_id, fontsize=12)
        plt.axis("off")
    plt.show()

In [None]:
# Show 16 randomly sampled training images.
plot_images(16)

In [None]:
# Dataset size versus the number of distinct label groups.
n_ex = train.shape[0]
n_labels = train["label_group"].unique().size
print(f'Num of training examples: {n_ex}\nNum of labels: {n_labels}')

In [None]:
# Histogram of label-group sizes (log-scaled y axis).
# Build a two-column frame: the group id as a string and how many rows it has.
df_cnt = (
    train["label_group"]
    .value_counts()
    .rename_axis("group_name")
    .reset_index(name="num_of_objects")
    .astype({"group_name": str})
)

fig = px.histogram(data_frame=df_cnt, x="num_of_objects", log_y=True, nbins=10)
fig.show()

In [None]:
# read the descriptions for some groups
def plot_groups(list_large_groups, images_number=16):
    """Plot images and titles from one randomly chosen label group.

    Picks one group id from ``list_large_groups``, samples up to
    ``images_number`` of its rows from ``train`` and shows each image in a
    near-square grid with the listing title above it.

    Args:
        list_large_groups: Candidate ``label_group`` ids to choose from.
        images_number: Maximum number of images to show. If the chosen group
            has fewer rows, all of them are shown.

    Raises:
        ValueError: If ``list_large_groups`` is empty.
    """
    if not list_large_groups:
        raise ValueError("list_large_groups is empty; no group to plot")

    image_dir = Path('../input/shopee-product-matching/train_images/')
    group_id = random.choice(list_large_groups)
    group_rows = train[train["label_group"] == group_id][['image', 'title']]

    # Never ask for more rows than the group has; DataFrame.sample raises
    # when n exceeds the population (without replacement).
    n_shown = min(images_number, len(group_rows))
    plot_list = group_rows.sample(n=n_shown)

    # Side of the smallest square grid that fits every image. It has to be a
    # plain int: np.sqrt returns a float (4.0 for 16 images) and recent
    # Matplotlib versions reject float row/column counts in plt.subplot.
    size = int(np.ceil(np.sqrt(n_shown)))

    plt.figure(figsize=(20, 20))

    for ind, (_, item) in enumerate(plot_list.iterrows(), start=1):
        plt.subplot(size, size, ind)
        # Load with Pillow, which this notebook already imports (cv2 is never
        # imported). Pillow yields RGB directly, so no BGR->RGB swap is needed.
        with Image.open(image_dir / item["image"]) as img:
            image = img.convert("RGB")

        plt.imshow(image)
        plt.title(item["title"], fontsize=12)
        plt.axis("off")
    plt.show()

In [None]:
# Label groups that contain more than 20 listings.
# The filter is applied to the value_counts Series itself rather than to the
# reset_index() frame: that frame's column names depend on the pandas version
# ("index"/"label_group" before 2.0, "label_group"/"count" from 2.0 on), so
# selecting by column name breaks on upgrade.
group_sizes = train["label_group"].value_counts()
df_g = group_sizes.reset_index()
list_large_groups = group_sizes[group_sizes > 20].index.tolist()

In [None]:
# Plot 16 images (with their titles) from one randomly chosen large label group.
plot_groups(list_large_groups, images_number=16)

## Follow up:
### Image model
1. Should be number invariant
2. Background invariant
3. Some photos could have face of models
4. Model can perform better if it can identify text from the images
5. Scale invariant

### Text model
1. Identify the language

## Identifying the language.

Let's start with something easy :)
I have referred to this blog: [link](https://amitness.com/2019/07/identify-text-language-python/)

In [None]:
# Download the pretrained fastText model file (lid.176.bin) to /tmp; the next
# cell loads it from that path for language identification.
!wget -O /tmp/lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

In [None]:
import fasttext
from pycountry import languages

# Local path of the model file fetched by the wget cell.
PRETRAINED_MODEL_PATH = '/tmp/lid.176.bin'
# Load the pretrained fastText model from disk.
model = fasttext.load_model(PRETRAINED_MODEL_PATH)

In [None]:
# Run the language-identification model over every training title in one call.
sentences = train['title'].tolist()
# The result is indexed below as predictions[0] (labels) and predictions[1]
# (scores), with one entry per title.
predictions = model.predict(sentences)

In [None]:
# Keep only titles whose top prediction scores above 0.5, strip the
# "__label__" prefix to get the bare language code, then map each code to a
# readable language name ("none" when pycountry has no match for the code).
confident_labels = (
    labels[0]
    for labels, scores in zip(*predictions)
    if scores[0] > 0.5
)
language_code = [label.split("__label__")[1] for label in confident_labels]
language_name = [
    lang.name if lang else "none"
    for lang in (languages.get(alpha_2=code) for code in language_code)
]

In [None]:
from collections import Counter
# Tally the detected language names and list them from most to least frequent.
cnt_lang = Counter(language_name)
cnt_lang.most_common()


In [None]:
# I could cross-verify the model predictions if I knew any language in this list other than English.
# I have seen some titles in English. For the other languages, I will trust that the model is right.

### Follow up:
1. I guess it will be okay to use English pretrained model. Even if the title is not in English, some of the product names could be in English.
2. Other option could be character based NLP model.


### Baseline models using titles

Use BERT to get sentence embeddings for the titles, then cluster the embeddings.

Adapted from this [notebook](https://www.kaggle.com/brendanhasz/bert-in-kernels)


In [None]:
# Pin TensorFlow to 1.15 before installing bert-as-service.
# NOTE(review): presumably bert-serving-server 1.10.0 needs TF 1.x — confirm.
!pip install tensorflow==1.15
# Install bert-as-service (server and client pinned to the same version)
!pip install bert-serving-server==1.10.0
!pip install bert-serving-client==1.10.0
# Download and unzip the pre-trained model (uncased_L-12_H-768_A-12); the
# server cell below points -model_dir at the unzipped folder
!wget http://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

In [None]:
# Start the BERT server as a background process
bert_command = 'bert-serving-start -model_dir /kaggle/working/uncased_L-12_H-768_A-12'
# Discard the server's stdout instead of piping it: nothing in this notebook
# reads the pipe, and a child process blocks once an unread OS pipe buffer
# fills up.
process = subprocess.Popen(bert_command.split(), stdout=subprocess.DEVNULL)
# Start the BERT client that talks to the server started above
from bert_serving.client import BertClient
bc = BertClient()

In [None]:
# Read the test metadata CSV. Later cells use its "title" and "posting_id"
# columns.
df_test = pd.read_csv("../input/shopee-product-matching/test.csv")
df_test.head()

In [None]:
# Number of rows in the test set.
len(df_test)

In [None]:
# Send every test title to the BERT server for encoding.
titles = df_test['title'].tolist()
# NOTE(review): presumably one embedding row per title, in input order —
# confirm against the bert-as-service client docs.
embeddings = bc.encode(titles)

In [None]:
## Clustering with DBSCAN
from sklearn.cluster import DBSCAN

In [None]:
# Cluster the title embeddings with DBSCAN. fit_predict returns one cluster
# label per sample; the label -1 is treated as "no cluster" by
# cluster_label_to_submission below.
# NOTE(review): eps=3 and min_samples=2 look hand-picked — confirm they were tuned.
clustering = DBSCAN(eps=3, min_samples=2).fit_predict(embeddings)

In [None]:
def cluster_label_to_submission(labels, df_test):
    """Turn per-row cluster labels into a posting_id / matches table.

    Rows that share a cluster label are each other's matches; a row labelled
    ``-1`` (no cluster) matches only itself.

    Args:
        labels: Sequence of cluster labels, one per row of ``df_test`` in row
            order. ``-1`` marks a row that belongs to no cluster.
        df_test: DataFrame with a ``posting_id`` column of strings. Rows are
            paired with ``labels`` by position, so any index is accepted.

    Returns:
        DataFrame with columns ``posting_id`` and ``matches``, where
        ``matches`` is the space-separated posting ids of every row in the
        same cluster (in row order), or the row's own id for label ``-1``.

    Raises:
        ValueError: If ``labels`` and ``df_test`` differ in length.
    """
    posting_ids = df_test["posting_id"].tolist()
    if len(labels) != len(posting_ids):
        raise ValueError(
            f"got {len(labels)} labels for {len(posting_ids)} rows"
        )

    # Single pass: collect the posting ids of each cluster in row order.
    members = {}
    for posting_id, label in zip(posting_ids, labels):
        if label != -1:
            members.setdefault(label, []).append(posting_id)

    # Join each cluster once so every member shares the same match string.
    matches_by_label = {
        label: " ".join(ids) for label, ids in members.items()
    }

    submission = [
        [posting_id, posting_id if label == -1 else matches_by_label[label]]
        for posting_id, label in zip(posting_ids, labels)
    ]
    return pd.DataFrame(submission, columns=["posting_id", "matches"])

In [None]:
# Convert the DBSCAN cluster labels into the posting_id / matches table.
df_submission = cluster_label_to_submission(clustering, df_test)

In [None]:
# Write only the posting_id and matches columns. Without index=False, to_csv
# prepends the DataFrame index as an extra unnamed column.
df_submission.to_csv("./submission.csv", index=False)