<h1><center>Shopee - Data understanding and analysis</center></h1>

<center><img src="https://klgadgetguy.com/wp-content/uploads/2018/10/6ce1f4f6d79353c5f24ee047a5132d77.jpg"></center>

<div class="list-group" id="list-tab" role="tablist">
<h2 class="list-group-item list-group-item-action active" data-toggle="list" style='background:orange; border:0; color:white' role="tab" aria-controls="home"><center>Quick navigation</center></h2>

* [1. Training set](#1)
* [2. Images visualization](#2)
* [3. Titles analysis](#3)
* [4. Test set](#4)
* [5. Simple model for the same titles](#5)

In [None]:
import numpy as np
import pandas as pd 

import os
import plotly.express as px

import matplotlib.pyplot as plt
import cv2
from wordcloud import WordCloud, STOPWORDS

<a id="1"></a>
<h2 style='background:orange; border:0; color:white'><center>1. Training set</center></h2>

**[train/test].csv** - the training set metadata. Each row contains the data for a single posting. Multiple postings might have the exact same image ID, but with different titles or vice versa.

* posting_id - the ID code for the posting.

* image - the image id/md5sum.

* image_phash - a perceptual hash of the image.

* title - the product description for the posting.

* label_group - ID code for all postings that map to the same product. Not provided for the test set.

In [None]:
# Load the training metadata CSV into a DataFrame and display it.
train = pd.read_csv('/kaggle/input/shopee-product-matching/train.csv')
train

#### Let's check products that are in the same group.

In [None]:
# Number of postings per label_group, one row per group.
ds = train['label_group'].value_counts().reset_index()
ds.columns = ['label_group', 'products_count']
# Ids are cast to str, presumably so plotly draws them as categories
# instead of positions on a numeric axis.
ds['label_group'] = ds['label_group'].astype(str)
# Ascending sort puts the largest groups last, so tail(50) picks the top 50.
ds = ds.sort_values(['products_count'])

fig = px.bar(
    ds.tail(50),
    x="products_count",
    y="label_group",
    orientation='h',
    # The title now agrees with the 50 rows that are actually plotted.
    title='Top 50 groups by number of products',
    width=800,
    height=1000
)

fig.show()

<a id="2"></a>
<h2 style='background:orange; border:0; color:white'><center>2. Images visualization</center></h2>

In [None]:
def plot_images(images_number):
    """Plot a random sample of training images on a square grid.

    Args:
        images_number: how many image ids to draw at random from
            ``train['image']``.

    Raises:
        FileNotFoundError: if an image file cannot be read from disk.
    """
    plot_list = train['image'].sample(n=images_number).tolist()
    # Side of the smallest square grid that fits every image. The int cast is
    # required: np.sqrt returns a float even for perfect squares (16 -> 4.0)
    # and plt.subplot needs integer row/column counts.
    size = int(np.ceil(np.sqrt(images_number)))

    plt.figure(figsize=(20, 20))

    for ind, image_id in enumerate(plot_list):
        image_path = os.path.join('../input/shopee-product-matching/train_images/', image_id)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # fail with a clear message instead of a cryptic cvtColor error.
            raise FileNotFoundError('Could not read image: ' + image_path)

        plt.subplot(size, size, ind + 1)
        # OpenCV loads pixels as BGR; convert to RGB before handing to matplotlib.
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title(image_id, fontsize=12)
        plt.axis("off")
    plt.show()

In [None]:
# Show 16 randomly sampled training images.
plot_images(16)

#### Let's see products from the largest groups.

In [None]:
def plot_images(group):
    """Plot every training image that belongs to one label group.

    Note: this definition reuses the name ``plot_images`` and therefore
    replaces the random-sample version defined earlier in the notebook.

    Args:
        group: value compared against ``train['label_group']``.

    Raises:
        FileNotFoundError: if an image file cannot be read from disk.
    """
    plot_list = train.loc[train['label_group'] == group, 'image'].tolist()
    images_number = len(plot_list)
    # Side of the smallest square grid that fits every image. The int cast is
    # required: np.sqrt returns a float even for perfect squares (9 -> 3.0)
    # and plt.subplot needs integer row/column counts.
    size = int(np.ceil(np.sqrt(images_number)))

    plt.figure(figsize=(20, 20))

    for ind, image_id in enumerate(plot_list):
        image_path = os.path.join('../input/shopee-product-matching/train_images/', image_id)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # fail with a clear message instead of a cryptic cvtColor error.
            raise FileNotFoundError('Could not read image: ' + image_path)

        plt.subplot(size, size, ind + 1)
        # OpenCV loads pixels as BGR; convert to RGB before handing to matplotlib.
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title(image_id, fontsize=6)
        plt.axis("off")
    plt.show()

#### Product 3627744656

In [None]:
# Show every image whose label_group is 3627744656.
plot_images(3627744656)

In [None]:
# Rows of this label group, then its size next to the number of distinct titles.
sample = train.loc[train['label_group'] == 3627744656]
print(
    'Total number of items in group 3627744656: ' + str(len(sample))
    + ', number of unique titles: ' + str(sample['title'].nunique())
)

#### Product 1163569239

In [None]:
# Show every image whose label_group is 1163569239.
plot_images(1163569239)

In [None]:
# Rows of this label group, then its size next to the number of distinct titles.
sample = train.loc[train['label_group'] == 1163569239]
print(
    'Total number of items in group 1163569239: ' + str(len(sample))
    + ', number of unique titles: ' + str(sample['title'].nunique())
)

#### Product 994676122

In [None]:
# Show every image whose label_group is 994676122.
plot_images(994676122)

In [None]:
# Rows of this label group, then its size next to the number of distinct titles.
sample = train.loc[train['label_group'] == 994676122]
print(
    'Total number of items in group 994676122: ' + str(len(sample))
    + ', number of unique titles: ' + str(sample['title'].nunique())
)

#### Product 3113678103

In [None]:
# Show every image whose label_group is 3113678103.
plot_images(3113678103)

In [None]:
# Rows of this label group, then its size next to the number of distinct titles.
sample = train.loc[train['label_group'] == 3113678103]
print(
    'Total number of items in group 3113678103: ' + str(len(sample))
    + ', number of unique titles: ' + str(sample['title'].nunique())
)

#### Product 1141798720

In [None]:
# Show every image whose label_group is 1141798720.
plot_images(1141798720)

In [None]:
# Rows of this label group, then its size next to the number of distinct titles.
sample = train.loc[train['label_group'] == 1141798720]
print(
    'Total number of items in group 1141798720: ' + str(len(sample))
    + ', number of unique titles: ' + str(sample['title'].nunique())
)

## So we can see a lot of exact duplicates here and some samples even have the same titles.

#### Now let's check the smallest groups.

#### Product 2329453736	

In [None]:
# Show every image whose label_group is 2329453736.
plot_images(2329453736)

#### Product 1591152872	

In [None]:
# Show every image whose label_group is 1591152872.
plot_images(1591152872)

#### Product 4057376991	

In [None]:
# Show every image whose label_group is 4057376991.
plot_images(4057376991)

In [None]:
# Number of postings per image_phash, one row per distinct hash.
ds = train['image_phash'].value_counts().reset_index()
ds.columns = ['image_phash', 'products_count']
ds['image_phash'] = ds['image_phash'].astype(str)
# Ascending sort puts the most frequent hashes last, so tail(50) picks the top 50.
ds = ds.sort_values(['products_count'])

fig = px.bar(
    ds.tail(50),
    x="products_count",
    y="image_phash",
    orientation='h',
    # The title now agrees with the 50 rows that are actually plotted.
    title='Top 50 image_phash by number of products',
    width=800,
    height=1000
)

fig.show()

In [None]:
def plot_images(group):
    """Plot every training image that shares one perceptual hash.

    Note: this definition reuses the name ``plot_images`` and therefore
    replaces the versions defined earlier in the notebook.

    Args:
        group: value compared against ``train['image_phash']``.

    Raises:
        FileNotFoundError: if an image file cannot be read from disk.
    """
    plot_list = train.loc[train['image_phash'] == group, 'image'].tolist()
    images_number = len(plot_list)
    # Side of the smallest square grid that fits every image. The int cast is
    # required: np.sqrt returns a float even for perfect squares (9 -> 3.0)
    # and plt.subplot needs integer row/column counts.
    size = int(np.ceil(np.sqrt(images_number)))

    plt.figure(figsize=(20, 20))

    for ind, image_id in enumerate(plot_list):
        image_path = os.path.join('../input/shopee-product-matching/train_images/', image_id)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # fail with a clear message instead of a cryptic cvtColor error.
            raise FileNotFoundError('Could not read image: ' + image_path)

        plt.subplot(size, size, ind + 1)
        # OpenCV loads pixels as BGR; convert to RGB before handing to matplotlib.
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title(image_id, fontsize=6)
        plt.axis("off")
    plt.show()

#### fad28daa2ad05595

In [None]:
# Show every image whose image_phash is fad28daa2ad05595.
plot_images('fad28daa2ad05595')

#### d0c0ea37bd9acce0

In [None]:
# Show every image whose image_phash is d0c0ea37bd9acce0.
plot_images('d0c0ea37bd9acce0')

#### be12e12f9ec1e198

In [None]:
# Show every image whose image_phash is be12e12f9ec1e198.
plot_images('be12e12f9ec1e198')

#### f6d98134b904b56b

In [None]:
# Show every image whose image_phash is f6d98134b904b56b.
plot_images('f6d98134b904b56b')

<a id="3"></a>
<h2 style='background:orange; border:0; color:white'><center>3. Titles analysis</center></h2>

In [None]:
# Add the length of every title as a new column on train, then display the frame.
# Note: this mutates `train` in place, so the earlier display of `train` lacks this column.
train['title_len'] = train['title'].str.len()
train

In [None]:
# Histogram of the title_len column of train.
fig = px.histogram(train, x="title_len", title='Title length distribution', width=800, height=500)
fig.show()

In [None]:
def build_wordcloud(df, title):
    """Render a word cloud for the given text under a figure title.

    Args:
        df: a pandas Series of strings (e.g. ``train['title']``). Any other
            object is converted with ``str()``, as before.
        title: text used as the figure's suptitle.
    """
    # str(Series) is only the abbreviated display repr for a long Series (a few
    # rows, "..." and the Name/Length/dtype footer), so the cloud would be built
    # from a handful of titles plus repr noise. Join the real values instead.
    if isinstance(df, pd.Series):
        text = ' '.join(df.dropna().astype(str))
    else:
        text = str(df)

    wordcloud = WordCloud(
        background_color='black',
        stopwords=set(STOPWORDS),
        max_words=50,
        max_font_size=40,
        random_state=666  # fixed seed keeps the layout the same between runs
    ).generate(text)

    fig = plt.figure(1, figsize=(14,14))
    plt.axis('off')
    fig.suptitle(title, fontsize=16)
    fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
# Build word scores from every title. str(train['title']) would only be the
# abbreviated Series repr (a few rows plus the Name/Length/dtype footer), so the
# actual title strings are joined into one text instead.
wordcloud = WordCloud(
    background_color='black',
    stopwords=set(STOPWORDS),
    max_words=100,
    max_font_size=40,
    random_state=666
).generate(' '.join(train['title'].dropna().astype(str)))

In [None]:
# One row per (word, score) pair taken from the fitted word cloud.
ds = pd.DataFrame(list(wordcloud.words_.items()), columns=['word', 'score'])
ds['word'] = ds['word'].astype(str)
# Ascending sort puts the highest scores last, so tail(50) picks the top 50.
ds = ds.sort_values(['score'])

fig = px.bar(
    ds.tail(50),
    orientation='h',
    x="score",
    y="word",
    title='Top 50 words in titles',
    height=1000,
    width=800,
)

fig.show()

In [None]:
# Draw the word cloud for the training titles.
build_wordcloud(train['title'], 'Wordcloud for train set titles')

<a id="4"></a>
<h2 style='background:orange; border:0; color:white'><center>4. Test set</center></h2>

In [None]:
# Load the test metadata CSV into a DataFrame and display it.
test =  pd.read_csv('/kaggle/input/shopee-product-matching/test.csv')
test

**sample_submission.csv** - a sample submission file in the correct format.

* posting_id - the ID code for the posting.

* matches - Space delimited list of all posting IDs that match this posting. Posts always self-match. Group sizes were capped at 50, so there's no need to predict more than 50 matches.

In [None]:
# Load the sample submission file and display it.
# Note: this rebinds `sample`, which earlier cells used for per-group subsets of train.
sample = pd.read_csv('/kaggle/input/shopee-product-matching/sample_submission.csv')
sample

<a id="5"></a>
<h2 style='background:orange; border:0; color:white'><center>5. Simple model for the same titles</center></h2>

#### Let's assume that records with exactly the same title (or exactly the same image_phash) are the same product

In [None]:
# Distinct titles present in the test set.
check = test.groupby(['title']).count().reset_index()['title'].tolist()
a = []  # posting ids, grouped by title
b = []  # for each id in `a`, the ids of all postings sharing its title
for item in check:
    res = test.loc[test['title'] == item, 'posting_id'].tolist()
    # Every id is followed by one space, including the last one.
    ans = "".join(str(id_item) + " " for id_item in res)
    a.extend(res)
    b.extend([ans] * len(res))

In [None]:
# Title-based matches: one row per posting id collected in `a`, with its match string from `b`.
submission1 = pd.DataFrame({'posting_id': a, 'matches': b})
submission1

In [None]:
# Distinct perceptual hashes present in the test set.
check = test.groupby(['image_phash']).count().reset_index()['image_phash'].tolist()
a = []  # posting ids, grouped by image_phash
b = []  # for each id in `a`, the ids of all postings sharing its hash
for item in check:
    res = test.loc[test['image_phash'] == item, 'posting_id'].tolist()
    # Space-separated ids with no trailing space.
    ans = " ".join(str(id_item) for id_item in res)
    a.extend(res)
    b.extend([ans] * len(res))

In [None]:
# Hash-based matches: one row per posting id collected in `a`, with its match string from `b`.
submission2 = pd.DataFrame({'posting_id': a, 'matches': b})
submission2

In [None]:
# Combine title-based and hash-based matches for every posting present in both frames.
sub = pd.merge(submission1, submission2, on='posting_id', how='inner')
# Join the two id lists with an explicit separator. Plain concatenation only
# works while the left-hand string happens to end with a space; without it the
# last id of one list would fuse with the first id of the other.
sub['list'] = sub['matches_x'].str.strip() + ' ' + sub['matches_y'].str.strip()
sub

In [None]:
# Deduplicate the combined ids of every row. dict.fromkeys keeps first-seen
# order, so the output is identical between runs (iteration order of a set of
# strings is not), and split() without arguments drops empty tokens caused by
# stray or repeated spaces.
final = [' '.join(dict.fromkeys(matches.split())) for matches in sub['list']]

submission = pd.DataFrame()
submission['posting_id'] = sub['posting_id']
submission['matches'] = final
submission

In [None]:
# Write the submission to submission.csv without the DataFrame index column.
submission.to_csv('submission.csv', index=False)