In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Relative locations of the train/test image folders and their CSV files
# under ../input/shopee-product-matching.
train_dir_path = '../input/shopee-product-matching/train_images'
train_csv_path = '../input/shopee-product-matching/train.csv'
test_dir_path = '../input/shopee-product-matching/test_images'
test_csv_path = '../input/shopee-product-matching/test.csv'

In [None]:
# Read the training CSV and preview its first ten rows.
train_csv = pd.read_csv(train_csv_path)
train_csv.head(n=10)

In [None]:
# Read the test CSV into a DataFrame.
test_csv = pd.read_csv(test_csv_path)

In [None]:
# Report the number of rows in each split; the second line is the test set.
print("Total Training dataset size: ", len(train_csv))
print("Total Test dataset size: ", len(test_csv))

In [None]:
import matplotlib.pyplot as plt
from matplotlib.image import imread
import plotly
import random

In [None]:
def plot_random_images(directory,number_of_images):
    """Show a random selection of images from ``directory`` on a square grid.

    Parameters
    ----------
    directory : str
        Folder to sample from; every listed entry is passed to ``imread``,
        so it is expected to contain only readable image files.
    number_of_images : int
        How many images to draw. If the folder holds fewer entries, all of
        them are shown.

    Notes
    -----
    Up to nine images are placed on a 3x3 grid. For larger requests the grid
    grows to the smallest square that gives every image its own cell.
    """
    import math  # local import: only this function needs it

    image_paths = os.listdir(directory)
    random.shuffle(image_paths)
    total_images = image_paths[:number_of_images]
    # Never smaller than 3x3; the three-digit subplot shorthand (e.g. 331)
    # cannot address more than nine cells, so pass rows/cols/index explicitly.
    grid_side = max(3, math.ceil(math.sqrt(len(total_images))))
    for position, file_name in enumerate(total_images, start=1):
        plt.subplot(grid_side, grid_side, position)
        image = imread(os.path.join(directory, file_name))
        plt.imshow(image)
    plt.show()

In [None]:
# Preview nine randomly chosen training images.
plot_random_images(directory=train_dir_path, number_of_images=9)

In [None]:
def images_with_same_label_groups(data,label,directory=train_dir_path):
    """Display up to nine images whose ``label_group`` equals ``label``.

    ``data`` must provide ``label_group`` and ``image`` columns; each
    ``image`` value is joined onto ``directory`` to locate the file.
    The images are drawn on a 3x3 grid.
    """
    matching_rows = data[data['label_group']==label]
    file_names = matching_rows['image'][:9].values
    for slot, file_name in enumerate(file_names, start=1):
        # 330 + slot is matplotlib's three-digit shorthand for (3, 3, slot).
        plt.subplot(330 + slot)
        full_path = os.path.join(directory, file_name)
        plt.imshow(imread(full_path))
    plt.show()

In [None]:
from collections import Counter 
def most_frequent_label_groups(data,k=10):
    """Return the ``k`` most common ``label_group`` values with their counts.

    The result is a list of ``(label_group, count)`` tuples ordered from the
    highest count to the lowest; equal counts keep first-seen order.
    """
    occurrences = Counter(data['label_group'])
    ranked = sorted(
        occurrences.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:k]

In [None]:
# List the ten largest label groups (default k) with their sizes.
most_frequent_label_groups(data=train_csv)

In [None]:
# Show up to nine training images belonging to label group 1141798720.
images_with_same_label_groups(data=train_csv, label=1141798720, directory=train_dir_path)

In [None]:
# Show up to nine training images belonging to label group 3113678103.
images_with_same_label_groups(data=train_csv, label=3113678103, directory=train_dir_path)

In [None]:
# Show up to nine training images belonging to label group 3627744656.
images_with_same_label_groups(data=train_csv, label=3627744656, directory=train_dir_path)

In [None]:
def images_with_same_titles(data,title,directory=train_dir_path):
    """Display up to nine images whose ``title`` is exactly ``title``.

    ``data`` must provide ``title`` and ``image`` columns; each ``image``
    value is joined onto ``directory`` to locate the file. The images are
    drawn on a 3x3 grid.
    """
    matching_rows = data[data['title']==title]
    file_names = matching_rows['image'][:9].values
    for slot, file_name in enumerate(file_names, start=1):
        # 330 + slot is matplotlib's three-digit shorthand for (3, 3, slot).
        plt.subplot(330 + slot)
        full_path = os.path.join(directory, file_name)
        plt.imshow(imread(full_path))
    plt.show()

In [None]:
def most_frequent_titles(data,k=10):
    """Return the ``k`` most common ``title`` values with their counts.

    The result is a list of ``(title, count)`` tuples ordered from the
    highest count to the lowest; equal counts keep first-seen order.
    """
    occurrences = Counter(data['title'])
    ranked = sorted(
        occurrences.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:k]

In [None]:
# Rank titles, not label groups: the entries are later compared against the
# 'title' column, so they must be (title, count) pairs.
top_k_titles = most_frequent_titles(train_csv)
top_k_titles

In [None]:
def plot_same_title_images(data,top_titles):
    """Plot one image grid per entry of ``top_titles``.

    ``top_titles`` is an iterable of two-item pairs; the first item is the
    title passed to ``images_with_same_titles`` and the second is ignored.
    """
    for title_text, _count in top_titles:
        images_with_same_titles(data, title_text)

In [None]:
# Draw an image grid for each entry collected in top_k_titles.
plot_same_title_images(data=train_csv, top_titles=top_k_titles)

In [None]:
from wordcloud import WordCloud, STOPWORDS
def create_title_word_cloud(data):
    """Render a word cloud built from every value in ``data.title``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``title`` column; missing titles are skipped.
    """
    # Join the titles explicitly. str() on a large NumPy array produces the
    # abbreviated repr (first and last few items around "..."), which would
    # silently drop almost every title from the cloud.
    text = " ".join(data.title.dropna().astype(str))
    wordcloud = WordCloud(
        width = 3000,
        height = 2000,
        background_color = 'black',
        stopwords = STOPWORDS).generate(text)
    plt.figure(
        figsize = (40, 30),
        facecolor = 'k',
        edgecolor = 'k')
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# Word cloud over the training titles.
create_title_word_cloud(data=train_csv)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns

def generate_top_k_n_grams(data,k,n):
    """Plot a bar chart of the ``k`` most frequent n-grams in ``data['title']``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``title`` column of strings.
    k : int
        Number of top n-grams to plot.
    n : tuple of (int, int)
        Passed to ``CountVectorizer`` as ``ngram_range``,
        e.g. ``(1, 1)`` for unigrams or ``(2, 2)`` for bigrams.
    """
    text = data['title'].values
    word_vectorizer = CountVectorizer(ngram_range=n)
    sparse_matrix = word_vectorizer.fit_transform(text)
    # Let the sparse matrix compute its own column sums; built-in sum() would
    # add the rows together one at a time.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; keep a fallback so
    # older installations still work.
    if hasattr(word_vectorizer, 'get_feature_names_out'):
        vocabulary = word_vectorizer.get_feature_names_out()
    else:
        vocabulary = word_vectorizer.get_feature_names()
    freq_df = pd.DataFrame({
        'words': vocabulary,
        'frequency': frequencies
    })
    freq_df_sorted = freq_df.sort_values('frequency',ascending=False)[:k]
    sns.barplot(x = 'words', y = 'frequency',data = freq_df_sorted,
            palette = 'hls',
    )
    plt.xticks(rotation=45,fontsize=13)
    plt.yticks(fontsize=13)
    plt.show()

In [None]:
# Ten most frequent single words in the training titles.
generate_top_k_n_grams(data=train_csv, k=10, n=(1, 1))

In [None]:
# Ten most frequent two-word sequences in the training titles.
generate_top_k_n_grams(data=train_csv, k=10, n=(2, 2))