<div style="display: flex; background-color: #3F579F;">
    <h1 style="margin: auto; font-weight: bold; padding: 30px 30px 0px 30px;" align="center">Automatically classify consumer goods - P6</h1>
</div>
<div style="display: flex; background-color: #3F579F; margin: auto; padding: 5px 30px 0px 30px;" >
    <h3 style="width: 100%; text-align: center; float: left; font-size: 24px;" align="center">| Notebook |</h3>
</div>
<div style="display: flex; background-color: #3F579F; margin: auto; padding: 10px 30px 30px 30px;">
    <h4 style="width: 100%; text-align: center; float: left; font-size: 24px;" align="center">Data Scientist course - OpenClassrooms</h4>
</div>

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">1. Libraries and functions</h2>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">1.1. Libraries and functions</h3>
</div>

In [1]:
## General
import ast
import operator
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from tabulate import tabulate
from itertools import islice

%matplotlib inline
sns.set_theme(style="darkgrid")

## Scikit Learn 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import QuantileTransformer, StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import adjusted_rand_score

## Computer vision library
import cv2

## TensorFlow
import tensorflow as tf
from tensorboard.plugins import projector

%load_ext tensorboard

## Own specific functions 
from functions import *
from functions_nlp import *
from functions_img import *

## Images paths
ORIGINAL_IMAGES_PATH = "images/Flipkart/"
THUMBNAILS_IMAGES_PATH  = "images/Flipkart/thumbnails/"
CB_IMAGES_PATH = "images/Flipkart/thumbnails/contrast_and_brightness/" # path images with contrast and brightness edit
GRAY_IMAGES_PATH = CB_IMAGES_PATH + "gray_images/"
NR_IMAGES_PATH = GRAY_IMAGES_PATH + "noise_reduction/"


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Samir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Samir\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
XX

NameError: name 'XX' is not defined

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">2. Importing files and Initial analysis</h2>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">2.1. Importing and preparing files</h3>
</div>

<div class="alert alert-block alert-info">
    We are going to load the dateset resulting from the RFM
</div>

In [None]:
data = pd.read_csv(r"datasets\flipkart_com-ecommerce_sample_1050.csv")

In [None]:
data.head()

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">2.2. Initial analysis</h3>
</div>

In [None]:
df_analysis(data, "data", columns=["uniq_id"], analysis_type="complete")

<div class="alert alert-block alert-info">
    <p>Plotting the percentage of missing values by features</p>
</div>

In [None]:
missing_values = data.isnull().sum(axis=0).sort_values(ascending=False)/len(data.index)*100

fig = plt.figure(figsize=(10, 7))
plot = sns.barplot(x=missing_values.index.tolist(), y=missing_values.values.tolist())
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".1f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=missing_values.index.tolist(), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("%", size=12)
plt.xlabel("Features", size=12)
plt.title("Missing values percentage by feature", size=16)
plt.tight_layout()
plt.savefig("images/text_analysis/missing-values-percentage-by-feature.png")
plt.show()

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <ul>
        <li>There are some features that are not important to our text analysis. For example, pid, uniq_id, etc.</li>
        <li>The missing value percentage is higher only in one feature.</li>
    </ul>
</div>

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">3. Selecting the features to work</h2>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3.1. Analyzing the features</h3>
</div>

In [None]:
pd.set_option("display.max_rows", None) # show full of showing rows
pd.set_option("display.max_columns", None) # show full of showing cols
pd.set_option("display.max_colwidth", None) # show full width of showing cols
pd.set_option("display.float_format", lambda x: "%.5f" % x) # show full content in cell    

In [None]:
data.tail(1)

In [None]:
pd.reset_option("display.max_rows") # reset max of showing rows
pd.reset_option("display.max_columns") # reset max of showing cols
pd.reset_option("display.max_colwidth") # reset width of showing cols
pd.reset_option("display.float_format") # reset show full content in cell

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.1.1. Brand</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's analyze in detail the feature <b>brand</b></p>
</div>

In [None]:
data["brand"].unique()

<div class="alert alert-block alert-warning">
    <p>It seems that it does not add value to the problem. It contains the names of brands and probably it does not add value to classify images</p>
    <p>Despite that, we are going to keep it</p>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.1.2. Retail price</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's analyze in detail the feature <b>brand</b></p>
</div>

In [None]:
boxplot_histogram_qqplot(data["retail_price"], "Retail price")

<div class="alert alert-block alert-warning">
    <p><b>Observations / Conclusions</b></p>
    <p>Retail price doesn't have a normal distribution.</p>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3.2. Selecting the features</h3>
</div>

<div class="alert alert-block alert-warning">
    <p>To select the features, first, we are going to <b>discard features</b> that do not add value to the problem</p>
    <ul style="list-style-type: square;">
        <li><b>crawl_timestamp</b></li>
        <li><b>product_url</b></li>
        <li><b>pid</b></li>
        <li><b>is_FK_Advantage_product</b></li>
        <li><b>product_rating</b></li>
        <li><b>overall_rating</b></li>
        <li><b>discounted_price</b></li>
    </ul> 
</div>

In [None]:
df_data = data.drop(columns=["crawl_timestamp", "product_url", "pid", "is_FK_Advantage_product",
                             "product_rating", "overall_rating", "discounted_price"],
                    axis=0).copy()

<div class="alert alert-block alert-success">
    <p>For now, we are going to keep the following features</p>
    <ul style="list-style-type: square;">
        <li><b>uniq_id</b> - we can use this feature to keep relations on the data</li>
        <li><b>product_name</b></li>
        <li><b>product_category_tree</b></li>
        <li><b>description</b></li>
        <li><b>retail_price</b></li>
        <li><b>brand</b></li>
        <li><b>product_specifications</b></li>        
    </ul> 
</div>

In [None]:
pd.set_option("display.max_rows", None) # show full of showing rows
pd.set_option("display.max_columns", None) # show full of showing cols
pd.set_option("display.max_colwidth", None) # show full width of showing cols
pd.set_option("display.float_format", lambda x: "%.5f" % x) # show full content in cell    

In [None]:
df_data.head(1)

In [None]:
pd.reset_option("display.max_rows") # reset max of showing rows
pd.reset_option("display.max_columns") # reset max of showing cols
pd.reset_option("display.max_colwidth") # reset width of showing cols
pd.reset_option("display.float_format") # reset show full content in cell

In [None]:
df_analysis(df_data, "df_data", type_analysis="complete")


<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>Now, we have a dataset with the following characteristic</p>
    <ul style="list-style-type: square;">
        <li>30% missing-values in <b>brand</b></li>
        <li>One cell with missing-value in <b>retail_price</b></li>
    </ul> 
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.2.1 Continuous Variables Transformation</h4>
</div>

<div class="alert alert-block alert-info">
    <p>We are going to apply Logarithmic transformation on <b>retail_price</b> to get a better Normal distribution</p>
</div>

In [None]:
df_data["retail_price_log"] = np.log(df_data["retail_price"])
df_data["retail_price_log2"] = np.log2(df_data["retail_price"])
df_data["retail_price_log10"] = np.log10(df_data["retail_price"])

In [None]:
boxplot_histogram_qqplot(df_data["retail_price_log"], "Retail price log")

In [None]:
boxplot_histogram_qqplot(df_data["retail_price_log2"], "Retail price log2")

In [None]:
boxplot_histogram_qqplot(df_data["retail_price_log10"], "Retail price log")


<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>Log2 has the higher variance </p>
</div>

<div class="alert alert-block alert-info">
    <p>Deleting others columns related to price</p>
</div>

In [None]:
df_data.drop(columns=["retail_price", "retail_price_log", "retail_price_log10"], inplace=True)

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">4. Treating missing-values</h2>
</div>

<div class="alert alert-block alert-info">
    <p><b>To treat missing-values</b>, we are going to do as follows</p>
    <ol>
        <li>Fill missing-vallues in <b>brand</b> and <b>product_specifications</b> with empty-value.</li>
        <li>Fill missing-vallues in <b>retail_price</b> based on the median for the same <b>product category</b>.</li>
    </ol>
</div>
<div class="alert alert-block alert-info">
    <p>So, Being the feature called <b>brand</b> only one missing-value, let's proceed to complete it with empty-value</p>
</div>

In [None]:
df_data["brand"].fillna("", inplace=True)
df_data["product_specifications"].fillna("", inplace=True)

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">4.1. Analyzing "product_category_tree"</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Let's analyse the levels of the tree in <b>product_category_tree</b></p>
</div>

In [None]:
for i in [0, 10, 100, 1049]:
    print(df_data.loc[i, "product_category_tree"])

<div class="alert alert-block alert-warning">
    <p><b>Observations / Conclusions</b></p>
    <p>The levels in the tree does not seem equals in all records</p>
</div>

In [None]:
df_data["tree_levels"] = df_data["product_category_tree"].str.count(">>")

In [None]:
df_data["tree_levels"].max()

In [None]:
pd.set_option("display.max_rows", None) # show full of showing rows
pd.set_option("display.max_columns", None) # show full of showing cols
pd.set_option("display.max_colwidth", None) # show full width of showing cols
pd.set_option("display.float_format", lambda x: "%.5f" % x) # show full content in cell    

In [None]:
df_data.query("tree_levels == tree_levels.min()").head(1)

In [None]:
df_data.query("tree_levels == tree_levels.max()").head(1)

In [None]:
pd.reset_option("display.max_rows") # reset max of showing rows
pd.reset_option("display.max_columns") # reset max of showing cols
pd.reset_option("display.max_colwidth") # reset width of showing cols
pd.reset_option("display.float_format") # reset show full content in cell

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>There are from 2 until 6 levels in <b>product_category_tree</b></p>
</div>

<div class="alert alert-block alert-info">
    <p>Let's analyze the 2 first levels that are common in all records</p>
</div>

In [None]:
df_data["category_1"] = df_data["product_category_tree"].apply(lambda x :x.split('>>')[0][2:len(x.split(">>")[0])])
df_data["category_1"] = df_data["category_1"].apply(lambda x :x.strip())

df_data["category_2"] = df_data["product_category_tree"].apply(lambda x :x.split('>>')[1])
df_data["category_2"] = df_data["category_2"].apply(lambda x :x.strip())

In [None]:
df_data.head()

In [None]:
print("Unique categories in level 1:", df_data["category_1"].nunique())
print("Unique categories in level 2:", df_data["category_2"].nunique())

In [None]:
barplot_and_pie(df_data["category_1"], "Categories level 1", "Categories")

<div class="alert alert-block alert-warning">
    <p><b>Observations / Conclusions</b></p>
    <p>Based on the plot, we can ask whether we will get 7 clusters in our modeling? </p>
</div>

<div class="alert alert-block alert-info">
    <p>Let's see the record with <b>retail_price</b> missing-value</p>
</div>

In [None]:
barplot_and_pie(df_data["category_2"], "Categories level 2", "Categories")

In [None]:
df_data[df_data["retail_price_log2"].isna()]

<div class="alert alert-block alert-success">
    <p>Let's get to the mean of the <b>retail_price</b> based on the firsts two categories in the tree</p>
</div>

In [None]:
retail_price_mean = df_data[(df_data["category_1"]=="Baby Care") & (df_data["category_2"]=="Baby Bath & Skin")]["retail_price_log2"].mean()
print("retail_price_mean:", retail_price_mean)

<div class="alert alert-block alert-info">
    <p>Let's replace the missing-value into <b>retail_price</b></p>
</div>

In [None]:
df_data["retail_price_log2"].fillna(retail_price_mean, inplace=True)

<div class="alert alert-block alert-info">
    <p>Let's proceed to delete the categories features added</p>
</div>

In [None]:
df_data.drop(columns=["tree_levels", "category_2"], axis=1, inplace=True)

<div class="alert alert-block alert-info">
    <p>Let's analyze the dataset</p>
</div>

In [None]:
df_analysis(df_data, "df_data", type_analysis="complete")

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>At this point, we have the dataset without missing-values.</p>
    <p>Let's proced to make the normalization.</p>
</div>

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">5. Pre-processing Text data</h2>
</div>

<div class="alert alert-block alert-info">
    <p>We are going to process the following features</p>
    <ol>
        <li>product_name</li>
        <li>product_category_tree</li>
        <li>description</li>
        <li>brand</li>
    </ol>
</div>

In [None]:
text_columns = [
    "product_name", "product_category_tree",
    "description", "brand", "product_specifications"
]

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">5.1. Analyzing the characters in the features</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Before doing the steps above, let's check what type of character we have in the dataset.<br>Then we can use the appropriate tokenizer</p>
</div>

In [None]:
character_type = ["numerical", "special"]

for col in text_columns:
    
    # checking the character in each sample by feature
    df_data[col + "_check"] = df_data[col].apply(lambda x: check_characters(x))
    
    # Normalizing to column, the dict in the sample
    globals()["df_" + col + "_check"] = pd.json_normalize(df_data[col + "_check"])
    
    # Counting the characters uniques by feature
    for val in character_type:
        globals()["dict_" + col + "_" + val] = {}

        for i in globals()["df_" + col + "_check"][val]:
            for j in i:
                if j not in globals()["dict_" + col + "_" + val]:
                    globals()["dict_" + col + "_" + val][j] = 1
                else:
                    globals()["dict_" + col + "_" + val][j] += 1
                    
        # Sorting dict by value descending
        globals()["dict_" + col + "_" + val] = sorted(globals()["dict_" + col + "_" + val].items(),
                                                      key=operator.itemgetter(1), reverse=True)
        globals()["dict_" + col + "_" + val] = {k:v for k, v in globals()["dict_" + col + "_" + val]}
        
        # Creating dataset based on the dict
        if val == "special":
            globals()["df_" + col + "_" + val] = pd.DataFrame({"character" : list(globals()["dict_" + col + "_" + val].keys()),
                                                              "number" : list(globals()["dict_" + col + "_" + val].values())})

<div class="alert alert-block alert-info">
    <p>Plotting the special characters in <b>description</b></p>
</div>

In [None]:
fig = plt.figure(figsize=(10, 7))
plot = sns.barplot(x=df_description_special["character"], y=df_description_special["number"])
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".1f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plt.ylabel("Number", size=12)
plt.xlabel("character", size=12)
plt.title("Special characters in \"description\"", size=16)
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-info">
    <p>Plotting the special characters in <b>product_category_tree</b></p>
</div>

In [None]:
fig = plt.figure(figsize=(10, 7))
plot = sns.barplot(x=df_product_category_tree_special["character"], y=df_product_category_tree_special["number"])
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".1f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plt.ylabel("Number", size=12)
plt.xlabel("character", size=12)
plt.title("Special characters in \"product_category_tree\"", size=16)
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-info">
    <p>Plotting the special characters in <b>product_specifications</b></p>
</div>

In [None]:
fig = plt.figure(figsize=(10, 7))
plot = sns.barplot(x=df_product_specifications_special["character"], y=df_product_specifications_special["number"])
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".1f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plt.ylabel("Number", size=12)
plt.xlabel("character", size=12)
plt.title("Special characters in \"product_specifications\"", size=16)
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-info">
    <p>Printing the digit characters in <b>product_name</b>, <b>product_category_tree</b>, <b>description</b>, <b>brand</b></p>
</div>

In [None]:
for col in text_columns:
    print(col + " has " + str(len(globals()["dict_" + col + "_numerical"])) + " digit used")

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <ul style="list-style-type: square;">
        <li>The 4 features have used numerical characters.</li>
        <li>It seems there are contractions in the text so, it is necessary to check this point.</li>
    </ul>
</div>

<div class="alert alert-block alert-info">
    <p>Deleting the datasets</p>
</div>

In [None]:
for col in text_columns:
    
    del globals()["df_" + col + "_check"]
    del globals()["df_" + col + "_special"]
    
    del df_data[col + "_check"]
        
    gc.collect()

In [None]:
df_data.head()

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">5.1.1. Checking contractions</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's check some contractions in the description</p>
</div>

In [None]:
characters_to_check = ["\'re", "\'d", "\'t"]

In [None]:
df_check_characters = df_data[["description"]].copy()

In [None]:
pd.set_option("display.max_rows", None) # show full of showing rows
pd.set_option("display.max_columns", None) # show full of showing cols
pd.set_option("display.max_colwidth", None) # show full width of showing cols
pd.set_option("display.float_format", lambda x: "%.5f" % x) # show full content in cell    

In [None]:
df_check_characters[df_check_characters["description"].str.contains("|".join(characters_to_check))].tail()

In [None]:
pd.reset_option("display.max_rows") # reset max of showing rows
pd.reset_option("display.max_columns") # reset max of showing cols
pd.reset_option("display.max_colwidth") # reset width of showing cols
pd.reset_option("display.float_format") # reset show full content in cell

<div class="alert alert-block alert-info">
    <p>Deleting the dataset</p>
</div>

In [None]:
del df_check_characters
gc.collect()

<div class="alert alert-block alert-warning">
    <p><b>Observations / Conclusions</b></p>
    <p>At this point, we can see in the text:</p>
    <ul style="list-style-type: square;">
        <li>Characters for new line, tabs, etc..</li>
        <li>There are contractions in the descriptions.</li>
    </ul>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">5.2. Tokenization</h3>
</div>

<div class="alert alert-block alert-info">
    <p>To tokenize the text, we are going to do the following</p>
    <ul style="list-style-type: square;">
        <li>Cleaning up the text</li>
        <li>Remove stop words</li>
    </ul>
    <p>Finally, we are going to compare the results</p>
</div>    
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">5.2.2. Cleaning up the text</h4>
</div>

<div class="alert alert-block alert-info">
    <p>To clean up the text, we are going to remove the following</p>
    <ul style="list-style-type: square;">
        <li>Newlines, tabs, etc.</li>
        <li>HTML tags</li>
        <li>Extra whitespace</li>
        <li>Emails</li>
        <li>Accented characters</li>
        <li>Incorrect characters</li>
        <li>Punctuations</li>
        <li>Non alphabet characters</li>
    </ul>
    <p>Also, we are going to do: </p>
    <ul style="list-style-type: square;">
        <li>To transform to lowercase.</li>
        <li>To expand contractions</li>
    </ul>
</div>    
</div>

<div class="alert alert-block alert-info">
    <p>Reading a english contractions dictionay</p>
</div>

In [None]:
file = open("datasets\english_contractions.txt")
contents = file.read()
english_contractions = ast.literal_eval(contents)
file.close()

<div class="alert alert-block alert-info">
    <p>Let's cleaning the features</p>
</div>

In [None]:
initial_tokens_by_feature, cleaned_words_by_feature = [{} for i in range(2)]
for col in text_columns:
    
    # Tokenization of text without clean
    df_data[col + "_initial"] = df_data[col].apply(lambda x: tokenizer(x))
    initial_tokens_by_feature[col] = df_data[col + "_initial"].explode().dropna().value_counts().shape[0]
    del df_data[col + "_initial"]
    
    # Tokenization of text after cleanning
    if col == "product_specifications":
        df_data[col + "_cleaned"] = df_data[col].apply(lambda x: cleaning_up_product_specifications(x))
        df_data[col + "_cleaned"] = df_data[col + "_cleaned"].apply(lambda x: cleaning_up_text(x, english_contractions))
    else:    
        df_data[col + "_cleaned"] = df_data[col].apply(lambda x: cleaning_up_text(x, english_contractions))
    
    cleaned_words_by_feature[col] = df_data[col + "_cleaned"].explode().dropna().value_counts().shape[0]
    
    
# Sorting dict by value desc
initial_tokens_by_feature = sorted(initial_tokens_by_feature.items(), key=operator.itemgetter(1), reverse=True)
initial_tokens_by_feature = {k:v for k, v in initial_tokens_by_feature}

cleaned_words_by_feature = sorted(cleaned_words_by_feature.items(), key=operator.itemgetter(1), reverse=True)
cleaned_words_by_feature = {k:v for k, v in cleaned_words_by_feature}

In [None]:
pd.set_option("display.max_rows", None) # show full of showing rows
pd.set_option("display.max_columns", None) # show full of showing cols
pd.set_option("display.max_colwidth", None) # show full width of showing cols
pd.set_option("display.float_format", lambda x: "%.5f" % x) # show full content in cell    

In [None]:
df_data.head(1)

In [None]:
pd.reset_option("display.max_rows") # reset max of showing rows
pd.reset_option("display.max_columns") # reset max of showing cols
pd.reset_option("display.max_colwidth") # reset width of showing cols
pd.reset_option("display.float_format") # reset show full content in cell

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">5.2.2. Removing words</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to reduce words based on the following:</p>
    <ul style="list-style-type: square;">
        <li>Stop words</li>
        <li>Non english words</li>
        <li>Keep Nouns</li>
    </ul>    
    <p>It must consider that the mission is not about sentiment analysis, it is about classification</p>
</div>

In [None]:
reduced_words_by_feature = {}
for col in text_columns:
    df_data[col + "_tokens"] = df_data[col + "_cleaned"].apply(lambda x: remove_words(x, "english"))
    reduced_words_by_feature[col] = df_data[col + "_tokens"].explode().dropna().value_counts().shape[0]

    del df_data[col + "_cleaned"]
    
# Sorting dict by value descending
reduced_words_by_feature = sorted(reduced_words_by_feature.items(), key=operator.itemgetter(1), reverse=True)
reduced_words_by_feature = {k:v for k, v in reduced_words_by_feature}

In [None]:
pd.set_option("display.max_rows", None) # show full of showing rows
pd.set_option("display.max_columns", None) # show full of showing cols
pd.set_option("display.max_colwidth", None) # show full width of showing cols
pd.set_option("display.float_format", lambda x: "%.5f" % x) # show full content in cell    

In [None]:
df_data.head(1)

In [None]:
pd.reset_option("display.max_rows") # reset max of showing rows
pd.reset_option("display.max_columns") # reset max of showing cols
pd.reset_option("display.max_colwidth") # reset width of showing cols
pd.reset_option("display.float_format") # reset show full content in cell

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">5.2.3. Compare the results</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's compare the number of words during the all process</p>
</div>

In [None]:
fig = plt.figure(figsize=(15, 4))

# Plot the total words tokenized
sns.set_color_codes("pastel")
plot = sns.barplot(x=list(initial_tokens_by_feature.values()),
                   y=list(initial_tokens_by_feature.keys()), 
                   label="Initial", color="b")

# Plot the words tokenized after cleanning up
sns.set_color_codes("muted")
plot = sns.barplot(x=list(cleaned_words_by_feature.values()),
                   y=list(cleaned_words_by_feature.keys()),
                   label="Intermediate", color="b")

# Plot the words tokenized after reducing stop words
sns.set_color_codes("dark")
plot = sns.barplot(x=list(reduced_words_by_feature.values()),
                   y=list(reduced_words_by_feature.keys()),
                   label="Final", color="b")

plt.legend(ncol=3, loc="lower right", frameon=True)
plt.xlabel("Numbers of tokens", size=12)
plt.ylabel("Features", size=12)
plt.title("Number of tokens during all process", size=16)
plt.tight_layout()
plt.savefig("images/text_analysis/number-of-tokens-during-process.png")
plt.show()

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>Here, we can see how the tokens have reduced during all process, being description the features with more tokens during the process</p>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">5.3. Stemming the tokens</h3>
</div>

<div class="alert alert-block alert-info">
    <p>We use <b>Porter stemming algorithm</b> because it has a less agressive approach in comparison with <b>Plancaster stemming algorithm</b></p>
</div>

In [None]:
for col in text_columns:
    df_data[col + "_stemmed"] = df_data[col + "_tokens"].apply(lambda x: stem_words(x))

In [None]:
pd.set_option("display.max_rows", None) # show full of showing rows
pd.set_option("display.max_columns", None) # show full of showing cols
pd.set_option("display.max_colwidth", None) # show full width of showing cols
pd.set_option("display.float_format", lambda x: "%.5f" % x) # show full content in cell    

In [None]:
df_data[["description", "description_tokens", "description_stemmed"]].head(3)

In [None]:
pd.reset_option("display.max_rows") # reset max of showing rows
pd.reset_option("display.max_columns") # reset max of showing cols
pd.reset_option("display.max_colwidth") # reset width of showing cols
pd.reset_option("display.float_format") # reset show full content in cell@nhurpertuzs

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">5.4. Lemmatization the tokens</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Let's do the lemmatization</p>
</div>

In [None]:
for col in text_columns:
    df_data[col + "_lemma"] = df_data[col + "_tokens"].apply(lambda x: lemma_words(x))

In [None]:
pd.set_option("display.max_rows", None) # show full of showing rows
pd.set_option("display.max_columns", None) # show full of showing cols
pd.set_option("display.max_colwidth", None) # show full width of showing cols
pd.set_option("display.float_format", lambda x: "%.5f" % x) # show full content in cell    

In [None]:
df_data[["description", "description_tokens", "description_stemmed", "description_lemma"]].head(2)

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>At this point, we can see an example of text after doing all process</p>
    <p>It seems that the lemmatization have got better results than stemming</p>
</div>

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">5.5. Text Vectorization</h3>
</div>

<div class="alert alert-block alert-info">
    <p>First of all, let's see all features transformed</p>
</div>

In [None]:
for col in text_columns:
    
    if col == "description":
        head = 2
    else:
        head = 5
    
    print("-"*200)
    print(" >> " + col)
    display(df_data[[col, col + "_tokens", col + "_stemmed", col + "_lemma"]].head(head))
    print("\n")

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>At this point, we can see how the features have been transformed</p>
</div>

In [None]:
pd.reset_option("display.max_rows") # reset max of showing rows
pd.reset_option("display.max_columns") # reset max of showing cols
pd.reset_option("display.max_colwidth") # reset width of showing cols
pd.reset_option("display.float_format") # reset show full content in cell

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">5.5.1. Thresholds lower frequency</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Before doing the <b>Text Vectorization</b>, let's analyse the number of words with lower frequency in each feature, to define the threshold (min_df) to tream them </p>
    <p>To do that we are going to use BoW in default mode and plot the numbers of words with lower frequency</p>
</div>

In [None]:
cv_vectorizer = CountVectorizer()

In [None]:
for col in text_columns:
    
    for var in ["stemmed", "lemma"]:
        
        globals()["bow_" + col + "_" + var] = cv_vectorizer.fit_transform(df_data[col + "_" + var].astype("U"))
        globals()["df_bow_" + col + "_" + var] = pd.DataFrame(globals()["bow_" + col + "_" + var].toarray(),
                                                              columns=cv_vectorizer.get_feature_names_out())
        
        if var == "stemmed":
            palette = "flare"
        else:
            palette = "crest"
        
        # Plottint the results
        most_frequent_words = (globals()["df_bow_" + col + "_" + var].sum(axis=0)).sort_values(ascending=True)
        globals()["df_lower_frequent_words_" + col + "_" + var] = pd.DataFrame.from_dict({"words" : most_frequent_words.index,
                                                                                          "frequency" : most_frequent_words.values})

        lower_frequent_words = globals()["df_lower_frequent_words_" + col + "_" + var].groupby("frequency")["frequency"].count().head(30)

        fig = plt.figure(figsize=(15, 5))
        plot = sns.barplot(x=lower_frequent_words.index, y=lower_frequent_words, palette=palette)
        for p in plot.patches:
            plot.annotate(format(p.get_height(), ".1f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                            ha="center", va="center", xytext=(0, 9), textcoords="offset points")
        plot.set_xticklabels(labels=lower_frequent_words.index, rotation=70, size=12,
                                 horizontalalignment="right")
        plt.ylabel("Number of words", size=12)
        plt.xlabel("Frequency", size=12)
        plt.title("Numbers of words by frequency\n(" + col + " - " + var + ")", size=16)
        plt.tight_layout()
        plt.show()
            
    print("\n\n")


<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>Now, we are going to define the following thresholds (min_df)</p>
    <ul style="list-style-type: square;">
        <li>product_name: min_df=3</li>
        <li>product_category_tree: min_df=3</li>
        <li>description: min_df=5</li>
        <li>brand: min_df=2</li>
        <li>product_specifications:min_df=4</li>
    </ul>  
</div>

In [None]:
text_columns = {
    "product_name": 3,
    "product_category_tree": 3,
    "description": 5,
    "brand": 2,
    "product_specifications": 4
}

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">5.5.2. Bag of Word - BoW</h4>
</div>

In [None]:
pd.set_option("display.max_columns", 25) # show full of showing cols

<div class="alert alert-block alert-info">
    <p>Now, we are going to do a BoW for each feature, considering the stemmed and lemma treatments</p>
    <p>Let's initialize the CountVectorizer based on the min_df defined</p>
</div>

In [None]:
for key, value in text_columns.items():
    
    for var in ["stemmed", "lemma"]:
        
        # Initializing the CountVectorizer based on the min_df defined previously
        globals()["bow_vectorizer_" + key + "_" + var] = CountVectorizer(min_df=value)
        
        globals()["bow_" + key + "_" + var] = globals()["bow_vectorizer_" + key + "_" + var].fit_transform(df_data[key + "_" + var].astype("U"))
        globals()["df_bow_" + key + "_" + var] = pd.DataFrame(globals()["bow_" + key + "_" + var].toarray(),
                                                              columns=globals()["bow_vectorizer_" + key + "_" + var].get_feature_names_out())

<div class="alert alert-block alert-info">
    <p>Printing some resultants datasets</p>
</div>

In [None]:
print("-"*120)
print(" >> product_name_stemmed")
display(df_bow_product_name_stemmed.head())
print(" >> product_name_lemma")
display(df_bow_product_name_lemma.head())
print("\n")
print("-"*120)
print(" >> description_stemmed")
display(df_bow_description_stemmed.head())
print(" >> description_lemma")
display(df_bow_description_lemma.head())

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">5.5.3. Term Frequency - TF-IDF</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to do a TF-IDF for each feature, considering the stemmed and lemma treatments</p>
    <p>Let's initialize the TfidfVectorizer based on the min_df defined</p>
</div>

In [None]:
for key, value in text_columns.items():
    
    for var in ["stemmed", "lemma"]:
        
        # Initializing the TfidfVectorizer based on the min_df defined previously
        globals()["tfidf_vectorizer_" + key + "_" + var] = TfidfVectorizer(min_df=value)
        
        globals()["tfidf_" + key + "_" + var] = globals()["tfidf_vectorizer_" + key + "_" + var].fit_transform(df_data[key + "_" + var].astype("U"))
        globals()["df_tfidf_" + key + "_" + var] = pd.DataFrame(globals()["tfidf_" + key + "_" + var].toarray(),
                                                                columns=globals()["tfidf_vectorizer_" + key + "_" + var].get_feature_names_out())

In [None]:
print("-"*120)
print(" >> product_name_stemmed")
display(df_tfidf_product_name_stemmed.head())
print(" >> product_name_lemma")
display(df_tfidf_product_name_lemma.head())
print("\n")
print("-"*120)
print(" >> description_stemmed")
display(df_tfidf_description_stemmed.head())
print(" >> description_lemma")
display(df_tfidf_description_lemma.head())

<div class="alert alert-block alert-info">
    <p>Printing some resultants datasets</p>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">5.5.4. Compare the results</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's compare the words most frequents based on BoW and TI-IDF</p>
</div>

In [None]:
for col, value in text_columns.items():
    
    for var in ["stemmed", "lemma"]:
        
        for type_of_vector in ["bow", "tfidf"]:
            
            if type_of_vector == "bow":
                palette = "flare"
                temp = "BoW"
            else:
                palette = "crest"
                temp = "TF-IDF"
        
             # Plottint the results
            most_frequent_words = (globals()["df_" + type_of_vector + "_" + col + "_" + var].sum(axis=0)).sort_values(ascending=False).head(30)

            fig = plt.figure(figsize=(15, 5))
            plot = sns.barplot(x=most_frequent_words.index, y=most_frequent_words, palette=palette)
            plot.set_xticklabels(labels=most_frequent_words.index, rotation=70, size=12,
                                     horizontalalignment="right")
            plt.ylabel("count", size=12)
            plt.xlabel("words", size=12)
            plt.title("The 30 words most frequents words in \"" + col + "\" - \"" + var + "\"\n" + temp, size=16)
            plt.tight_layout()
            plt.show()
            
        print("\n\n")

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">5.6. Topic modeling with LDA</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to model the topic for each features through Latent Dirichlet Allocation - LDA</p>
</div>

In [None]:
n_components = df_data["category_1"].nunique()
no_top_words = 10

for col, value in text_columns.items():
    
    for var in ["stemmed", "lemma"]:
        
        for type_of_vector in ["bow", "tfidf"]:
    
            globals()["lda_" + col + "_" + var + "_" + type_of_vector] = LatentDirichletAllocation(n_components=n_components)
            globals()["df_lda_" + col + "_" + var + "_" + type_of_vector] = \
                globals()["lda_" + col + "_" + var + "_" + type_of_vector].fit_transform(globals()["df_" + type_of_vector + "_" + col + "_" + var])
            
            tf_feature_names = globals()[type_of_vector + "_vectorizer_" + col + "_" + var].get_feature_names_out()
            
            
            print("-"*80)
            print(" >>", col.upper(), "-", var.upper(), "-", type_of_vector.upper())
            display_topics(globals()["lda_" + col + "_" + var + "_" + type_of_vector],
                           tf_feature_names, no_top_words)
            print("\n")
            

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">5.7. Concatenating resulting dataset</h3>
</div>

<div class="alert alert-block alert-info">
    <p>From this point, we are going to use <b>the feautures treated with Lemmatization</b></p>
    <p>First of all, we are going to add a prefix on all column based on the feature, just in case we need to identify them later</p>
</div>

In [None]:
dataset_prefix = {
    "product_name": "pn_",
    "product_category_tree": "pct_",
    "description": "d_",
    "brand": "b_",
    "product_specifications": "ps_"
}

In [None]:
for type_of_vector in ["bow", "tfidf"]:

    for col, prefix in dataset_prefix.items():
                
        globals()["df_" + type_of_vector + "_" + col + "_lemma"] = globals()["df_" + type_of_vector + "_" + col + "_lemma"].add_prefix(prefix)

In [None]:
for type_of_vector in ["bow", "tfidf"]:

    for col, prefix in dataset_prefix.items():
        
        for var in ["stemmed", "lemma"]:
                
            globals()["df_" + type_of_vector + "_" + col + "_" + var] = globals()["df_" + type_of_vector + "_" + col + "_" + var].add_prefix(prefix)

<div class="alert alert-block alert-info">
    <p>Concatenating the dataset</p>
</div>

In [None]:
df_stemmed_BoW = pd.concat([df_bow_product_name_stemmed, df_bow_product_category_tree_stemmed, df_bow_description_stemmed,
                          df_bow_brand_stemmed, df_bow_product_specifications_stemmed], axis=1)

df_stemmed_tfidf = pd.concat([df_tfidf_product_name_stemmed, df_tfidf_product_category_tree_stemmed, df_tfidf_description_stemmed,
                          df_tfidf_brand_stemmed, df_tfidf_product_specifications_stemmed], axis=1)

df_lemma_BoW = pd.concat([df_bow_product_name_lemma, df_bow_product_category_tree_lemma, df_bow_description_lemma,
                          df_bow_brand_lemma, df_bow_product_specifications_lemma], axis=1)

df_lemma_tfidf = pd.concat([df_tfidf_product_name_lemma, df_tfidf_product_category_tree_lemma, df_tfidf_description_lemma,
                          df_tfidf_brand_lemma, df_tfidf_product_specifications_lemma], axis=1)

print("-"*40)
print(" >> Stemmed")
print("df_stemmed_BoW shape:\t", df_stemmed_BoW.shape)
print("df_stemmed_tfidf shape:\t", df_stemmed_tfidf.shape)
print("\n")

print("-"*40)
print(" >> Lemma")
print("df_lemma_BoW shape:\t", df_lemma_BoW.shape)
print("df_lemma_tfidf shape:\t", df_lemma_tfidf.shape)
print("\n")


In [None]:
df_stemmed_BoW.head(3)

In [None]:
df_stemmed_tfidf.head(3)

In [None]:
df_lemma_BoW.head(3)

In [None]:
df_lemma_tfidf.head(3)

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>Now, we have 4 dataset based on text (BoW) that we are going to combine with images (BoVW) to clasify them (images)<br>
    Next, the list of dataset so far.</p>
    <ul style="list-style-type: square;">
        <li><b>df_stemmed_BoW</b>:&nbsp;&nbsp;&nbsp;Stemming the tokens + Bag of Word (BoW)</li>
        <li><b>df_stemmed_tfidf</b>:&nbsp;&nbsp;&nbsp;&nbsp;Stemming the tokens + Term Frequency (TF-IDF)</li>
        <li><b>df_lemma_BoW</b>:&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Lemmatization the tokens + Bag of Word (BoW)</li>
        <li><b>df_lemma_tfidf</b>:&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Lemmatization the tokens + Term Frequency (TF-IDF)</li>
    </ul>  
</div>

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">6. Pre-processing Images</h2>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">6.1. Analyzing images</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Let's take a look at some images before dealing with them</p>
</div>

In [None]:
n_rows, n_cols = 1, 10

# Grpuping data by category
df_groupby = df_data.groupby("category_1")

for category, sub_df in df_groupby:
    
    #  list to save images size
    image_name, height_size, width_size = [[] for i in range(3)]
    
    # Filtering random images by each category
    images_index_by_category = sub_df.sample(10).index
    
    # Initializing each figure/plot
    fig = plt.figure(figsize=(15, 2))
    plt.suptitle(category, fontweight="bold")
    
    for i, image_index in enumerate(images_index_by_category, 1):
        
        # Identifying an image
        image = df_data["image"].loc[image_index]
        image_name.append(image)
        
        # Reading the image attributes
        img = cv2.imread(ORIGINAL_IMAGES_PATH + image, cv2.IMREAD_UNCHANGED)
        height_size.append(img.shape[0])
        width_size.append(img.shape[1])
        
        # Reading a specific image based on index
        image = mpimg.imread(ORIGINAL_IMAGES_PATH + image)
        
        # Plotting imaga in one row based on category
        ax = fig.add_subplot(n_rows, n_cols, i)        
        ax.imshow(image)
        plt.axis("off")
        
    df_images = pd.DataFrame({
        "Image": image_name,
        "Width": width_size,
        "Height": height_size
    })
    
    plt.show()
    display(df_images)
    print("\n")


<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>At this point, we can conclude </p>
    <ul style="list-style-type: square;">
        <li>It seems that images are good categorized based on the 1º level of the tree of categories</li>
        <li>Most of images are a big size</li>
    </ul>  
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">6.2. Processing images</h3>
</div>

<div class="alert alert-block alert-info">
    <p>We are going to process images as follows:</p>
    <ol>
        <li>Size reduction</li>
        <li>Adjusting contrast and brightness</li>
        <li>Transform to gray</li>
        <li>Noise reduction</li>
    </ol>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.2.1. Size reduction</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's reduce images</p>
</div>

In [None]:
df_data["image"].apply(lambda x: thumbnail_image(x, basewidth=224, path=ORIGINAL_IMAGES_PATH))

<div class="alert alert-block alert-info">
    <p>Let's look at the results</p>
</div>

In [None]:
df_images[["new_width", "new_height"]] = df_images["Image"].apply(lambda x: image_size(x, ORIGINAL_IMAGES_PATH + "thumbnails/")).to_list()
df_images

<div class="alert alert-block alert-success">
    <p>Now, we get the image reduced</p>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.2.2. Adjusting contrast and brightness</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to use the thumbnails created</p>
</div>

In [None]:
df_data["image"].apply(lambda x: contrast_and_brightness(x, path=THUMBNAILS_IMAGES_PATH))

<div class="alert alert-block alert-info">
    <p>Let's look at the results</p>
</div>

In [None]:
show_image_and_histogram("0ec47240feda42c63e42f1e9cee60f7a.jpg", 
                        ORIGINAL_IMAGES_PATH, CB_IMAGES_PATH)

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.2.3. Gray images</h4>
</div>

<div class="alert alert-block alert-info">
    <p>To transform images to gray escals, we are going to use the image with contrast and brightness modified</p>
</div>

In [None]:
df_data["image"].apply(lambda x: gray_image(x, CB_IMAGES_PATH))

<div class="alert alert-block alert-info">
    <p>Let's look at the results</p>
</div>

In [None]:
original_image = cv2.imread(ORIGINAL_IMAGES_PATH + "0ec47240feda42c63e42f1e9cee60f7a.jpg")
gray_image = cv2.imread(GRAY_IMAGES_PATH + "0ec47240feda42c63e42f1e9cee60f7a.jpg")

# Initializing each figure/plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) 

ax1.imshow(original_image)
ax1.set_title("Image with contrast and brightness modified",
              fontsize=14, fontweight="bold")
ax1.grid(None)
ax1.axis("off")

ax2.imshow(gray_image)
ax2.set_title("Gray image", fontsize=14, fontweight="bold")
ax2.grid(None)
ax2.axis("off")

plt.tight_layout()
plt.show()

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.2.4. Noise reduction</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to use the gray images</p>
</div>

In [None]:
df_data["image"].apply(lambda x: noise_reduction(x, GRAY_IMAGES_PATH))

<div class="alert alert-block alert-info">
    <p>Let's look at the results</p>
</div>

In [None]:
gray_image = cv2.imread(GRAY_IMAGES_PATH + "0ec47240feda42c63e42f1e9cee60f7a.jpg")
image_with_noise_reduced = cv2.imread(NR_IMAGES_PATH + "0ec47240feda42c63e42f1e9cee60f7a.jpg")

# Initializing each figure/plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) 

ax1.imshow(gray_image)
ax1.set_title("Gray image", fontsize=14, fontweight="bold")
ax1.grid(None)
ax1.axis("off")

ax2.imshow(image_with_noise_reduced)
ax2.set_title("Image with noise reduced", fontsize=14, fontweight="bold")
ax2.grid(None)
ax2.axis("off")

plt.tight_layout()
plt.show()

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">6.3. SIFT and ORB Visualizations</h3>
</div>

<div class="alert alert-block alert-info">
    <p>We are going to select two images in same category ("Watches") to see the algorithms on them</p>
</div>

In [None]:
image_a = cv2.imread(NR_IMAGES_PATH + "9924fba9b2a738e5a141995952e73104.jpg")
image_b = cv2.imread(NR_IMAGES_PATH + "29b1ca231e10d5269516b80bf9d0dffc.jpg")

<div class="alert alert-block alert-info">
    <p>Let's plot the images</p>
</div>

In [None]:
plot_two_images(image_a, image_b)

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.3.1. SIFT - Scale Invariant Feature Transformation </h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's detect key points and descriptors</p>
</div>

In [None]:
sift = cv2.SIFT_create()

key_points_a_sift, descriptors_a_sift = sift.detectAndCompute(image_a, None)
key_points_b_sift, descriptors_b_sift = sift.detectAndCompute(image_b, None)

image_a_key_points_sift = cv2.drawKeypoints(image_a, key_points_a_sift, None)
image_b_key_points_sift = cv2.drawKeypoints(image_b, key_points_b_sift, None)

<div class="alert alert-block alert-info">
    <p>Let's plot the images</p>
</div>

In [None]:
plot_two_images(image_a_key_points_sift, image_b_key_points_sift)

<div class="alert alert-block alert-info">
    <p>Let's match the images</p>
</div>

In [None]:
#feature matching
bf_macher_sift = cv2.BFMatcher(cv2.NORM_L1, crossCheck=True)
matches_sift = bf_macher_sift.match(descriptors_a_sift, descriptors_b_sift)
matches_sift = sorted(matches_sift, key=lambda x: x.distance)
image_3_sift = cv2.drawMatches(image_a, key_points_a_sift, image_b, key_points_b_sift,
                               matches_sift[:50], None, flags=2)

plt.figure(figsize=(10, 6))
plt.imshow(image_3_sift)
plt.grid(None)
plt.axis("off")
plt.title("Matching images", fontsize=14)
plt.show()

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.3.2. ORB - Oriented Fast and Rotated BRIEF</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's detect key points and descriptors</p>
</div>

In [None]:
orb = cv2.ORB_create()

key_points_a_orb, descriptors_a_orb = orb.detectAndCompute(image_a, None)
key_points_b_orb, descriptors_b_orb = orb.detectAndCompute(image_b, None)

image_a_key_points_orb = cv2.drawKeypoints(image_a, key_points_a_orb, None)
image_b_key_points_orb = cv2.drawKeypoints(image_b, key_points_b_orb, None)

<div class="alert alert-block alert-info">
    <p>Let's plot the images</p>
</div>

In [None]:
plot_two_images(image_a_key_points_orb, image_b_key_points_orb)

<div class="alert alert-block alert-info">
    <p>Let's match the images</p>
</div>

In [None]:
#feature matching
bf_macher_orb = cv2.BFMatcher(cv2.NORM_L1, crossCheck=True)
matches_orb = bf_macher_orb.match(descriptors_a_orb, descriptors_b_orb)
matches_orb = sorted(matches_orb, key=lambda x: x.distance)
image_3_orb = cv2.drawMatches(image_a, key_points_a_orb, image_b, key_points_b_orb,
                              matches_orb[:50], None, flags=2)

plt.figure(figsize=(10, 6))
plt.imshow(image_3_orb)
plt.grid(None)
plt.axis("off")
plt.title("Matching images - ORB", fontsize=14)
plt.show()

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; font-size:16px;">6.3.3. Comparing SIFT / ORB</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's plot the images</p>
</div>

In [None]:
plot_two_images(image_a_key_points_sift, image_a_key_points_orb,
               title_a="SIFT descriptors", title_b="ORB descriptors")

In [None]:
plot_two_images(image_b_key_points_sift, image_b_key_points_orb,
               title_a="SIFT descriptors", title_b="ORB descriptors")

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">6.4. Image Vectorization</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.4.1. Features detection by image</h4>
</div>

<div class="alert alert-block alert-info">
    <p>We are going to detect the images descriptors through SIFT and ORB</p>
</div>

In [None]:
sift = cv2.SIFT_create()
orb = cv2.ORB_create()

In [None]:
detectors = {
    "sift": sift,
    "orb": orb
}

In [None]:
for key, value in detectors.items():
    
    globals()[key + "_desc_by_image"], globals()[key + "_desc_all"] = get_descriptors(df_data["image"],
                                                                                       NR_IMAGES_PATH,
                                                                                       value)

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.4.2. Clustering to build images features</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to use MiniBatchKMeans to build image features</p>
    <p>Determination number of clusters and batch size<br># of cluster is fixed as sqrt of # descriptors</p>
</div>

In [None]:
for key, value in detectors.items():
    
    globals()["k_" + key] = int(round(np.sqrt(len(globals()[key + "_desc_all"])), 0))
    globals()["batch_size_" + key] = df_data["image"].shape[0] * 3
    
    print("-"*25)
    print(" >>", key.upper())
    
    print("Number of clusters:", globals()["k_" + key])
    print("Batch size:", globals()["batch_size_" + key])
    print("\n")

<div class="alert alert-block alert-info">
    <p>Initialize the clusterer with n_clusters=k value and a random generator seed of 10 for reproducibility.</p>
</div>

In [None]:
for key, value in detectors.items():

    globals()[key + "_kmeans"] = MiniBatchKMeans(n_clusters=globals()["k_" + key], init="k-means++", 
                                                 max_iter=1000, batch_size=globals()["batch_size_" + key],
                                                 random_state=10, init_size=3*globals()["k_" + key])

    globals()[key + "_kmeans"].fit(globals()[key + "_desc_all"])

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.4.3. Creation of image features</h4>
</div>

In [None]:
for key, value in detectors.items():
    
    print("-"*10)
    print(" >>", key.upper())
    
    globals()[key + "_img_features"], globals()[key + "_img_features_weighed"] = \
        build_features(globals()[key + "_kmeans"], globals()[key + "_desc_by_image"])
        
    print("\n")

In [None]:
for key, value in detectors.items():
    
    globals()["df_" + key + "_img_features"] = pd.DataFrame(globals()[key + "_img_features"])
    globals()["df_" + key + "_img_features_weighed"] = pd.DataFrame(globals()[key + "_img_features_weighed"])
    
    plt.figure(figsize=(15, 15))
    ax = plt.subplot(311)
    
    ax.set_title("Labels histogram - " + key.upper(), size=20, fontweight="bold")
    ax.set_xlabel("Visual words", size=14)
    ax.set_ylabel("Frequency", size=14)
    
    ax.plot(globals()["df_" + key + "_img_features"][1].ravel())
    
    plt.tight_layout()
    plt.show()
    
    print("\n")
    print(" >>", key.upper(), "(weighed)")
    display(globals()["df_" + key + "_img_features_weighed"].head())
    
    print("\n")
    

<div class="alert alert-block alert-info">
    <p>Now, we are going to add a prefix on all column based on the feature, just in case we need to identify them later</p>
</div>

In [None]:
df_sift_img_features = df_sift_img_features.add_prefix("s_")
df_sift_img_features_weighed = df_sift_img_features_weighed.add_prefix("sw_")

df_orb_img_features = df_orb_img_features.add_prefix("o_")
df_orb_img_features_weighed = df_orb_img_features_weighed.add_prefix("ow_")

In [None]:
df_sift_img_features.head(3)

In [None]:
df_orb_img_features_weighed.head(3)

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>At this points, we have the features based on the images - BoVW.<br> In fact, we have 4 dataset as follows</p>
    <ul style="list-style-type: square;">
        <li><b>df_sift_img_features</b>: BoVW based on SIFT algorithm</li>
        <li><b>df_sift_img_features_weighed</b>: BoVW based on SIFT algorithm and weighed based on the number of descriptors of each image</li>
        <li><b>df_orb_img_features</b>: BoVW based on ORB algorithm</li>
        <li><b>df_orb_img_features_weighed</b>: BoVW based on ORB algorithm and weighed based on the number of descriptors of each image</li>
    </ul>  
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">6.5. BoVW visualization without clusterization</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.5.1. PCA dimension reduction</h4>
</div>

<div class="alert alert-block alert-info">
    <p>We are going to reduce the features dimension through PCA, keeping a high level of explained variance</p>
    <p>But first, let's identify the minimum and maximun value in the dataset.</p>
</div>

In [None]:
for key, value in detectors.items():
    
    minimun = globals()["df_" + key + "_img_features_weighed"].apply(np.min).min()
    maximun = globals()["df_" + key + "_img_features_weighed"].apply(np.max).max()
    
    print("-"*30)
    print(" >>", key.upper())
    print("The minimun value is: ", round(minimun, 3))
    print("The maximun value is: ", round(maximun, 3))
    print("\n")

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <ul style="list-style-type: square;">
        <li>All data is in the same range</li>
        <li>SIFT: the minimun and maximun value are between 0 and 0.58 respectively</li>
        <li>ORB: the minimun and maximun value are between 0 and 1 respectively</li>
    </ul>  
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to reduce the features keeping 80% of explained variance</p>
</div>

In [None]:
for key, value in detectors.items():
    
    print("-"*30)
    print(" >>", key.upper())
    print("Dataset dimensions before PCA reduction: ", globals()["df_" + key + "_img_features_weighed"].shape)
    
    globals()["pca_" + key] = PCA(n_components=0.80)
    globals()["df_" + key + "_img_features_pca"] = globals()["pca_" + key].fit_transform(globals()["df_" + key + "_img_features_weighed"])
    
    print("Dataset dimensions after PCA reduction: ", globals()["df_" + key + "_img_features_pca"].shape)
    
    print("\n")

In [None]:
for key, value in detectors.items():
    
    axvline_value = globals()["df_" + key + "_img_features_pca"].shape[1]
    
    globals()["scree_" + key] = globals()["pca_" + key].explained_variance_ratio_*100
    
    fig = plt.subplots(figsize=(8, 5))
    plt.plot(np.arange(len(globals()["scree_" + key]))+1,
             globals()["scree_" + key].cumsum())
    
    plt.xlabel("Number of principal components", size=12)
    plt.ylabel("% of inertia", size=12)
    
    plt.axhline(80, lw=1, c="red")
    plt.text(1, 81, "80%", c="red")
    
    plt.axvline(axvline_value, lw=1, c="red")
    plt.text(axvline_value - 6, 8, axvline_value, c="red")
    
    plt.title("Scree of eigenvalues - " + key.upper(), size=18, fontweight="bold")
    plt.tight_layout()
    plt.show()
    
    print("\n")

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>At this point, we have reduced the data as follows:</p>
    <ul style="list-style-type: square;">
        <li>SIFT: From (1050, 546) to (1050, 131)</li>
        <li>ORB: From (1036, 567) to (1036, 109)</li>
    </ul>  
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">6.5.2. T-SNE dimension reduction</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's reduce the dimension to 2 T-SNE components to visualize in 2D</p>
    <p>At the same time, we are going to add the 1º level of tree category to visualize data based on this feature</p>
</div>

In [None]:
for key, value in detectors.items():
    
    globals()["tsne_" + key] = TSNE(n_components=2, n_iter=2000, learning_rate="auto",
                                    init="random", random_state=10)
    globals()["X_tsne_" + key] = globals()["tsne_" + key].fit_transform(globals()["df_" + key + "_img_features_pca"])
    
    globals()["df_tsne_" + key] = pd.DataFrame(globals()["X_tsne_" + key][:, 0:2], columns=["tsne_1", "tsne_2"])
    globals()["df_tsne_" + key]["category"] = df_data["category_1"]
    
    print("-"*30)
    print(" >>", key.upper())
    print("Dataset dimensions before T-SNE reduction: ", globals()["df_" + key + "_img_features_pca"].shape)
    print("Dataset dimensions after T-SNE reduction: ", globals()["df_tsne_" + key].shape)
    
    print("\n")

In [None]:
for key, value in detectors.items():
    
    plt.figure(figsize=(10, 6))
    
    sns.scatterplot(x="tsne_1", y="tsne_2", hue="category", 
                    data=globals()["df_tsne_" + key], legend="brief",
                    palette=sns.color_palette("tab10", n_colors=7),
                    s=50, alpha=0.6)
    
    plt.title("T-SNE based on 1º level of categories - " + key.upper(), fontsize=18,
              pad=10, fontweight="bold")
    plt.xlabel("tsne_1", fontsize=14, fontweight="bold")
    plt.ylabel("tsne_2", fontsize=14, fontweight="bold")
    plt.legend(bbox_to_anchor=(1, 1), loc="upper left", prop={"size": 12},
              title="Categories")
    
    plt.tight_layout()
    plt.show()
    
    print("\n")

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>We can see that it is not clear the data</p>
</div>

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">7. Clusterization</h2>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">7.1. Concatenation of features (texts and images)</h3>
</div>

<div class="alert alert-block alert-info">
    <p>From the original dataset, we are going to keep the following values</p>
    <ul style="list-style-type: square;">
        <li>retail_price</li>
    </ul>
    <p>We are going to concatenate data based on the following dataset</p>
    <ul style="list-style-type: square;">
        <li><b>Features based on text data</b></li>
        <ul style="list-style-type: disc;">
            <li><b>df_stemmed_BoW</b>: Stemming the tokens + Bag of Word (BoW)</li>
            <li><b>df_stemmed_tfidf</b>: Stemming the tokens + Term Frequency (TF-IDF)</li>
            <li><b>df_lemma_BoW</b>: Lemmatization the tokens + Bag of Word (BoW)</li>
            <li><b>df_lemma_tfidf</b>: Lemmatization the tokens + Term Frequency (TF-IDF)</li>
        </ul>
        <li><b>Features based on image data</b></li>
        <ul style="list-style-type: disc;">
            <li><b>df_sift_img_features</b>: BoVW based on SIFT algorithm</li>
            <li><b>df_sift_img_features_weighed</b>: BoVW based on SIFT algorithm and weighed based on the number of descriptors of each image</li>
            <li><b>df_orb_img_features</b>: BoVW based on ORB algorithm</li>
            <li><b>df_orb_img_features_weighed</b>: BoVW based on ORB algorithm and weighed based on the number of descriptors of each image</li>
        </ul> 
    </ul>  
        
</div>

<div class="alert alert-block alert-info">
    <p>Defining dataset with <b>only features based on texts</b></p>
</div>

In [None]:
# based on Bag of words
bow_stemmed = df_stemmed_BoW.copy()
bow_lemma = df_lemma_BoW.copy()

# based on TF-IDF
tfidf_stemmed = df_stemmed_tfidf.copy()
tfidf_lemma = df_lemma_tfidf.copy()

<div class="alert alert-block alert-info">
    <p>Defining dataset with <b>features based on texts  + price</b></p>
</div>

In [None]:
# based on Bag of words + price
bow_stemmed_price = pd.merge(df_data[["retail_price_log2"]], bow_stemmed, left_index=True, right_index=True)
bow_lemma_price = pd.merge(df_data[["retail_price_log2"]], bow_lemma, left_index=True, right_index=True)

# based on TF-IDF + price
tfidf_stemmed_price = pd.merge(df_data[["retail_price_log2"]], tfidf_stemmed, left_index=True, right_index=True)
tfidf_lemma_price = pd.merge(df_data[["retail_price_log2"]], tfidf_lemma, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>Defining dataset with <b>only features based on images</b></p>
</div>

In [None]:
# based on SIFT algorithm
sift = df_sift_img_features.copy()
sift_weighed = df_sift_img_features_weighed.copy()

# based on ORB algorithm
orb = df_orb_img_features.copy()
orb_weighed = df_orb_img_features_weighed.copy()

<div class="alert alert-block alert-info">
    <p>Defining dataset with features <b>based on images + price</b></p>
</div>

In [None]:
# based on SIFT algorithm + price
sift_price = pd.merge(df_data[["retail_price_log2"]], sift, left_index=True, right_index=True)
sift_weighed_price = pd.merge(df_data[["retail_price_log2"]], sift_weighed, left_index=True, right_index=True)

# based on ORB algorithm + price
orb_price = pd.merge(df_data[["retail_price_log2"]], orb, left_index=True, right_index=True)
orb_weighed_price = pd.merge(df_data[["retail_price_log2"]], orb_weighed, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>Defining dataset with features <b>based on images + BoW stemmed</b></p>
</div>

In [None]:
# based on SIFT algorithm + BoW stemmed
sift_bow_stemmed = pd.merge(sift, df_stemmed_BoW, left_index=True, right_index=True)
sift_weighed_bow_stemmed = pd.merge(sift_weighed, df_stemmed_BoW, left_index=True, right_index=True)
# based on ORB algorithm + BoW stemmed
orb_bow_stemmed = pd.merge(orb, df_stemmed_BoW, left_index=True, right_index=True)
orb_weighed_bow_stemmed = pd.merge(orb_weighed, df_stemmed_BoW, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>Defining dataset with features <b>based on images + BoW lemma</b></p>
</div>

In [None]:
# based on SIFT algorithm + BoW lemma
sift_bow_lemma = pd.merge(sift, df_lemma_BoW, left_index=True, right_index=True)
sift_weighed_bow_lemma = pd.merge(sift_weighed, df_lemma_BoW, left_index=True, right_index=True)

# based on ORB algorithm + BoW lemma
orb_bow_lemma = pd.merge(orb, df_lemma_BoW, left_index=True, right_index=True)
orb_weighed_bow_lemma = pd.merge(orb_weighed, df_lemma_BoW, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>Defining dataset with features <b>based on images + TF-IDF stemmed</b></p>
</div>

In [None]:
# based on SIFT algorithm + TF-IDF stemmed
sift_tfidf_stemmed = pd.merge(sift, df_stemmed_tfidf, left_index=True, right_index=True)
sift_weighed_tfidf_stemmed = pd.merge(sift_weighed, df_stemmed_tfidf, left_index=True, right_index=True)

# based on ORB algorithm + TF-IDF stemmed
orb_tfidf_stemmed = pd.merge(orb, df_stemmed_tfidf, left_index=True, right_index=True)
orb_weighed_tfidf_stemmed = pd.merge(orb_weighed, df_stemmed_tfidf, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>Defining dataset with features <b>based on images + TF-IDF lemma</b></p>
</div>

In [None]:
# based on SIFT algorithm + TF-IDF lemma
sift_tfidf_lemma = pd.merge(sift, df_lemma_tfidf, left_index=True, right_index=True)
sift_weighed_tfidf_lemma =pd.merge(sift_weighed, df_lemma_tfidf, left_index=True, right_index=True)

# based on ORB algorithm + TF-IDF lemma
orb_tfidf_lemma = pd.merge(orb, df_lemma_tfidf, left_index=True, right_index=True)
orb_weighed_tfidf_lemma = pd.merge(orb_weighed, df_lemma_tfidf, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>Defining dataset with features <b>based on images + price + BoW stemmed</b></p>
</div>

In [None]:
# based on SIFT algorithm + price + BoW stemmed
sift_price_bow_stemmed = pd.merge(sift_price, df_stemmed_BoW, left_index=True, right_index=True)
sift_weighed_price_bow_stemmed = pd.merge(sift_weighed_price, df_stemmed_BoW, left_index=True, right_index=True)

# based on ORB algorithm + price + BoW stemmed
orb_price_bow_stemmed = pd.merge(orb_price, df_stemmed_BoW, left_index=True, right_index=True)
orb_weighed_price_bow_stemmed = pd.merge(orb_weighed_price, df_stemmed_BoW, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>Defining dataset with features <b>based on images + price + BoW lemma</b></p>
</div>

In [None]:
# based on SIFT algorithm + price + BoW lemma
sift_price_bow_lemma = pd.merge(sift_price, df_lemma_BoW, left_index=True, right_index=True)
sift_weighed_price_bow_lemma = pd.merge(sift_weighed_price, df_lemma_BoW, left_index=True, right_index=True)

# based on ORB algorithm + price + BoW lemma
orb_price_bow_lemma = pd.merge(orb_price, df_lemma_BoW, left_index=True, right_index=True)
orb_weighed_price_bow_lemma = pd.merge(orb_weighed_price, df_lemma_BoW, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>Defining dataset with features <b>based on images + price + TF-IDF stemmed</b></p>
</div>

In [None]:
# based on SIFT algorithm + price + TF-IDF stemmed
sift_price_tfidf_stemmed = pd.merge(sift_price, df_stemmed_tfidf, left_index=True, right_index=True)
sift_weighed_price_tfidf_stemmed = pd.merge(sift_weighed_price, df_stemmed_tfidf, left_index=True, right_index=True)

# based on ORB algorithm + price + TF-IDF stemmed
orb_price_tfidf_stemmed = pd.merge(orb_price, df_stemmed_tfidf, left_index=True, right_index=True)
orb_weighed_price_tfidf_stemmed = pd.merge(orb_weighed_price, df_stemmed_tfidf, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>Defining dataset with features <b>based on images + price + TF-IDF lemma</b></p>
</div>

In [None]:
# based on SIFT algorithm + price + TF-IDF lemma
sift_price_tfidf_lemma = pd.merge(sift_price, df_lemma_tfidf, left_index=True, right_index=True)
sift_weighed_price_tfidf_lemma = pd.merge(sift_weighed_price, df_lemma_tfidf, left_index=True, right_index=True)

# based on ORB algorithm + price + TF-IDF lemma
orb_price_tfidf_lemma = pd.merge(orb_price, df_lemma_tfidf, left_index=True, right_index=True)
orb_weighed_price_tfidf_lemma = pd.merge(orb_weighed_price, df_lemma_tfidf, left_index=True, right_index=True)

<div class="alert alert-block alert-info">
    <p>List of datset (combinated) to work on them</p>
</div>

In [None]:
datasets = [
    bow_stemmed, bow_lemma,
    tfidf_stemmed, tfidf_lemma,
    bow_stemmed_price, bow_lemma_price,
    tfidf_stemmed_price, tfidf_lemma_price,
    sift, orb, 
    sift_weighed, orb_weighed,
    sift_price, orb_price,
    sift_weighed_price, orb_weighed_price,
    sift_bow_stemmed, orb_bow_stemmed,
    sift_weighed_bow_stemmed, orb_weighed_bow_stemmed,
    sift_bow_lemma, orb_bow_lemma,
    sift_weighed_bow_lemma, orb_weighed_bow_lemma,
    sift_tfidf_stemmed, orb_tfidf_stemmed,
    sift_weighed_tfidf_stemmed, orb_weighed_tfidf_stemmed,
    sift_tfidf_lemma, orb_tfidf_lemma,
    sift_weighed_tfidf_lemma, orb_weighed_tfidf_lemma,
    sift_price_bow_stemmed, orb_price_bow_stemmed,
    sift_weighed_price_bow_stemmed, orb_weighed_price_bow_stemmed, 
    sift_price_bow_lemma, orb_price_bow_lemma,
    sift_weighed_price_bow_lemma, orb_weighed_price_bow_lemma,
    sift_price_tfidf_stemmed, orb_price_tfidf_stemmed,
    sift_weighed_price_tfidf_stemmed, orb_weighed_price_tfidf_stemmed,
    sift_price_tfidf_lemma, orb_price_tfidf_lemma,
    sift_weighed_price_tfidf_lemma, orb_weighed_price_tfidf_lemma
]

print("Number of combination:", len(datasets))

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>At this point, we have 40 combinations</p>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">7.2. Data preprocessing</h3>
</div>

<div class="alert alert-block alert-info">
    <p>All features are numerican in diferents ranges.<br>At the same time, retail_price doesn't have a normal distribution. So, we are going to use QuantileTransformer to treat the features</p>
</div>

In [None]:
for i in range(len(datasets)):
    
    # Getting columns and index of dataset
    df_columns = datasets[i].columns
    df_index = datasets[i].index
        
    # Getting the dataset name
    name = [x for x in globals() if globals()[x] is datasets[i]][0]
    
    # All features are numerical
    # numerical_pipeline = make_pipeline(StandardScaler())
    numerical_pipeline = make_pipeline(QuantileTransformer(random_state=42, output_distribution="uniform"))

    preprocessor = make_column_transformer(
                (numerical_pipeline, df_columns)
    )
    
    data_scaled = preprocessor.fit_transform(datasets[i])
    globals()[name] = pd.DataFrame(data_scaled, index=df_index, columns=df_columns)
    
    datasets[i] = globals()[name]


<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>Now, the dataset has been preprocesed</p>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">7.3. PCA dimension reduction</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Let's see the number of feature by dataset</p>
</div>

In [None]:
sift_dataset_names, orb_dataset_names, no_imgs_dataset_names = [{} for i in range(3)]

for i in range(len(datasets)):
    
    # Getting the dataset name
    name = [x for x in globals() if globals()[x] is datasets[i]][0]
    
    if "sift" in name :
        sift_dataset_names[name] = datasets[i].shape[1]
    elif "orb" in name :
        orb_dataset_names[name] = datasets[i].shape[1]
    else:
        no_imgs_dataset_names[name] = datasets[i].shape[1]
        

In [None]:
fig = plt.figure(figsize=(8, 7))
plot = sns.barplot(x=list(no_imgs_dataset_names.keys()), y=list(no_imgs_dataset_names.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".0f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(no_imgs_dataset_names.keys()), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("Number of feature", size=12)
plt.xlabel("Dataset", size=12)
plt.title("Number of features by dataset - without image features", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

In [None]:
fig = plt.figure(figsize=(10, 8))
plot = sns.barplot(x=list(sift_dataset_names.keys()), y=list(sift_dataset_names.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".0f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(sift_dataset_names.keys()), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("Number of feature", size=12)
plt.xlabel("Dataset", size=12)
plt.title("Number of features by dataset - SIFT", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

In [None]:
fig = plt.figure(figsize=(10, 8))
plot = sns.barplot(x=list(orb_dataset_names.keys()), y=list(orb_dataset_names.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".0f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(orb_dataset_names.keys()), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("Number of feature", size=12)
plt.xlabel("Dataset", size=12)
plt.title("Number of features by dataset - ORB", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-info">
    <p>Let's reduced the dimension through PCA</p>
    <p>We are going to keep only <b>80% of variance</b></p>
</div>

In [None]:
sift_dataset_pca, orb_dataset_pca, no_imgs_dataset_pca = [{} for i in range(3)]

datasets_pca = []

for i in range(len(datasets)):
    
    # Getting the dataset name
    name = [x for x in globals() if globals()[x] is datasets[i]][0]
    
    globals()["pca_" + name] = PCA(n_components=0.80)
    globals()[name + "_pca"] = globals()["pca_" + key].fit_transform(datasets[i])
    
    if "sift" in name :
        sift_dataset_pca[name] = globals()[name + "_pca"].shape[1]
    elif "orb" in name :
        orb_dataset_pca[name] = globals()[name + "_pca"].shape[1]
    else:
        no_imgs_dataset_pca[name] = globals()[name + "_pca"].shape[1]
        
    datasets_pca.append(globals()[name + "_pca"])

In [None]:
fig = plt.figure(figsize=(8, 7))
plot = sns.barplot(x=list(no_imgs_dataset_pca.keys()), y=list(no_imgs_dataset_pca.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".0f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(no_imgs_dataset_pca.keys()), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("Number of feature", size=12)
plt.xlabel("Dataset", size=12)
plt.title("# of features by dataset - No image / After PCA", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

In [None]:
fig = plt.figure(figsize=(10, 8))
plot = sns.barplot(x=list(sift_dataset_pca.keys()), y=list(sift_dataset_pca.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".0f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(sift_dataset_pca.keys()), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("Number of feature", size=12)
plt.xlabel("Dataset", size=12)
plt.title("Number of features by dataset - SIFT / after PCA", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

In [None]:
fig = plt.figure(figsize=(10, 8))
plot = sns.barplot(x=list(orb_dataset_pca.keys()), y=list(orb_dataset_pca.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".0f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(orb_dataset_pca.keys()), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("Number of feature", size=12)
plt.xlabel("Dataset", size=12)
plt.title("Number of features by dataset - ORB / after PCA", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>We can see how the number is feature have been reduced more than a half</p>
</div>

In [None]:
# deleting some dataset and list
for df in datasets:
    
    name = [x for x in globals() if globals()[x] is df][0]
    del globals()[name]
    
    gc.collect()
    
datasets.clear()

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">7.4. T-SNE dimension reduction</h3>
</div>

<div class="alert alert-block alert-info">
    <p>First, we are going to <b>Encode</b> through LabelEncoder the first level of the tree categories </p>
</div>

In [None]:
le = LabelEncoder()
df_data["category_encode"] =df_data[["category_1"]].apply(le.fit_transform)
df_data[["category_1", "category_encode"]].head()

<div class="alert alert-block alert-info">
    <p>Let's reduced the dimension through T-SNE</p>
</div>

In [None]:
datasets_tsne = []

for i in range(len(datasets_pca)):
    
    # Getting the dataset name
    name = [x for x in globals() if globals()[x] is datasets_pca[i]][0]
    
    globals()["tsne_" + name] = TSNE(n_components=3, perplexity=30, 
                                     n_iter=2000, init="random",
                                     random_state=6, learning_rate="auto")
    X_tsne = globals()["tsne_" + name].fit_transform(datasets_pca[i])
    
    globals()[name + "_tsne"] = pd.DataFrame(X_tsne[:, 0:3], columns=["tsne1", "tsne2", "tsne3"])
    globals()[name + "_tsne"]["class_encode"] = df_data["category_encode"]
    globals()[name + "_tsne"]["class"] = df_data["category_1"]
    
    datasets_tsne.append(globals()[name + "_tsne"])

<div class="alert alert-block alert-info">
    <p>Let's look at some datasets afters tsne</p>
</div>

In [None]:
tfidf_stemmed_pca_tsne.head()

In [None]:
orb_weighed_price_tfidf_stemmed_pca_tsne.head()

In [None]:
# deleting some dataset and list
for df in datasets_pca:
    
    name = [x for x in globals() if globals()[x] is df][0]
    del globals()[name]
    
    gc.collect()
    
datasets_pca.clear()

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">7.5. Clusterization</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">7.5.1. KMeans</h4>
</div>

<div class="alert alert-block alert-info">
    <p>The number of cluster based on the first level of the tree categories</p>
</div>

In [None]:
n_clusters = df_data["category_1"].nunique()

<div class="alert alert-block alert-info">
    <p>Let's do the clusterization</p>
</div>

In [None]:
sift_ari, orb_ari, no_img_ari = [{} for i in range(3)]

datasets_cluster = []

for i in range(len(datasets_tsne)):
    
    df_subset = datasets_tsne[i].drop(columns=["class_encode", "class"])
    
    # Getting the dataset name
    name = [x for x in globals() if globals()[x] is datasets_tsne[i]][0]
    name = re.sub("\_pca_tsne$", "", name)
    
    kmeans = KMeans(init="k-means++", n_clusters=n_clusters,
                    max_iter=1000, random_state=10) 
    
    cluster_labels = kmeans.fit_predict(df_subset)
    datasets_tsne[i]["cluster"] = cluster_labels
    
    globals()[name] = datasets_tsne[i].copy()
    
    # Calculating ARI based on the first level of the tree categories
    ari = adjusted_rand_score(datasets_tsne[i]["class_encode"], datasets_tsne[i]["cluster"])
    
    # Saving the results based on the features
    if "sift" in name :
        sift_ari[name] = ari
    elif "orb" in name :
        orb_ari[name] = ari
    else:
        no_img_ari[name] = ari
        
    datasets_cluster.append(globals()[name])
    
# Ordering the results
sift_ari = dict(sorted(sift_ari.items(), key=operator.itemgetter(1), reverse=True))
orb_ari = dict(sorted(orb_ari.items(), key=operator.itemgetter(1), reverse=True))
no_img_ari = dict(sorted(no_img_ari.items(), key=operator.itemgetter(1), reverse=True))

<div class="alert alert-block alert-info">
    <p>Let's look at some datasets afters clusterization</p>
</div>

In [None]:
tfidf_stemmed.head()

In [None]:
orb_weighed_price_tfidf_stemmed.head()

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">7.5.2. Adjusted Rand Score</h4>
</div>

In [None]:
fig = plt.figure(figsize=(8, 7))
plot = sns.barplot(x=list(no_img_ari.keys()), y=list(no_img_ari.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".3f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(no_img_ari.keys()), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("Number of feature", size=12)
plt.xlabel("Dataset", size=12)
plt.title("Adjusted Rand Score", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>The datasets based on <b>TF-IDF</b> have gotten the best results compared to <b>BoW</b></p>
</div>

In [None]:
fig = plt.figure(figsize=(10, 8))
plot = sns.barplot(x=list(sift_ari.keys()), y=list(sift_ari.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".3f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(sift_ari.keys()), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("Adjusted Rand Score", size=12)
plt.xlabel("Dataset", size=12)
plt.title("ARI - SIFT", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>The best results are distributed in different methodologies used</p>
</div>

In [None]:
fig = plt.figure(figsize=(10, 8))
plot = sns.barplot(x=list(orb_ari.keys()), y=list(orb_ari.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".3f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(orb_ari.keys()), rotation=70, size=12, horizontalalignment="right")
plt.ylabel("Adjusted Rand Score", size=12)
plt.xlabel("Dataset", size=12)
plt.title("ARI - ORB", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>The best results are distributed in different methodologies used</p>
</div>

<div class="alert alert-block alert-info">
    <p>We are going to select the two best results from each group</p>
</div>

In [None]:
final_dfs = {}

for dictionary in [no_img_ari, sift_ari, orb_ari]:
    
    for item in islice(dictionary.items(), 2):
        final_dfs[item[0]] = item[1]
        
final_dfs = dict(sorted(final_dfs.items(), key=operator.itemgetter(1), reverse=True))
final_dfs

In [None]:
fig = plt.figure(figsize=(8, 6))
plot = sns.barplot(x=list(final_dfs.keys()), y=list(final_dfs.values()))
for p in plot.patches:
    plot.annotate(format(p.get_height(), ".3f"), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha="center", va="center", xytext=(0, 9), textcoords="offset points")
plot.set_xticklabels(labels=list(final_dfs.keys()), rotation=50, size=12, horizontalalignment="right")
plt.ylabel("Adjusted Rand Score", size=12)
plt.xlabel("Dataset", size=12)
plt.title("Best ARI", size=16, fontweight="bold")
plt.tight_layout()
plt.show()

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>The best results are based on features without images features</p>
    <p>We can suppose that the images had added noise into the datasets to analyze what makes that variance is higher distributed</p>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">7.5.2. Benchmark KMeans</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to do a benchmark among the best results</p>
</div>

In [None]:
results = []
for key, value in final_dfs.items():
    
    df_subset = globals()[key].drop(columns=["class", "class_encode", "cluster"])
    
    kmeans = KMeans(init="k-means++", n_clusters=n_clusters,
                max_iter=1000, random_state=10) 
    
    results.append(benchmark_kmeans(key, kmeans, n_clusters, df_subset))  

header = [
    "dataset", "clusters", "time", "inertia", 
    "calinski-harabasz", "davies-bouldin", "silhouette"
]

print(tabulate(results, header))


<div class="alert alert-block alert-danger">
    <p><b>Observations / Conclusions</b></p>
    <p>The best results are based on features without images features</p>
    <p>We can suppose that the images had added noise into the datasets to analyze what makes that variance is higher distributed</p>
    <ul style="list-style-type: square;">
        <li><b>inertia</b>: 57772.9 (orb_tfidf_lemma)</li>
        <li><b>calinski-harabasz</b>: 1651.08 (tfidf_lemma_price)</li>
        <li><b>davies-bouldin</b>: 0.65823 (tfidf_lemma_price)</li>
        <li><b>silhouette</b>: 0.521855 (tfidf_lemma_price)</li>
    </ul>  
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">7.5.4. Visual analysis</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Let's plot the data based on T-SNE</p>
</div>

In [None]:
for key, value in final_dfs.items():
    
    fig, axes = plt.subplots(1, 2, sharex=True, figsize=(16, 8))
    fig.suptitle(key.upper() + " - ARI score: " + str(round(value, 2)),
                 fontsize=18, fontweight="bold")    

    sns.scatterplot(ax=axes[0], x="tsne1", y="tsne2", hue="class", 
                    data=globals()[key], legend="brief",
                    palette=sns.color_palette("tab10", n_colors=7),
                    s=50, alpha=0.6)
    axes[0].legend(loc="best", prop={"size": 12},
              title="Categories")
    axes[0].set_title("True categories", fontsize=14)

    sns.scatterplot(ax=axes[1], x="tsne1", y="tsne2", hue="cluster", 
                    data=globals()[key], legend="brief",
                    palette=sns.color_palette("tab10", n_colors=7),
                    s=50, alpha=0.6)
    axes[1].legend(loc="best", prop={"size": 12},
              title="Clusters")
    axes[1].set_title("Clusters", fontsize=14)

    plt.tight_layout()
    plt.show()


<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- FLAG POSITION ---------</h1>
</div>

In [None]:
raise SystemExit("Stop right there!")

<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- FLAG POSITION ---------</h1>
</div>

In [None]:
# Set up a logs directory, so Tensorboard knows where to look for files.
log_dir='logs/test/'


if not os.path.exists(log_dir):
    os.makedirs(log_dir)

In [None]:
with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
    for subwords in encoder.subwords:
        f.write("{}\n".format(subwords))

<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- FLAG POSITION ---------</h1>
</div>

In [None]:
raise SystemExit("Stop right there!")

In [None]:
# Path
path = "/home/"
 
# Join various path components
print(os.path.join(path, "User/Desktop/", "file.txt"))

<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- FLAG POSITION ---------</h1>
</div>

In [28]:
LOG_DIR = "logs/" # Path to save the embedding and checkpoints generated

In [29]:
tsne = pd.read_csv(r"temp_datasets\sift_weighed_price_tfidf_lemma.csv")

In [30]:
features = tsne[["tsne1", "tsne2", "tsne3"]].copy()

In [31]:
# np.savetxt(LOG_DIR + "features.txt", features)

In [32]:
features.to_csv(LOG_DIR + "features.txt", sep='\t', index=False, header=False)

In [33]:
md = tsne[["cluster"]].copy()
md.to_csv(LOG_DIR + "metadata.tsv", sep='\t', index=False, header=False)
metadata = os.path.join(LOG_DIR, 'metadata.tsv')

In [34]:
features_vector = np.loadtxt(LOG_DIR + "features.txt")
features_vector

array([[ 56.119587 ,  58.436615 ,   3.8751395],
       [ 39.798027 ,  66.64062  ,  22.980347 ],
       [ 25.765162 ,  55.383446 ,  30.379305 ],
       ...,
       [-42.181313 ,  29.136429 ,  48.90622  ],
       [ 50.663353 , -34.374763 , -49.803753 ],
       [-80.76941  ,   9.1211195,  35.4123   ]])

In [35]:
weights = tf.Variable(features_vector)
weights

<tf.Variable 'Variable:0' shape=(1050, 3) dtype=float64, numpy=
array([[ 56.119587 ,  58.436615 ,   3.8751395],
       [ 39.798027 ,  66.64062  ,  22.980347 ],
       [ 25.765162 ,  55.383446 ,  30.379305 ],
       ...,
       [-42.181313 ,  29.136429 ,  48.90622  ],
       [ 50.663353 , -34.374763 , -49.803753 ],
       [-80.76941  ,   9.1211195,  35.4123   ]])>

In [36]:
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(LOG_DIR, "embedding.ckpt"))

'logs/embedding.ckpt-1'

In [37]:
# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

In [38]:
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'

# embedding.tensor_name = "test"

In [39]:
# embedding.metadata_path = metadata

In [40]:
projector.visualize_embeddings(LOG_DIR, config)

In [None]:
# Now run tensorboard against on log data we just saved.
%tensorboard --logdir /logs/tf_files/

In [None]:
# creando los archivos a leer
metadata = tsne[["cluster"]].copy()
metadata.to_csv(LOG_DIR + "metadata.tsv", sep='\t', index=False, header=False)

                
features = tsne[["tsne1", "tsne2", "tsne3"]].copy()
features.to_csv(LOG_DIR + "features.txt", sep='\t', index=False, header=False)

In [None]:
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)

In [None]:
# tsne = pd.read_csv(r"temp_datasets\sift_weighed_price_tfidf_lemma.csv")
# tsne.drop("Unnamed: 0", axis=1, inplace=True)
# tsne.head(3)

In [None]:
features = os.path.join(LOG_DIR, "features.txt")
print(features)
metadata = os.path.join(LOG_DIR, "metadata.tsv")
print(metadata)

In [None]:
features_vector = np.loadtxt(features)

In [None]:
weights = tf.Variable(features_vector)
weights

In [None]:
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(LOG_DIR, "embedding.ckpt"))

In [None]:
# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

In [None]:
embedding.tensor_name = "test"


In [None]:
# embedding.metadata_path = "metadata.tsv"
embedding.metadata_path = metadata


In [None]:
projector.visualize_embeddings(LOG_DIR, config)

In [None]:
# Now run tensorboard against on log data we just saved.
%tensorboard --logdir /logs/tf_files/

<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- IMDB EXAMPLE ---------</h1>
</div>

In [None]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

%load_ext tensorboard

In [None]:
import os
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorboard.plugins import projector

<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- IMDB EXAMPLE ---------</h1>
</div>

<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- FLAG POSITION ---------</h1>
</div>

In [None]:
raise SystemExit("Stop right there!")

<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- FLAG POSITION ---------</h1>
</div>

In [None]:
features = tsne[["tsne1", "tsne2", "tsne3"]].copy()
features.to_csv(LOG_DIR + "features.txt", sep='\t', index=False, header=False)

In [None]:
globals()[key].head()

In [None]:
for key, value in final_dfs.items():
    globals()[key].to_csv("temp_datasets\\" + key + ".csv" )

In [None]:
orb_tfidf_lemma[["tsne1", "tsne2"]].to_csv("test.tsv", sep='\t')

In [None]:
orb_tfidf_lemma

In [None]:
Dataframe.to_csv('/path/to/filename', sep='\t')

In [None]:
import IPython
url = 'https://projector.tensorflow.org/'

IPython.display.IFrame(url, width=1333, height=900)

<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- FLAG POSITION ---------</h1>
</div>

In [None]:
raise SystemExit("Stop right there!")

<div style="background-color: #FF5733;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">--------- FLAG POSITION ---------</h1>
</div>

In [None]:
import tensorflow as tf
%load_ext tensorboard

In [None]:
%load_ext tensorboard

In [None]:
def get_image_with_tensorflow(path, image):
    
    # reading the image
    image = tf.io.read_file(path + image)
    
    # convert the compressed string to a 3D uint8 tensor
    image = tf.image.decode_jpeg(image, channels=3)
    
    # resize the image to the desired size for your model
    image = tf.image.resize_with_pad(image, 100, 100)
    
    return image

In [None]:
tf.__version__

In [None]:
sam_image = get_image_with_tensorflow(THUMBNAILS_IMAGES_PATH, "0a3b5fdf77a361c2d7d9b29c259b8c4e.jpg")

In [None]:
# Generate embeddings
images_pil = []
images_embeddings = []
labels = []
for ind in tsne.index:
    
