# FashionGen Dataset - Recommender Systems

The objective of this project is to build an apparel recommender system that suggests similar fashion items to users using Natural Language Processing. The dataset is from the FashionGen Challenge which was arranged through the collaboration between SSENSE and ElementAI. 

In [None]:
# import necessary libraries
import h5py
import numpy as np
import pandas as pd

BATCH_SIZE = 500


def get_batch(file_h5, features, batch_number, batch_size=32):
    """Slice one mini-batch out of every requested feature.

    Args:
        file_h5: object indexable by feature name whose entries support
            slicing (the notebook passes an open ``h5py.File``).
        features(list(str)): names of the features to read.
        batch_number(int): zero-based id of the batch to return.
        batch_size(int): number of rows per mini-batch.
    Returns:
        A list with one slice per requested feature, in the order given by
        ``features``. Each slice covers rows
        ``[batch_number * batch_size, (batch_number + 1) * batch_size)``.
    """
    start = batch_number * batch_size
    stop = (batch_number + 1) * batch_size
    return [file_h5[name][start:stop] for name in features]


In [None]:
import h5py
f = h5py.File('fashiongen_256_256_train.h5', 'r')

In [None]:
#reading h5py file using pandas 
#a = pd.read_hdf(f)

In [None]:
a = f.get('index')
a

In [None]:
b = np.array(a)
b.shape

In [None]:
type(a)

In [None]:
a[:5]

In [None]:
# check number of dimensions for each array x.ndim
a.ndim

In [None]:
# Output the column names of the dataset 
list(f.keys())

In [None]:
# data type of columns
print("name, data type, number of array dimensions")
for row in list(f.keys()):
    print(str(row) + ": " + str(f[row].dtype) + ": " + str(f[row].ndim))


The S100, S200, S800 and S40 datatypes need to be converted to strings. Right away, we can also see that every dataset except index_2 has an extra array dimension that needs to be reduced by 1.

### Output numpy arrays to images -  Test to see if it works

In [None]:
# Read a single image to check that it can be rendered: batch number 1 with a
# batch size of 1, i.e. rows [1:2] of 'input_image'.
a = 1
b=1
input2 = ['input_image']
list_of_arrays = get_batch(f, input2, a, b)
# NOTE: `a` is rebound here from the batch number to the image data; the next
# cell reads the image from `a`.
a = np.array(list_of_arrays)
#a

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

# outputting picture using matplotlib 
#plt.imshow(a.squeeze())
#plt.show()

import image 
from PIL import Image
c = a.squeeze()
photo = Image.fromarray(c)
photo
# yay it works! 

# Creating dataframe  - excluding input_image - size too large

In [None]:
# Open the training set read-only. The handle is intentionally left open here:
# the next cell reads another batch from it and closes it afterwards.
file_h5 = h5py.File('fashiongen_256_256_train.h5', mode='r')

# Every feature except 'input_image', which is too large to include here.
list_of_features = [
    'index', 'index_2', 'input_brand', 'input_category', 'input_composition',
    'input_concat_description', 'input_department', 'input_description',
    'input_gender', 'input_msrpUSD', 'input_name', 'input_pose',
    'input_productID', 'input_season', 'input_subcategory',
]

dataset_len = len(file_h5['input_image'])
# Number of complete batches; a trailing partial batch is not counted.
nb_batches = dataset_len // BATCH_SIZE

# Pick one batch id at random from [0, nb_batches).
batch_nb = np.random.randint(0, nb_batches)

# Read the randomly selected batch of the data.
test = get_batch(file_h5, list_of_features, batch_nb, BATCH_SIZE)


## Create dataframe that includes all the features 

In [None]:
list_of_all_features = ['index', 'index_2', 'input_brand', 'input_category', 'input_composition', 'input_concat_description', 'input_department','input_description',
'input_gender', 'input_image','input_msrpUSD', 'input_name', 'input_pose', 'input_productID', 'input_season', 'input_subcategory'] 
batch_nb1 = 1
#Store h5py file in image 
image = get_batch(file_h5, list_of_all_features, batch_nb1 , BATCH_SIZE)
file_h5.close()

In [None]:
# The default codec for byte strings is ASCII, which cannot represent the
# accented characters in this dataset, so the text features below are decoded
# explicitly as ISO-8859-1. np.char.decode is used because the values are
# numpy arrays, which have no .decode() method of their own.
# Positions refer to list_of_all_features:
#   2 = input_brand, 5 = input_concat_description,
#   7 = input_description, 11 = input_name
for feature_pos in (2, 5, 7, 11):
    image[feature_pos] = np.char.decode(image[feature_pos], "iso-8859-1")

In [None]:
image[2].dtype
dt = np.dtype('<U21')

In [None]:
dt.name

In [None]:
image[5].dtype
dt = np.dtype('<U800')

In [None]:
dt.name

In [None]:
#cp1252 and iso-8859-1 work - https://docs.python.org/2/library/codecs.html#standard-encodings
# utf-8 doesnt work when you use it to decode. https://stackoverflow.com/questions/40388792/how-to-decode-a-numpy-array-of-encoded-literals-strings-in-python3-attributeerr
# why we have to decode instead of encode - https://stackoverflow.com/questions/28947607/ascii-codec-cant-decode-byte-0xe9

In [None]:
# image[5].decode("iso-8859-1") - AttributeError: 'numpy.ndarray' object has no attribute 'decode'
# have to do np.char.decode since it is a numpy array

In [None]:
#for i in range(10):
#    image[5][i] = np.char.decode(image[5][i],"iso-8859-1")
#    print(image[5][i])


## Data Cleaning 

Clean the data by reducing the array dimensionality to 1 and changing the S datatypes to strings. Since we are dealing with French and other non-English characters, convert the bytes data type to a string using a different encoding than the default UTF-8. Instead, use ISO-8859-1 to convert the bytes to a string. 

Decoding fails for concat description and description. UTF-8 (which uses 8-bit code units) is backward-compatible with ASCII, but the dataset contains non-ASCII characters, so a different encoding is needed. 
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe9 in position 105: ordinal not in range(128). 
UnicodeDecodeError: 'ascii' codec can't decode byte generally happens when you try to convert a Python 2.x str that contains non-ASCII to a Unicode string without specifying the encoding of the original string.
https://stackoverflow.com/questions/21129020/how-to-fix-unicodedecodeerror-ascii-codec-cant-decode-byte

UTF-8 is one of the most commonly used encodings, and Python often defaults to using it. UTF stands for “Unicode Transformation Format”, and the ‘8’ means that 8-bit values are used in the encoding. (There are also UTF-16 and UTF-32 encodings, but they are less frequently used than UTF-8.) 

In [None]:
# Convert every byte-string (numpy 'S' dtype) feature to a unicode string array.
#
# `.astype(str)` decodes bytes as ASCII and raises UnicodeDecodeError as soon
# as a value contains a non-ASCII byte (e.g. 0xe9), so byte arrays are decoded
# explicitly as ISO-8859-1 instead. Arrays that are already unicode (for
# example because an earlier cell decoded them) only go through
# `.astype(str)`, exactly as before.
#
# Positions refer to list_of_all_features:
#   2 input_brand, 3 input_category, 4 input_composition,
#   5 input_concat_description, 6 input_department, 7 input_description,
#   8 input_gender, 11 input_name, 12 input_pose, 14 input_season,
#   15 input_subcategory
for feature_pos in (2, 3, 4, 5, 6, 7, 8, 11, 12, 14, 15):
    if image[feature_pos].dtype.kind == 'S':
        image[feature_pos] = np.char.decode(image[feature_pos], 'iso-8859-1')
    else:
        image[feature_pos] = image[feature_pos].astype(str)

In [None]:
# Collapse each feature from a 2-D array to a 1-D array.
# Positions refer to list_of_all_features; 1 (index_2) and 9 (input_image)
# are not flattened here.
#   0 index, 2 input_brand, 3 input_category, 4 input_composition,
#   5 input_concat_description, 6 input_department, 7 input_description,
#   8 input_gender, 10 input_msrpUSD, 11 input_name, 12 input_pose,
#   13 input_productID, 14 input_season, 15 input_subcategory
for feature_pos in (0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15):
    image[feature_pos] = image[feature_pos].flatten()

In [None]:
# Other altenatives to convert numpy array to a dataframe
#df = pd.DataFrame(np.concatenate([arr1, arr2, arr3], axis=1), columns= ['a','b','c'])

#column_series = pd.Series(image[2])
#df = df.assign(column_name=column_series)

In [None]:
# Start from an empty frame and attach one column per feature.
df_image = pd.DataFrame()

# Position 9 ('input_image') is skipped here and added in a separate step:
# assigning it together with the other columns failed with
# "AssertionError: Shape of new values must be compatible with manager shape".
# Columns are therefore added for positions 0-8 and then 10-15.
for i in [*range(9), *range(10, 16)]:
    df_list = image[i]
    # the feature name becomes the column name
    df_image[list_of_all_features[i]] = df_list
df_image.head()
#read_hdf

In [None]:
# add input_image to dataframe 
df_list = image[9].tolist()
df_image['input_image'] = df_list
df_image.head()

In [None]:
df_image.shape

# Get rid of duplicates - keep first image of each product 

In [None]:
df_image = df_image.drop_duplicates(subset ="input_productID", 
                     keep = 'first', inplace = False) 

In [None]:
df_image.shape

As we can see, some of the string values above are all in capital letters. We can reformat these columns to a standard format. 

In [None]:
# Normalise the all-caps columns: lower-case every value, then capitalise the
# first letter of each word.
for column in ('input_category', 'input_department', 'input_subcategory'):
    df_image[column] = df_image[column].str.lower().str.title()

In [None]:
df_image.head()

In [None]:
df_image.dtypes

In [None]:
df_image.info()

In [None]:
# check for null values
df_image.isnull().sum()

In [None]:
#Check if values in index column are unique 
df_image['index'].is_unique

In [None]:
#Check if values in index_2 column are unique 
df_image['index_2'].is_unique

In [None]:
#Check if values in input_productID column are unique 
df_image['input_productID'].is_unique  

Since there are multiple images for each product stored in the dataset there is not a unique product id for each row. 

## Numpy Array to Image Function 

In [None]:
import scipy.misc
# array is currently stored as an object list in the dataframe
def show_image(list):
    """Build a PIL image from pixel data.

    Args:
        list: pixel data accepted by ``np.array`` (the dataframe stores each
            image as a nested list).
    Returns:
        The image created by ``Image.fromarray``.
    """
    pixels = np.array(list)
    # PIL needs unsigned bytes, so cast to uint8 before building the image.
    pixels = pixels.astype('uint8')
    return Image.fromarray(pixels)

In [None]:
for index, row in df_image[:20].iterrows():
    display(show_image(row['input_image']))

In [None]:
#from IPython.display import display
#for i in range(0,5):
#    display(show_image(df_image['input_image'][i]))


# For Input Name

# Text Pre-processing

In [None]:
from nltk.corpus import stopwords

In [None]:
# we use the list of stop words that are downloaded from nltk lib.
stop_words = set(stopwords.words('english'))
print ('list of stop words:', stop_words)

In [None]:
def nlp_preprocessing(total_text, index, column):
    """Clean one text value and store the result back into ``df_image``.

    Every whitespace-separated token is stripped of non-alphanumeric
    characters and lower-cased; tokens found in the module-level
    ``stop_words`` set are dropped. The surviving tokens are written to the
    cell ``df_image.loc[index, column]``, each followed by a single space
    (so the stored string keeps a trailing space, as before).

    Args:
        total_text: the raw text. ``int`` values are skipped untouched.
        index: row label in ``df_image`` of the cell to overwrite.
        column: column name in ``df_image`` of the cell to overwrite.
    Returns:
        None. ``df_image`` is modified in place.
    """
    if isinstance(total_text, int):
        return

    kept = []
    for raw_word in total_text.split():
        # drop special characters such as '"#$@!%^&*()_+-~?>< and lower-case
        word = "".join(ch for ch in raw_word if ch.isalnum()).lower()
        # stop-word removal
        if word not in stop_words:
            kept.append(word + " ")

    # Write with a single .loc call: chained indexing
    # (df_image[column][index] = ...) may assign to a temporary copy and
    # leave df_image unchanged.
    df_image.loc[index, column] = "".join(kept)

In [None]:
import time
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed time.
start_time = time.perf_counter()
# Pre-process the title (input_name) of every row.
for index, row in df_image.iterrows():
    nlp_preprocessing(row['input_name'], index, 'input_name')
# Report how long pre-processing all the titles took.
print(time.perf_counter() - start_time, "seconds")

# Text based product similarity 

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

In [None]:
# Utility Functions which we will use through the rest of the workshop.
def show_image(list,ax,fig):
    """Draw pixel data with ``plt.imshow``.

    Args:
        list: pixel data accepted by ``np.array`` (the dataframe stores each
            image as a nested list).
        ax: accepted for call compatibility; not used by this function.
        fig: accepted for call compatibility; not used by this function.
    """
    pixels = np.array(list)
    # imshow needs unsigned bytes here, so cast the data to uint8.
    pixels = pixels.astype('uint8')
    plt.imshow(pixels)
#plt.show()
#Display an image
def display_img(list,ax,fig):
    """Download an image and draw it with ``plt.imshow``.

    Args:
        list: URL of the image to download. (The parameter name is kept
            unchanged so that existing calls keep working.)
        ax: accepted for call compatibility; not used by this function.
        fig: accepted for call compatibility; not used by this function.
    """
    # The URL is the first argument; no separate `url` name exists in scope.
    response = requests.get(list)
    img = Image.open(BytesIO(response.content))
    # display the downloaded image in the notebook
    plt.imshow(img)
  
#plotting code to understand the algorithm's decision.
def plot_heatmap(keys, values, labels, list_image, text):
    """Show a one-row heat map of title words next to the apparel image.

    Args:
        keys: words of the recommended title; used as x-axis tick labels.
        values: one number per key; the heat-map cell values.
        labels: one annotation per key, printed inside the cells. What they
            mean depends on the model (counts for bag of words, tf-idf or
            idf weights otherwise).
        list_image: pixel data of the apparel image, passed to ``show_image``.
        text: title placed above the heat map.
    """
    # 2x2 grid with a wide left column; only cells 0 and 1 are drawn into.
    layout = gridspec.GridSpec(2, 2, width_ratios=[4,1], height_ratios=[4,1])
    figure = plt.figure(figsize=(25,3))

    # Cell 0: heat map of the word values, annotated with the labels.
    # sns.heatmap is called without an explicit axes, right after selecting
    # this subplot.
    plt.subplot(layout[0])
    heat_ax = sns.heatmap(np.array([values]), annot=np.array([labels]))
    heat_ax.set_xticklabels(keys)
    heat_ax.set_title(text)

    # Cell 1: the apparel image, without grid lines or axis ticks.
    image_ax = plt.subplot(layout[1])
    image_ax.grid(False)
    image_ax.set_xticks([])
    image_ax.set_yticks([])
    show_image(list_image, image_ax, figure)

    # Display the combined figure (heat map and image together).
    plt.show()
    
def plot_heatmap_image(doc_id, vec1, vec2, list_image, text, model):
    """Prepare heat-map data for one recommendation and plot it.

    Only words shared by both titles keep their count; every other word of
    the recommended title is shown with the value 0.

    Args:
        doc_id: row position used to look up tf-idf / idf weights in the
            module-level feature matrices.
        vec1: word counts of the input title, ``{word: count}``.
        vec2: word counts of the recommended title, ``{word: count}``.
            It is not modified.
        list_image: pixel data of the recommended apparel image.
        text: title of the recommended apparel (used as the plot title).
        model: one of ``'bag_of_words'``, ``'tfidf'`` or ``'idf'``.
    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    # Fail early with a clear message instead of hitting an unbound `labels`.
    if model not in ('bag_of_words', 'tfidf', 'idf'):
        raise ValueError(
            "model must be 'bag_of_words', 'tfidf' or 'idf', got %r" % (model,)
        )

    # keys: every word of the recommended title, in dict order.
    keys = list(vec2)
    # values: the word's count if it also occurs in the input title, else 0.
    values = [vec2[word] if word in vec1 else 0 for word in keys]

    if model == 'bag_of_words':
        # for bag of words the annotations are the values themselves
        labels = values
    else:
        # Both weighted models annotate each word with its weight in row
        # doc_id of the feature matrix, or 0 when the word is not in the
        # vectorizer's vocabulary.
        if model == 'tfidf':
            vocabulary = tfidf_title_vectorizer.vocabulary_
            features = tfidf_title_features
        else:
            vocabulary = idf_title_vectorizer.vocabulary_
            features = idf_title_features
        labels = [
            features[doc_id, vocabulary[word]] if word in vocabulary else 0
            for word in keys
        ]

    plot_heatmap(keys, values, labels, list_image, text)


# this function gets a list of wrods along with the frequency of each 
# word given "text"
def text_to_vector(text):
    """Count how often each word occurs in ``text``.

    A word is a maximal run of ``\\w`` characters.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    return Counter(re.findall(r'\w+', text))



def get_result(doc_id, content_a, content_b, list_image, model):
    """Plot how the words of a recommended title overlap with the input title.

    Args:
        doc_id: forwarded unchanged to ``plot_heatmap_image``.
        content_a: title of the input apparel.
        content_b: title of the recommended apparel; also used as plot title.
        list_image: pixel data of the recommended apparel image.
        model: model name, forwarded unchanged to ``plot_heatmap_image``.
    """
    # word -> count for the input title and for the recommended title
    input_counts = text_to_vector(content_a)
    recommended_counts = text_to_vector(content_b)

    plot_heatmap_image(doc_id, input_counts, recommended_counts, list_image, content_b, model)

# Bag of Words

In [None]:
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout

plotly.offline.init_notebook_mode(connected=True)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
title_vectorizer = CountVectorizer()
title_features   = title_vectorizer.fit_transform(df_image['input_name'])
title_features.get_shape() # get number of rows and columns in feature matrix.
# title_features.shape = #data_points * #words_in_corpus
# CountVectorizer().fit_transform(corpus) returns
# a sparse matrix of dimensions #data_points * #words_in_corpus

# What is a sparse vector?

# title_features[doc_id, index_of_word_in_corpus] = number of times the word occured in that doc


In [None]:
def bag_of_words_model(doc_id, num_results):
    """Show the titles closest to one apparel under the bag-of-words model.

    Args:
        doc_id: row position of the query apparel in ``title_features``.
        num_results: how many of the nearest rows to show.
    """
    # Distance from the query row to every row. No metric is passed, so
    # pairwise_distances uses its default (Euclidean), not cosine.
    pairwise_dist = pairwise_distances(title_features,title_features[doc_id])
    flat_dist = pairwise_dist.flatten()

    # row positions of the smallest distances, and the distances themselves
    indices = np.argsort(flat_dist)[0:num_results]
    pdists = np.sort(flat_dist)[0:num_results]

    # dataframe labels that correspond to those row positions
    df_indices = list(df_image.index[indices])

    for position, label, distance in zip(indices, df_indices, pdists):
        # The title of the closest row (df_indices[0]) serves as the input
        # title for every comparison.
        get_result(position, df_image['input_name'].loc[df_indices[0]], df_image['input_name'].loc[label], df_image['input_image'].loc[label], 'bag_of_words')
        print('ProductID :',df_image['input_productID'].loc[label])
        print ('Brand:', df_image['input_brand'].loc[label])
        print ('Title:', df_image['input_name'].loc[label])
        print ('Euclidean similarity with the query image :', distance)
        print('='*60)

#call the bag-of-words model for a product to get similar products.
bag_of_words_model(0, 20) # change the index if you want to.
# In the output heat map each value represents the count value 
# of the label word, the color represents the intersection 
# with inputs title.

#try 12566
#try 931

#  TF-IDF based product similarity

In [None]:
# An integer min_df must be >= 1 in current scikit-learn releases (min_df=0 is
# rejected by parameter validation). min_df=1 keeps every term that occurs in
# at least one title, which is the same vocabulary min_df=0 produced.
tfidf_title_vectorizer = TfidfVectorizer(min_df=1)
tfidf_title_features = tfidf_title_vectorizer.fit_transform(df_image['input_name'])
# tfidf_title_features.shape = #data_points * #words_in_corpus
# TfidfVectorizer().fit_transform(corpus) returns a sparse matrix of dimensions #data_points * #words_in_corpus
# tfidf_title_features[doc_id, index_of_word_in_corpus] = tfidf value of the word in the given doc

In [None]:
def tfidf_model(doc_id, num_results):
    """Show the titles closest to one apparel under the TF-IDF model.

    Args:
        doc_id: row position of the query apparel in ``tfidf_title_features``.
        num_results: how many of the nearest rows to show.
    """
    # Distance from the query row to every row. No metric is passed, so
    # pairwise_distances uses its default (Euclidean), not cosine.
    pairwise_dist = pairwise_distances(tfidf_title_features,tfidf_title_features[doc_id])
    flat_dist = pairwise_dist.flatten()

    # row positions of the smallest distances, and the distances themselves
    indices = np.argsort(flat_dist)[0:num_results]
    pdists = np.sort(flat_dist)[0:num_results]

    # dataframe labels that correspond to those row positions
    df_indices = list(df_image.index[indices])

    for position, label, distance in zip(indices, df_indices, pdists):
        # The title of the closest row (df_indices[0]) serves as the input
        # title for every comparison.
        get_result(position, df_image['input_name'].loc[df_indices[0]], df_image['input_name'].loc[label], df_image['input_image'].loc[label], 'tfidf')
        print('ProductID :',df_image['input_productID'].loc[label])
        print ('Brand:', df_image['input_brand'].loc[label])
        print ('Title:', df_image['input_name'].loc[label])
        print ('Euclidean similarity with the query image :', distance)
        print('='*125)
tfidf_model(1, 10)
# in the output heat map each value represents the tfidf values of the label word, the color represents the intersection with inputs title

# Descriptive Analysis

In [None]:
df_image.shape

The dataset has 100,000 rows and 15 columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
sns.set(color_codes=True)

### input_brand

In [None]:
df_image['input_brand'].describe()
#plot distribution

In [None]:
df_image['input_brand'].value_counts()

### input_category

In [None]:
df_image['input_category'].describe()

In [None]:
df_image['input_category'].value_counts()

sns.distplot(df_image['input_category'].value_counts())

### input_composition

In [None]:
df_image['input_composition'].describe()

In [None]:
df_image['input_composition'].value_counts()

### input_concat_description

In [None]:
# Summary of the column; the dataframe in this notebook is df_image.
df_image['input_concat_description'].describe()

In [None]:
# Frequency of each distinct value; the dataframe in this notebook is df_image.
df_image['input_concat_description'].value_counts()

### input_department

In [None]:
# Summary of the column; the dataframe in this notebook is df_image.
df_image['input_department'].describe()

In [None]:
# Frequency of each distinct value; the dataframe in this notebook is df_image.
df_image['input_department'].value_counts()

### input_description

In [None]:
# Summary of the column; the dataframe in this notebook is df_image.
df_image['input_description'].describe()

In [None]:
# Frequency of each distinct value; the dataframe in this notebook is df_image.
df_image['input_description'].value_counts()

### input_gender

In [None]:
# Summary of the column; the dataframe in this notebook is df_image.
df_image['input_gender'].describe()

In [None]:
# Frequency of each distinct value; the dataframe in this notebook is df_image.
df_image['input_gender'].value_counts()

### input_msrpUSD

In [None]:
# Summary of the column; the dataframe in this notebook is df_image.
df_image['input_msrpUSD'].describe()

In [None]:
df_image['input_msrpUSD'].value_counts()
sns.distplot(df_image['input_msrpUSD'])

In [None]:
sns.distplot(df_image['input_msrpUSD'], kde=False, rug=True)

### input_name

In [None]:
df_image['input_name'].describe()

In [None]:
df_image['input_name'].value_counts()

### input_pose

In [None]:
# Summary of the column; the dataframe in this notebook is df_image.
df_image['input_pose'].describe()

In [None]:
# Frequency of each distinct value; the dataframe in this notebook is df_image.
df_image['input_pose'].value_counts()

### input_productID

In [None]:
# Summary of the column; the dataframe in this notebook is df_image.
df_image['input_productID'].describe()

In [None]:
# Frequency of each distinct value; the dataframe in this notebook is df_image.
df_image['input_productID'].value_counts()

### input_season

In [None]:
df_image['input_season'].describe()

In [None]:
df_image['input_season'].value_counts()

### input_subcategory

In [None]:
# Summary of the column; the dataframe in this notebook is df_image.
df_image['input_subcategory'].describe()

In [None]:
# Frequency of each distinct value; the dataframe in this notebook is df_image.
df_image['input_subcategory'].value_counts()

### Wordcloud for Input Descriptions 

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

def show_wordcloud(data, title = None):
    """Render a word cloud of ``data`` with matplotlib.

    Args:
        data: the text to visualise. A string is used as is. Any other
            iterable (e.g. a pandas Series of descriptions) has the string
            form of each item joined with spaces. A non-iterable value falls
            back to ``str(data)``.
        title: optional figure title.
    """
    # str() of a pandas Series is its truncated display form (index labels,
    # "...", and a "Name: ..., dtype: ..." footer), not the text it holds,
    # so the items are joined explicitly instead.
    if isinstance(data, str):
        text = data
    else:
        try:
            text = " ".join(str(item) for item in data)
        except TypeError:
            # not iterable: keep the previous behaviour
            text = str(data)

    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 200,
        max_font_size = 40,
        scale = 3,
        random_state = 42  # fixed seed so the layout is reproducible
    ).generate(text)

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.show()
    
# print wordcloud
show_wordcloud(df_image["input_description"])

# command prompt - python -m pip install wordcloud

### Wordcloud for Input Concat Descriptions 

In [None]:
show_wordcloud(df_image["input_concat_description"])

### Wordcloud for Input Name 

In [None]:
show_wordcloud(df_image["input_name"])