In [22]:
import pandas as pd
from tqdm.autonotebook import tqdm
tqdm.pandas()

import os
import re
import string

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

  
  from pandas import Panel


## Import Data

In [23]:
# Parse the CSV by hand. Only the first len(keys) - 1 commas on a line are
# treated as field separators, so any further commas stay inside the last
# field instead of producing an extra piece that zip() would silently drop.
rows = []
with open("data/styles.csv", 'r') as f:
    # Header names are kept verbatim; the last one still ends with '\n'.
    keys = next(f).split(',')
    max_splits = len(keys) - 1
    for line in f:
        rows.append(dict(zip(keys, line.split(',', max_splits))))

In [24]:
# Build the frame from the parsed rows; the last header kept its trailing
# newline, so that column is renamed to `title` and each value is trimmed.
df = pd.DataFrame(rows).rename(columns={"productDisplayName\n": "title"})
df["title"] = df["title"].map(lambda raw_title: raw_title.strip())

In [25]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,title
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012,Casual,Puma Men Grey T-shirt


In [26]:
# Work on the title column as a standalone Series
titles = df["title"]

## Preprocess Title

### Lowercase

In [27]:
# Lowercase every title through the vectorised string accessor
titles = titles.str.lower()

### Remove Numbers 

In [28]:
# Strip every run of digits from the titles. The pattern is a raw string so
# `\d` is not read as an (invalid) string escape, and regex=True is explicit
# because pandas >= 2.0 treats the pattern as a literal string by default.
titles = titles.str.replace(r'\d+', '', regex=True)

### Remove Punctuation Except '-'

In [29]:
# Replace each run of characters other than a-z, 0-9 and '-' with one space.
# regex=True is explicit because pandas >= 2.0 would otherwise look for the
# pattern text literally and leave the punctuation in place.
titles = titles.str.replace(r'[^a-z0-9-]+', ' ', regex=True)

### Tokenize

In [30]:
# Split each title on whitespace into a list of tokens
tokenized = titles.map(str.split)

In [31]:
tokenized.head(10)

0              [turtle, check, men, navy, blue, shirt]
1            [peter, england, men, party, blue, jeans]
2                        [titan, women, silver, watch]
3    [manchester, united, men, solid, black, track,...
4                           [puma, men, grey, t-shirt]
5           [inkfruit, mens, chain, reaction, t-shirt]
6               [fabindia, men, striped, green, shirt]
7                      [jealous, women, purple, shirt]
8                         [puma, men, pack, of, socks]
9                          [skagen, men, black, watch]
Name: title, dtype: object

### Remove '-' from Tokens That Are Only '-', Start with '-', or End with '-'

In [32]:
def remove_hyphen(tokens):
    """Drop hyphens from tokens that begin or end with '-'.

    A token whose first or last character is '-' (including a bare '-')
    has every '-' removed. All other tokens, including empty strings and
    inner-hyphen words such as 't-shirt', are passed through unchanged.

    Args:
        tokens: list of string tokens.

    Returns:
        A new list of the same length; the input list is not modified.
    """
    # startswith/endswith are safe on empty strings, unlike token[0] / token[-1].
    return [
        token.replace('-', '') if token.startswith('-') or token.endswith('-') else token
        for token in tokens
    ]

# Run the hyphen clean-up over every token list
tokenized = tokenized.map(remove_hyphen)

### Remove One-Letter and Empty Tokens

In [33]:
# Keep only tokens that are at least two characters long
tokenized = tokenized.map(lambda toks: [tok for tok in toks if len(tok) >= 2])

In [34]:
tokenized.head(10)

0              [turtle, check, men, navy, blue, shirt]
1            [peter, england, men, party, blue, jeans]
2                        [titan, women, silver, watch]
3    [manchester, united, men, solid, black, track,...
4                           [puma, men, grey, t-shirt]
5           [inkfruit, mens, chain, reaction, t-shirt]
6               [fabindia, men, striped, green, shirt]
7                      [jealous, women, purple, shirt]
8                         [puma, men, pack, of, socks]
9                          [skagen, men, black, watch]
Name: title, dtype: object

### Remove Stopwords

Danger! Try not to remove the stopwords.
Refer to this link: https://www.crowdcontent.com/resources/writer/university/seo-keywords/stop-words/ and this https://towardsdatascience.com/why-you-should-avoid-removing-stopwords-aa7a353d2a52

In [51]:
# Load the project stopwords from disk, one per line, trimming whitespace
with open('product_stopwords.txt') as f:
    stopwords_list = [raw_line.strip() for raw_line in f]

In [52]:
# Stopwords from NLTK
# Merge the file-based stopwords with NLTK's Indonesian and English lists into
# one set for fast membership tests. Building it with union() instead of
# `+=` followed by set() keeps the cell re-runnable: once stopwords_list is
# already a set, `set += list` raises TypeError.
stopwords_list = set(stopwords_list).union(
    stopwords.words('indonesian'),
    stopwords.words('english'),
)

In [53]:
# Discard every token that appears in the stopword collection
tokenized = tokenized.map(
    lambda toks: list(filter(lambda tok: tok not in stopwords_list, toks))
)

In [54]:
tokenized.head(10)

0                   [baju, tidur, pria, piyama, satin]
1                      [baju, tidur, piyama, pp, pria]
2                  [stelan, baju, tidur, pria, piyama]
3    [premium, quality, baju, tidur, pria, piyama, ...
4                   [baju, tidur, salur, piyama, pria]
5    [merah, guava, kaos, kerah, cowok, country, fi...
6         [baju, tidur, wanita, piyama, hotpan, motif]
7                      [longjohn, pria, abu, abu, tua]
8    [longjohn, pria, baju, musim, dingin, pria, ab...
9          [piyama, baju, tidur, dewasa, pria, lengan]
Name: title, dtype: object

### Check Tokens' Frequency

In [35]:
# Count how often each token occurs across all titles. update() adds one
# title's tokens in time proportional to that title's length, whereas
# `tokens_frequency += FreqDist(tokens)` builds a throwaway FreqDist and
# re-walks the accumulated counts for every title.
tokens_frequency = FreqDist()
for tokens in tokenized:
    tokens_frequency.update(tokens)

In [40]:
# Number of distinct tokens seen
len(tokens_frequency)

7243

In [41]:
# Tokens ordered from most to least frequent (counts discarded)
list_token = [pair[0] for pair in tokens_frequency.most_common()]

In [43]:
# Write the ordered token list to disk as a single-column CSV
token_frame = pd.DataFrame(list_token)
token_frame.to_csv("data/token_list", index=False)

### Save

In [36]:
# Store the token lists next to the original columns
df['tokenized'] = tokenized

In [37]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,title,tokenized
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011,Casual,Turtle Check Men Navy Blue Shirt,"[turtle, check, men, navy, blue, shirt]"
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012,Casual,Peter England Men Party Blue Jeans,"[peter, england, men, party, blue, jeans]"
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016,Casual,Titan Women Silver Watch,"[titan, women, silver, watch]"
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011,Casual,Manchester United Men Solid Black Track Pants,"[manchester, united, men, solid, black, track,..."
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012,Casual,Puma Men Grey T-shirt,"[puma, men, grey, t-shirt]"


In [38]:
# Write the frame, including the `tokenized` column, to CSV without the index
df.to_csv('data/styles_with_token.csv', index=False)

## Add Image Name to Data

In [39]:
import pandas as pd
from tqdm.autonotebook import tqdm
tqdm.pandas()

import os

  from pandas import Panel


### Import Data

In [40]:
# Reload the tokenised data from CSV; the `tokenized` column comes back as
# the string form of each list, not as a list object
df = pd.read_csv("data/styles_with_token.csv")

In [41]:
# Order the rows by ascending product id
df = df.sort_values(by='id')

In [42]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,title,tokenized
12904,1163,Men,Apparel,Topwear,Tshirts,Blue,Summer,2011.0,Sports,Nike Sahara Team India Fanwear Round Neck Jersey,"['nike', 'sahara', 'team', 'india', 'fanwear',..."
12627,1164,Men,Apparel,Topwear,Tshirts,Blue,Winter,2015.0,Sports,Nike Men Blue T20 Indian Cricket Jersey,"['nike', 'men', 'blue', 'indian', 'cricket', '..."
16357,1165,Men,Apparel,Topwear,Tshirts,Blue,Summer,2013.0,Sports,Nike Mean Team India Cricket Jersey,"['nike', 'mean', 'team', 'india', 'cricket', '..."
9208,1525,Unisex,Accessories,Bags,Backpacks,Navy Blue,Fall,2010.0,Casual,Puma Deck Navy Blue Backpack,"['puma', 'deck', 'navy', 'blue', 'backpack']"
33222,1526,Unisex,Accessories,Bags,Backpacks,Black,Fall,2010.0,Sports,Puma Big Cat Backpack Black,"['puma', 'big', 'cat', 'backpack', 'black']"


In [43]:
len(df)

44446

In [44]:
# List every entry in the image directory (bare names, no directory prefix)
image_names = os.listdir("data/images")

In [45]:
len(image_names)

44441

### Remove Data That Doesn't Have Image

In [46]:
# Map each image id (the file name up to its first '.') to the full file name
id_name_dict = {fname.split('.')[0]: fname for fname in image_names}

In [47]:
# Ids (as strings) for which an image file exists
ids_with_image = list(id_name_dict)

In [48]:
# Keep only rows whose id has a matching image. The image ids are strings
# taken from file names, so the id column is compared as strings as well;
# a numeric id column would otherwise match only through implicit,
# version-dependent casting inside isin().
df = df[df.id.astype(str).isin(ids_with_image)]

In [49]:
len(df)

44441

In [50]:
# Look up the image file name for every remaining row, in row order
file_names = [id_name_dict[str(row_id)] for row_id in df['id']]

### Save

In [51]:
# Attach the image file name to each row. `df` is a filtered slice of the
# loaded frame, so assign() builds a new frame instead of setting a column on
# the slice, which would trigger pandas' SettingWithCopyWarning.
df = df.assign(file_name=file_names)

In [52]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,title,tokenized,file_name
12904,1163,Men,Apparel,Topwear,Tshirts,Blue,Summer,2011.0,Sports,Nike Sahara Team India Fanwear Round Neck Jersey,"['nike', 'sahara', 'team', 'india', 'fanwear',...",1163.jpg
12627,1164,Men,Apparel,Topwear,Tshirts,Blue,Winter,2015.0,Sports,Nike Men Blue T20 Indian Cricket Jersey,"['nike', 'men', 'blue', 'indian', 'cricket', '...",1164.jpg
16357,1165,Men,Apparel,Topwear,Tshirts,Blue,Summer,2013.0,Sports,Nike Mean Team India Cricket Jersey,"['nike', 'mean', 'team', 'india', 'cricket', '...",1165.jpg
9208,1525,Unisex,Accessories,Bags,Backpacks,Navy Blue,Fall,2010.0,Casual,Puma Deck Navy Blue Backpack,"['puma', 'deck', 'navy', 'blue', 'backpack']",1525.jpg
33222,1526,Unisex,Accessories,Bags,Backpacks,Black,Fall,2010.0,Sports,Puma Big Cat Backpack Black,"['puma', 'big', 'cat', 'backpack', 'black']",1526.jpg


In [53]:
# Overwrite the CSV with the image-filtered rows and the new `file_name` column
df.to_csv('data/styles_with_token.csv', index=False)

## Check for Invalid Image

In [73]:
import os

import pandas as pd
from tqdm.autonotebook import tqdm
tqdm.pandas()

from PIL import Image

  from pandas import Panel


In [74]:
# Load the CSV written by the previous section
df = pd.read_csv("data/styles_with_token.csv")

In [75]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,title,tokenized,file_name
0,1163,Men,Apparel,Topwear,Tshirts,Blue,Summer,2011.0,Sports,Nike Sahara Team India Fanwear Round Neck Jersey,"['nike', 'sahara', 'team', 'india', 'fanwear',...",1163.jpg
1,1164,Men,Apparel,Topwear,Tshirts,Blue,Winter,2015.0,Sports,Nike Men Blue T20 Indian Cricket Jersey,"['nike', 'men', 'blue', 'indian', 'cricket', '...",1164.jpg
2,1165,Men,Apparel,Topwear,Tshirts,Blue,Summer,2013.0,Sports,Nike Mean Team India Cricket Jersey,"['nike', 'mean', 'team', 'india', 'cricket', '...",1165.jpg
3,1525,Unisex,Accessories,Bags,Backpacks,Navy Blue,Fall,2010.0,Casual,Puma Deck Navy Blue Backpack,"['puma', 'deck', 'navy', 'blue', 'backpack']",1525.jpg
4,1526,Unisex,Accessories,Bags,Backpacks,Black,Fall,2010.0,Sports,Puma Big Cat Backpack Black,"['puma', 'big', 'cat', 'backpack', 'black']",1526.jpg


In [80]:
# Pull the file-name column out as a plain array
file_names = df.file_name.values

In [81]:
# Prefix each file name with the image directory to get a relative path
file_names = ["".join(("data/images/", name)) for name in file_names]

In [85]:
# Print the (width, height) of every image that is not 60x80. Opening each
# file in a `with` block closes its handle as soon as the size has been read,
# instead of relying on `del` and garbage collection to release it.
for file_name in file_names:
    with Image.open(file_name) as img:
        if img.size != (60, 80):
            print(img.size)

(60, 60)
(60, 60)
(60, 60)
(60, 60)
(60, 60)
(60, 77)
(60, 77)
(60, 76)
(53, 80)
(53, 80)
(53, 80)
(53, 80)
(53, 80)
(53, 80)
(60, 75)
(53, 80)
(53, 80)
(53, 80)
(60, 79)
(53, 80)
(60, 79)
(54, 80)
(54, 80)


### Save

In [14]:
# NOTE(review): `file_names` was rebuilt with a "data/images/" prefix in the
# image-check section, so this replaces the bare file names with relative
# paths — confirm that is intended.
df['file_name'] = file_names

In [15]:
df.head()

Unnamed: 0,file_id,title,img_url,category,gender,done,tokenized,file_name
0,50000,baju tidur pria piyama satin,https://s1.bukalapak.com/img/18182389021/s-194...,piyama-baju-tidur-2657,M,True,"['baju', 'tidur', 'pria', 'piyama', 'satin']",50000.png
1,50001,baju tidur import piyama pp pria,https://s1.bukalapak.com/img/65651610211/s-194...,piyama-baju-tidur-2657,M,True,"['baju', 'tidur', 'piyama', 'pp', 'pria']",50001.png
2,50002,stelan baju tidur pria piyama import,https://s0.bukalapak.com/img/55768558031/s-194...,piyama-baju-tidur-2657,M,True,"['stelan', 'baju', 'tidur', 'pria', 'piyama']",50002.png
3,50003,premium quality baju tidur pria piyama salur,https://s1.bukalapak.com/img/17182586211/s-194...,piyama-baju-tidur-2657,M,True,"['premium', 'quality', 'baju', 'tidur', 'pria'...",50003.png
4,50004,baju tidur import salur piyama pria,https://s1.bukalapak.com/img/62718914511/s-194...,piyama-baju-tidur-2657,M,True,"['baju', 'tidur', 'salur', 'piyama', 'pria']",50004.png


In [16]:
# NOTE(review): this writes to '../data/product_preprocessed.csv', while the
# other cells in this notebook read and write under 'data/' — confirm the path.
df.to_csv('../data/product_preprocessed.csv', index=False)

## Add '\<start\>' and '\<end\>' to Token and Remove Unnecessary Columns

In [5]:
import ast

import pandas as pd

In [7]:
# Sentinel tokens marking the beginning and end of a token sequence
start_token, end_token = '<start>', '<end>'

In [2]:
# Load the tokenised data that the start/end markers will be added to
df = pd.read_csv("data/styles_with_token.csv")

In [3]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,title,tokenized,file_name
0,1163,Men,Apparel,Topwear,Tshirts,Blue,Summer,2011.0,Sports,Nike Sahara Team India Fanwear Round Neck Jersey,"['nike', 'sahara', 'team', 'india', 'fanwear',...",1163.jpg
1,1164,Men,Apparel,Topwear,Tshirts,Blue,Winter,2015.0,Sports,Nike Men Blue T20 Indian Cricket Jersey,"['nike', 'men', 'blue', 'indian', 'cricket', '...",1164.jpg
2,1165,Men,Apparel,Topwear,Tshirts,Blue,Summer,2013.0,Sports,Nike Mean Team India Cricket Jersey,"['nike', 'mean', 'team', 'india', 'cricket', '...",1165.jpg
3,1525,Unisex,Accessories,Bags,Backpacks,Navy Blue,Fall,2010.0,Casual,Puma Deck Navy Blue Backpack,"['puma', 'deck', 'navy', 'blue', 'backpack']",1525.jpg
4,1526,Unisex,Accessories,Bags,Backpacks,Black,Fall,2010.0,Sports,Puma Big Cat Backpack Black,"['puma', 'big', 'cat', 'backpack', 'black']",1526.jpg


In [6]:
# Parse each stringified token list back into a real Python list
df["tokenized"] = df["tokenized"].map(ast.literal_eval)

In [8]:
# Surround every token list with the start and end markers
df["tokenized"] = df["tokenized"].map(lambda toks: [start_token, *toks, end_token])

In [10]:
df.tokenized.head()

0    [<start>, nike, sahara, team, india, fanwear, ...
1    [<start>, nike, men, blue, indian, cricket, je...
2    [<start>, nike, mean, team, india, cricket, je...
3    [<start>, puma, deck, navy, blue, backpack, <e...
4    [<start>, puma, big, cat, backpack, black, <end>]
Name: tokenized, dtype: object

In [19]:
# Remove the unused metadata columns; ones that are already absent are skipped
unused_columns = ["gender", "season", "year", "usage"]
df = df.drop(columns=unused_columns, errors='ignore')

In [20]:
df.head()

Unnamed: 0,id,masterCategory,subCategory,articleType,baseColour,title,tokenized,file_name
0,1163,Apparel,Topwear,Tshirts,Blue,Nike Sahara Team India Fanwear Round Neck Jersey,"[<start>, nike, sahara, team, india, fanwear, ...",1163.jpg
1,1164,Apparel,Topwear,Tshirts,Blue,Nike Men Blue T20 Indian Cricket Jersey,"[<start>, nike, men, blue, indian, cricket, je...",1164.jpg
2,1165,Apparel,Topwear,Tshirts,Blue,Nike Mean Team India Cricket Jersey,"[<start>, nike, mean, team, india, cricket, je...",1165.jpg
3,1525,Accessories,Bags,Backpacks,Navy Blue,Puma Deck Navy Blue Backpack,"[<start>, puma, deck, navy, blue, backpack, <e...",1525.jpg
4,1526,Accessories,Bags,Backpacks,Black,Puma Big Cat Backpack Black,"[<start>, puma, big, cat, backpack, black, <end>]",1526.jpg


In [21]:
# Save the frame (wrapped tokens, reduced columns) back to the same CSV
df.to_csv('data/styles_with_token.csv', index=False)