In [3]:
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re 
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack

import chart_studio
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.figure_factory as FF
import plotly.io as pio
from plotly.offline import plot, iplot, init_notebook_mode
# Set up Plotly for rendering figures inside the notebook.
init_notebook_mode(connected=True)

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries imported above -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [4]:
# Read the raw catalogue, then keep only the columns this analysis uses.
columns_of_interest = [
    'brand',
    'color',
    'medium_image_url',
    'product_type_name',
    'title',
    'formatted_price',
]
data = pd.read_json("tops_fashion.json")
data = data[columns_of_interest]
data.head()

Unnamed: 0,brand,color,medium_image_url,product_type_name,title,formatted_price
0,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Minions Como Superheroes Ironman Long Sleeve R...,
1,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Izo Tunic,
2,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Won Top,
3,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,
4,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,Featherlite Ladies' Long Sleeve Stain Resistan...,$26.26


# Missing data for various features.

# product_type_name	

In [8]:
# Summary of the product type column (count / unique / top / freq).
data.loc[:, 'product_type_name'].describe()

count     183138
unique        72
top        SHIRT
freq      167794
Name: product_type_name, dtype: object

In [11]:
'''For 183,138 products we have 72 unique product type names.
Out of 72 product names, "SHIRT" is most frequent
frequency(SHIRT) = 167,794
% (SHIRT)=(167794/183138)*100=91.62% '''


'For 183,138 products we have 72 unique product type names.\nOut of 72 product names, "SHIRT" is most frequent\nfrequency(SHIRT) = 167,794\n% (SHIRT)=(167794/183138)*100=91.62% '

In [12]:
# Distinct product type names, in order of first appearance.
pd.unique(data['product_type_name'])

array(['SHIRT', 'SWEATER', 'APPAREL', 'OUTDOOR_RECREATION_PRODUCT',
       'BOOKS_1973_AND_LATER', 'PANTS', 'HAT', 'SPORTING_GOODS', 'DRESS',
       'UNDERWEAR', 'SKIRT', 'OUTERWEAR', 'BRA', 'ACCESSORY',
       'ART_SUPPLIES', 'SLEEPWEAR', 'ORCA_SHIRT', 'HANDBAG',
       'PET_SUPPLIES', 'SHOES', 'KITCHEN', 'ADULT_COSTUME',
       'HOME_BED_AND_BATH', 'MISC_OTHER', 'BLAZER',
       'HEALTH_PERSONAL_CARE', 'TOYS_AND_GAMES', 'SWIMWEAR',
       'CONSUMER_ELECTRONICS', 'SHORTS', 'HOME', 'AUTO_PART',
       'OFFICE_PRODUCTS', 'ETHNIC_WEAR', 'BEAUTY',
       'INSTRUMENT_PARTS_AND_ACCESSORIES', 'POWERSPORTS_PROTECTIVE_GEAR',
       'SHIRTS', 'ABIS_APPAREL', 'AUTO_ACCESSORY', 'NONAPPARELMISC',
       'TOOLS', 'BABY_PRODUCT', 'SOCKSHOSIERY',
       'POWERSPORTS_RIDING_SHIRT', 'EYEWEAR', 'SUIT', 'OUTDOOR_LIVING',
       'POWERSPORTS_RIDING_JACKET', 'HARDWARE', 'SAFETY_SUPPLY',
       'ABIS_DVD', 'VIDEO_DVD', 'GOLF_CLUB', 'MUSIC_POPULAR_VINYL',
       'HOME_FURNITURE_AND_DECOR', 'TABLET_COMPUTER',

In [26]:
# Ten most frequent product type names, as (name, count) pairs.
product_type_count = Counter(data['product_type_name'].tolist())
print(product_type_count.most_common(n=10))

[[('SHIRT', 167794), ('APPAREL', 3549), ('BOOKS_1973_AND_LATER', 3336), ('DRESS', 1584), ('SPORTING_GOODS', 1281), ('SWEATER', 837), ('OUTERWEAR', 796), ('OUTDOOR_RECREATION_PRODUCT', 729), ('ACCESSORY', 636), ('UNDERWEAR', 425)]]


# brand

In [27]:
# Summary of the brand column (count / unique / top / freq).
data.loc[:, 'brand'].describe()

count     182987
unique     10577
top         Zago
freq         223
Name: brand, dtype: object

In [32]:
'''183,138-182,987=151 missing values
Out of 10577 unique values most frequent is "Zago"
freq(Zago)=223
%(Zago)=(223/182987)*100 = 0.121 % '''

'183,138-182,987=151 missing values\nOut of 10577 unique values most frequent is "Zago"\nfreq(Zago)=223\n%(Zago)=(223/182987)*100 = 0.121 % '

In [34]:
# Ten most frequent brands, as (brand, count) pairs.
brand_count = Counter(data['brand'].tolist())
print(brand_count.most_common(n=10))

[('Zago', 223), ('XQS', 222), ('Yayun', 215), ('YUNY', 198), ('XiaoTianXin-women clothes', 193), ('Generic', 192), ('Boohoo', 190), ('Alion', 188), ('Abetteric', 187), ('TheMogan', 187)]


# color

In [36]:
# Summary of the color column (count / unique / top / freq).
data.loc[:, 'color'].describe()

count     64956
unique     7380
top       Black
freq      13207
Name: color, dtype: object

In [37]:
'''#7380 unique colors
#missing colors=183138-64956=118182
#7.2% products are black
#35.4% of products have color information'''

'#7380 unique colors\n#missing colors=183138-64956=118182\n#7.2% products are black\n#35.4% of products have color information'

In [38]:
# Tally how often each color value occurs (missing values are tallied too).
color_count = Counter(data['color'].tolist())

In [39]:
# Ten most frequent color values, as (color, count) pairs.
print(color_count.most_common(n=10))

[(None, 118182), ('Black', 13207), ('White', 8616), ('Blue', 3570), ('Red', 2289), ('Pink', 1842), ('Grey', 1499), ('*', 1388), ('Green', 1258), ('Multi', 1203)]


In [40]:
'''missing colors=183138-64956=118182=(None, 118182)'''

'missing colors=183138-64956=118182=(None, 118182)'

# formatted_price

In [41]:
# Summary of the price column (count / unique / top / freq).
data.loc[:, 'formatted_price'].describe()

count      28395
unique      3135
top       $19.99
freq         945
Name: formatted_price, dtype: object

In [43]:
'''Only 15.5% of products have price information, i.e. (28395/183138)*100 % '''

'Only 15.5% of products have price information, i.e. (28395/183138)*100 % '

In [47]:
# Ten most frequent price strings, as (price, count) pairs.
price_count = Counter(data['formatted_price'].tolist())
print(price_count.most_common(n=10))

[(None, 154743), ('$19.99', 945), ('$9.99', 749), ('$9.50', 601), ('$14.99', 472), ('$7.50', 463), ('$24.99', 414), ('$29.99', 370), ('$8.99', 343), ('$9.01', 336)]


# title

In [45]:
# Summary of the title column (count / unique / top / freq), printed as text.
print(data.loc[:, 'title'].describe())

count                                                183138
unique                                               175985
top       Nakoda Cotton Self Print Straight Kurti For Women
freq                                                     77
Name: title, dtype: object


In [48]:
# Ten most frequent titles, as (title, count) pairs.
title_count = Counter(data['title'].tolist())
print(title_count.most_common(n=10))

[('Nakoda Cotton Self Print Straight Kurti For Women', 77), ("Q-rious Women's Racerback Cotton Lycra Camsioles", 56), ('FINEJO Casual Women Long Sleeve Lace Irregular Hem Blouse Tops', 47), ('Girlzwalk Women Cami Sleeveless Printed Swing Vest Top Plus Sizes', 44), ("ELINA FASHION Women's Indo-Western Tunic Top Cotton Kurti", 43), ('Victoria Scoop Neck Front Lace Floral High-Low Top in 4 Sizes', 40), ("Cenizas Women's Indian Tunic Top Cotton Kurti", 39), ('Indistar Womens Premium Cotton Half Sleeves Printed T-Shirts/Tops (Pack of 3)', 37), ("Rajnandini Women's Cotton Printed Kurti", 35), ('Long Sleeve Mock Neck Top', 32)]


In [53]:
# Checkpoint: the frame is written to a pickle file after each major
# processing step, so later steps can start from the saved file.
checkpoint_path = '180k_apparel_data'
data.to_pickle(checkpoint_path)

In [54]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,brand,color,medium_image_url,product_type_name,title,formatted_price
0,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Minions Como Superheroes Ironman Long Sleeve R...,
1,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Izo Tunic,
2,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Won Top,
3,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,
4,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,Featherlite Ladies' Long Sleeve Stain Resistan...,$26.26


In [58]:
#ref:https://stackoverflow.com/questions/37543647/how-to-replace-all-non-nan-entries-of-a-dataframe-with-1-and-all-nan-with-0

In [57]:
# Keep only the products that carry price information.
# notnull() is True for rows whose 'formatted_price' is present, so using it
# as a mask drops every row where the price is None/NaN.
has_price = data['formatted_price'].notnull()
data = data[has_price]
print('Data points after eliminating price= NULL: ', len(data))

Data points after eliminating price= NULL:  28395


In [61]:
# Keep only the products that carry color information.
# notnull() is True for rows whose 'color' is present, so using it as a mask
# drops every row where the color is None/NaN.
has_color = data['color'].notnull()
data = data[has_color]
print('Data points after eliminating color=NULL: ', len(data))

Data points after eliminating color=NULL:  28385


We brought down the number of data points from 183K to 28K.
