In [1]:
import warnings 
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd 
import scipy
import matplotlib
import matplotlib.pyplot as plt
import nltk
import math
import time
import re 
import os 
import seaborn as sns
import plotly
import plotly.figure_factory as ff
import requests

from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import gridspec
from scipy.sparse import hstack
from plotly.graph_objs import Layout, Scatter
plotly.offline.init_notebook_mode(connected = True)

In [2]:
data = pd.read_json("../tops_fashion.json")
print("Number of Data-points in the given dataset is : ", data.shape[0])
print("Number of Features in the given dataset is :", data.shape[1])

Number of Data-points in the given dataset is :  183138
Number of Features in the given dataset is : 19


In [3]:
data.columns

Index(['asin', 'author', 'availability', 'availability_type', 'brand', 'color',
       'editorial_reivew', 'editorial_review', 'formatted_price',
       'large_image_url', 'manufacturer', 'medium_image_url', 'model',
       'product_type_name', 'publisher', 'reviews', 'sku', 'small_image_url',
       'title'],
      dtype='object')

In [4]:
data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]

In [5]:
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Minions Como Superheroes Ironman Long Sleeve R...,
1,B01N49AI08,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Izo Tunic,
2,B01JDPCOHO,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Won Top,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,Featherlite Ladies' Long Sleeve Stain Resistan...,$26.26


In [6]:
data["product_type_name"].describe()

count     183138
unique        72
top        SHIRT
freq      167794
Name: product_type_name, dtype: object

In [7]:
data["product_type_name"].unique()

array(['SHIRT', 'SWEATER', 'APPAREL', 'OUTDOOR_RECREATION_PRODUCT',
       'BOOKS_1973_AND_LATER', 'PANTS', 'HAT', 'SPORTING_GOODS', 'DRESS',
       'UNDERWEAR', 'SKIRT', 'OUTERWEAR', 'BRA', 'ACCESSORY',
       'ART_SUPPLIES', 'SLEEPWEAR', 'ORCA_SHIRT', 'HANDBAG',
       'PET_SUPPLIES', 'SHOES', 'KITCHEN', 'ADULT_COSTUME',
       'HOME_BED_AND_BATH', 'MISC_OTHER', 'BLAZER',
       'HEALTH_PERSONAL_CARE', 'TOYS_AND_GAMES', 'SWIMWEAR',
       'CONSUMER_ELECTRONICS', 'SHORTS', 'HOME', 'AUTO_PART',
       'OFFICE_PRODUCTS', 'ETHNIC_WEAR', 'BEAUTY',
       'INSTRUMENT_PARTS_AND_ACCESSORIES', 'POWERSPORTS_PROTECTIVE_GEAR',
       'SHIRTS', 'ABIS_APPAREL', 'AUTO_ACCESSORY', 'NONAPPARELMISC',
       'TOOLS', 'BABY_PRODUCT', 'SOCKSHOSIERY',
       'POWERSPORTS_RIDING_SHIRT', 'EYEWEAR', 'SUIT', 'OUTDOOR_LIVING',
       'POWERSPORTS_RIDING_JACKET', 'HARDWARE', 'SAFETY_SUPPLY',
       'ABIS_DVD', 'VIDEO_DVD', 'GOLF_CLUB', 'MUSIC_POPULAR_VINYL',
       'HOME_FURNITURE_AND_DECOR', 'TABLET_COMPUTER',

In [8]:
product_type_count = Counter(data["product_type_name"])
product_type_count.most_common(10)

[('SHIRT', 167794),
 ('APPAREL', 3549),
 ('BOOKS_1973_AND_LATER', 3336),
 ('DRESS', 1584),
 ('SPORTING_GOODS', 1281),
 ('SWEATER', 837),
 ('OUTERWEAR', 796),
 ('OUTDOOR_RECREATION_PRODUCT', 729),
 ('ACCESSORY', 636),
 ('UNDERWEAR', 425)]

In [10]:
data["product_type_name"].isnull().sum()

0

In [11]:
data["brand"].describe()

count     182987
unique     10577
top         Zago
freq         223
Name: brand, dtype: object

In [12]:
brand_count = Counter(data["brand"])
brand_count.most_common(10)

[('Zago', 223),
 ('XQS', 222),
 ('Yayun', 215),
 ('YUNY', 198),
 ('XiaoTianXin-women clothes', 193),
 ('Generic', 192),
 ('Boohoo', 190),
 ('Alion', 188),
 ('Abetteric', 187),
 ('TheMogan', 187)]

In [11]:
data["brand"].isnull().sum()

151

In [13]:
data["color"].describe()

count     64956
unique     7380
top       Black
freq      13207
Name: color, dtype: object

In [14]:
color_count = Counter(data["color"])
color_count.most_common(10)

[(None, 118182),
 ('Black', 13207),
 ('White', 8616),
 ('Blue', 3570),
 ('Red', 2289),
 ('Pink', 1842),
 ('Grey', 1499),
 ('*', 1388),
 ('Green', 1258),
 ('Multi', 1203)]

In [12]:
data["color"].isnull().sum()

118182

In [15]:
data["formatted_price"].describe()

count      28395
unique      3135
top       $19.99
freq         945
Name: formatted_price, dtype: object

In [16]:
price_count = Counter(data["formatted_price"])
price_count.most_common(10)

[(None, 154743),
 ('$19.99', 945),
 ('$9.99', 749),
 ('$9.50', 601),
 ('$14.99', 472),
 ('$7.50', 463),
 ('$24.99', 414),
 ('$29.99', 370),
 ('$8.99', 343),
 ('$9.01', 336)]

In [13]:
data["formatted_price"].isnull().sum()

154743

In [17]:
data["title"].describe()

count                                                183138
unique                                               175985
top       Nakoda Cotton Self Print Straight Kurti For Women
freq                                                     77
Name: title, dtype: object

In [14]:
data["title"].isnull().sum()

0

In [15]:
data["asin"].isnull().sum()

0

In [16]:
data["medium_image_url"].isnull().sum()

0

In [17]:
data.fillna('Unknown', inplace = True)

In [18]:
data.isnull().any()

asin                 False
brand                False
color                False
medium_image_url     False
product_type_name    False
title                False
formatted_price      False
dtype: bool

In [19]:
data.to_pickle("../180k_apparel_data")

In [91]:
import multiprocessing.dummy as mp 

def download_img(num):
    print(num)
    url = data.iloc[num]['medium_image_url']
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    img.save('../image'+ data.iloc[num]['asin'] +'.jpeg')  

if __name__ == "__main__":
    
    p = mp.Pool(8)
    p.map(download_img, np.array([78412, 78413, 78414, 78415]))
    p.close()
    p.join()

78412
78413
78414
78415


OSError: cannot identify image file <_io.BytesIO object at 0x7f1e7877b9e8>