In [1]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

  import pandas.util.testing as tm


In [2]:
# Read the raw product catalogue from its JSON file into a DataFrame.
raw_json_file = 'tops_fashion.json'
data = pd.read_json(raw_json_file)

In [3]:
#data.head()

In [4]:
# Report how many rows (products) and columns (features) were loaded.
n_products, n_features = data.shape
print(f'Number of products: {n_products}\nNumber of Features: {n_features}')

Number of products: 183138
Number of Features: 19


In [5]:
# Names of all columns (features) present in the loaded dataset.
data.columns

Index(['sku', 'asin', 'product_type_name', 'formatted_price', 'author',
       'color', 'brand', 'publisher', 'availability', 'reviews',
       'large_image_url', 'availability_type', 'small_image_url',
       'editorial_review', 'title', 'model', 'medium_image_url',
       'manufacturer', 'editorial_reivew'],
      dtype='object')

We are not going to use all of the features for now. We are using only 7 of the 19 features:
1. asin  (Amazon Standard Identification Number)
2. brand (brand to which the product belongs)
3. color (color information of the apparel; one value can contain several colors, e.g. red and black stripes)
4. product_type_name (type of the apparel, e.g. SHIRT/TSHIRT)
5. medium_image_url  (URL of the image)
6. title (title of the product)
7. formatted_price (price of the product)

In [6]:
# Keep only the seven columns used in the rest of the analysis; the reduced
# frame replaces the full one under the same name "data".
selected_features = [
    'asin', 'product_type_name', 'color', 'brand',
    'title', 'medium_image_url', 'formatted_price',
]
data = data[selected_features]

In [7]:
# Confirm which columns remain after the feature selection.
data.columns

Index(['asin', 'product_type_name', 'color', 'brand', 'title',
       'medium_image_url', 'formatted_price'],
      dtype='object')

In [8]:
# Preview the first rows of the reduced dataset (head() defaults to five rows).
data.head()

Unnamed: 0,asin,product_type_name,color,brand,title,medium_image_url,formatted_price
0,B016I2TS4W,SHIRT,,FNC7C,Minions Como Superheroes Ironman Long Sleeve R...,https://images-na.ssl-images-amazon.com/images...,
1,B01N49AI08,SHIRT,,FIG Clothing,FIG Clothing Womens Izo Tunic,https://images-na.ssl-images-amazon.com/images...,
2,B01JDPCOHO,SHIRT,,FIG Clothing,FIG Clothing Womens Won Top,https://images-na.ssl-images-amazon.com/images...,
3,B01N19U5H5,SHIRT,,Focal18,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,https://images-na.ssl-images-amazon.com/images...,
4,B004GSI2OS,SHIRT,Onyx Black/ Stone,FeatherLite,Featherlite Ladies' Long Sleeve Stain Resistan...,https://images-na.ssl-images-amazon.com/images...,$26.26


#### Statistics about Product type

In [9]:
# Summary statistics for the product type column.
# describe() is computed once; the non-null count is read by its label
# because positional access (`[0]`) on a label-indexed Series is deprecated
# in recent pandas versions.
product_type_summary = data['product_type_name'].describe()
rows_product_type = product_type_summary['count']
product_type_summary

count     183138
unique        72
top        SHIRT
freq      167794
Name: product_type_name, dtype: object

In [10]:
# Count missing product types; this feature is essential for recommendations,
# so any nulls here would need handling.
missing_product_types = data['product_type_name'].isna().sum()
print(f'Number of Null Values in Product type: {missing_product_types}')

Number of Null Values in Product type: 0


From the above statistics, there are no null values in the product type column. The most frequent product type is "SHIRT",
which is expected since we are dealing with women's tops. There are 72 unique product types.

In [11]:
# List every distinct product type that occurs in the dataset.
product_types = data['product_type_name']
product_types.unique()

array(['SHIRT', 'SWEATER', 'APPAREL', 'OUTDOOR_RECREATION_PRODUCT',
       'BOOKS_1973_AND_LATER', 'PANTS', 'HAT', 'SPORTING_GOODS', 'DRESS',
       'UNDERWEAR', 'SKIRT', 'OUTERWEAR', 'BRA', 'ACCESSORY',
       'ART_SUPPLIES', 'SLEEPWEAR', 'ORCA_SHIRT', 'HANDBAG',
       'PET_SUPPLIES', 'SHOES', 'KITCHEN', 'ADULT_COSTUME',
       'HOME_BED_AND_BATH', 'MISC_OTHER', 'BLAZER',
       'HEALTH_PERSONAL_CARE', 'TOYS_AND_GAMES', 'SWIMWEAR',
       'CONSUMER_ELECTRONICS', 'SHORTS', 'HOME', 'AUTO_PART',
       'OFFICE_PRODUCTS', 'ETHNIC_WEAR', 'BEAUTY',
       'INSTRUMENT_PARTS_AND_ACCESSORIES', 'POWERSPORTS_PROTECTIVE_GEAR',
       'SHIRTS', 'ABIS_APPAREL', 'AUTO_ACCESSORY', 'NONAPPARELMISC',
       'TOOLS', 'BABY_PRODUCT', 'SOCKSHOSIERY',
       'POWERSPORTS_RIDING_SHIRT', 'EYEWEAR', 'SUIT', 'OUTDOOR_LIVING',
       'POWERSPORTS_RIDING_JACKET', 'HARDWARE', 'SAFETY_SUPPLY',
       'ABIS_DVD', 'VIDEO_DVD', 'GOLF_CLUB', 'MUSIC_POPULAR_VINYL',
       'HOME_FURNITURE_AND_DECOR', 'TABLET_COMPUTER',

In [12]:
# Tally how often each product type occurs and show the ten most common.
most_frequest_product = Counter(data['product_type_name'].tolist())
most_frequest_product.most_common(10)

[('SHIRT', 167794),
 ('APPAREL', 3549),
 ('BOOKS_1973_AND_LATER', 3336),
 ('DRESS', 1584),
 ('SPORTING_GOODS', 1281),
 ('SWEATER', 837),
 ('OUTERWEAR', 796),
 ('OUTDOOR_RECREATION_PRODUCT', 729),
 ('ACCESSORY', 636),
 ('UNDERWEAR', 425)]

In [13]:
# Top ten product types and their share of all rows, in percent (two decimals).
# NOTE: "re" shadows the name of Python's regex module; the name is kept
# because a later cell reads it.
frequent_items = most_frequest_product.most_common(10)
total_rows = len(data)
items = [name for name, count in frequent_items]
re = [round(count / total_rows * 100, 2) for name, count in frequent_items]

In [14]:
# Pair each product type with its percentage share as a two-column array.
# Mixing names and numbers in one array turns every entry into a string.
res = np.column_stack((items, re))
print(res)

[['SHIRT' '91.62']
 ['APPAREL' '1.94']
 ['BOOKS_1973_AND_LATER' '1.82']
 ['DRESS' '0.86']
 ['SPORTING_GOODS' '0.7']
 ['SWEATER' '0.46']
 ['OUTERWEAR' '0.43']
 ['OUTDOOR_RECREATION_PRODUCT' '0.4']
 ['ACCESSORY' '0.35']
 ['UNDERWEAR' '0.23']]


#### Statistics about brand

In [15]:
# Summary statistics for the brand column.
# describe() is computed once; the non-null count is read by its label
# because positional access (`[0]`) on a label-indexed Series is deprecated
# in recent pandas versions.
brand_summary = data['brand'].describe()
rows_brand = brand_summary['count']
brand_summary

count     182987
unique     10577
top         Zago
freq         223
Name: brand, dtype: object

In [16]:
# Count the rows that have no brand recorded.
missing_brands = data['brand'].isna().sum()
print(f'Number of Null values in Brand: {missing_brands}')

Number of Null values in Brand: 151


In [17]:
# Number of distinct values in the brand column. The missing-value entry is
# counted as well, so this is one more than the "unique" figure reported by
# describe(), which ignores nulls.
distinct_brands = data['brand'].unique()
distinct_brands.size

10578

In [18]:
# Tally listings per brand and show the ten brands that appear most often.
frequent_brands = Counter(data['brand'].tolist())
frequent_brands.most_common(10)

[('Zago', 223),
 ('XQS', 222),
 ('Yayun', 215),
 ('YUNY', 198),
 ('XiaoTianXin-women clothes', 193),
 ('Generic', 192),
 ('Boohoo', 190),
 ('Alion', 188),
 ('Abetteric', 187),
 ('TheMogan', 187)]

#### Statistics about the color

In [19]:
# Summary statistics for the color column.
# describe() is computed once; the non-null count is read by its label
# because positional access (`[0]`) on a label-indexed Series is deprecated
# in recent pandas versions.
color_summary = data['color'].describe()
row_color = color_summary['count']
color_summary

count     64956
unique     7380
top       Black
freq      13207
Name: color, dtype: object

In [20]:
# Count the rows with no color recorded and express that as a share of all rows.
missing_colors = data['color'].isna().sum()
missing_colors_pct = missing_colors / len(data) * 100
print(f'Number of Null values in Color: {missing_colors}')
print(f'% of Null values in Color column: {missing_colors_pct:.2f}')

Number of Null values in Color: 118182
% of Null values in Color column: 64.53


In [21]:
# Tally the color values (missing entries included) and show the ten most common.
common_color = Counter(data['color'].tolist())
common_color.most_common(10)

[(None, 118182),
 ('Black', 13207),
 ('White', 8616),
 ('Blue', 3570),
 ('Red', 2289),
 ('Pink', 1842),
 ('Grey', 1499),
 ('*', 1388),
 ('Green', 1258),
 ('Multi', 1203)]

In [22]:
#None is the most common because of the Null values in Color Column.

#### Statistics for the title Feature 

In [23]:
# Summary of the title column: non-null count, number of distinct titles,
# the most frequent title and how often it occurs.
data['title'].describe()

count                                                183138
unique                                               175985
top       Nakoda Cotton Self Print Straight Kurti For Women
freq                                                     77
Name: title, dtype: object

In [24]:
# Ten most frequently repeated product titles. Titles are not unique:
# describe() above reports 175985 distinct titles across all products.
# The tally gets its own name so the color counter built earlier is not
# overwritten with title data.
common_title = Counter(data['title'].tolist())
common_title.most_common(10)

[('Nakoda Cotton Self Print Straight Kurti For Women', 77),
 ("Q-rious Women's Racerback Cotton Lycra Camsioles", 56),
 ('FINEJO Casual Women Long Sleeve Lace Irregular Hem Blouse Tops', 47),
 ('Girlzwalk Women Cami Sleeveless Printed Swing Vest Top Plus Sizes', 44),
 ("ELINA FASHION Women's Indo-Western Tunic Top Cotton Kurti", 43),
 ('Victoria Scoop Neck Front Lace Floral High-Low Top in 4 Sizes', 40),
 ("Cenizas Women's Indian Tunic Top Cotton Kurti", 39),
 ('Indistar Womens Premium Cotton Half Sleeves Printed T-Shirts/Tops (Pack of 3)',
  37),
 ("Rajnandini Women's Cotton Printed Kurti", 35),
 ('Long Sleeve Mock Neck Top', 32)]

#### Statistics about the formatted price

In [25]:
# Summary statistics for the formatted price column.
# describe() is computed once; the non-null count is read by its label
# because positional access (`[0]`) on a label-indexed Series is deprecated
# in recent pandas versions.
price_summary = data['formatted_price'].describe()
rows_price = price_summary['count']
price_summary

count      28395
unique      3135
top       $19.99
freq         945
Name: formatted_price, dtype: object

In [26]:
# Share of rows that have no price, as a percentage of all products.
missing_prices = data['formatted_price'].isna().sum()
missing_prices_pct = missing_prices / len(data) * 100
print(f'% of Null Price point: {missing_prices_pct:.2f}')

% of Null Price point: 84.50


In [27]:
# Tally the price strings (missing entries included) and show the ten most common.
price_freq = Counter(data['formatted_price'].tolist())
price_freq.most_common(10)

[(None, 154743),
 ('$19.99', 945),
 ('$9.99', 749),
 ('$9.50', 601),
 ('$14.99', 472),
 ('$7.50', 463),
 ('$24.99', 414),
 ('$29.99', 370),
 ('$8.99', 343),
 ('$9.01', 336)]

In [29]:
# Persist the reduced dataset (about 183k rows, 7 columns) so later steps can
# reload it without re-reading the raw JSON.
# The target folder is created first because to_pickle does not create
# missing directories and would otherwise fail on a fresh checkout.
output_path = Path('dataset/extracted_data_features')
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_pickle(output_path)