# Imports

In [106]:
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import pandas as pd
import re
import numpy as np

# Data Collection

In [107]:
#creating a useragent
#parameters
header = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}

In [108]:
#H&M site catalog URL
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
#Request to URL
page = requests.get(url, headers = header)
print(page)

# Beautiful soup object
soup = BeautifulSoup( page.text, 'html.parser' )

# ===================== Product Data ============================
products = soup.find( 'ul', class_='products-listing small' )
product_list = products.find_all( 'article', class_='hm-product-item')
                                 
# product id
product_id = [p.get( 'data-articlecode' ) for p in product_list]
                                 
# product category
product_category = [p.get( 'data-category' ) for p in product_list]
                                 
# product name
product_list = products.find_all( 'a', class_='link' )
product_name = [p.get_text() for p in product_list]
                                 
# price
product_list = products.find_all( 'span', class_='price regular' )
product_price = [p.get_text() for p in product_list]
data = pd.DataFrame( [product_id, product_category, product_name, product_price] ).T
                                 
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

<Response [200]>


In [109]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99
1,690449036,men_jeans_ripped,Skinny Jeans,$ 39.99
2,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99
3,690449043,men_jeans_ripped,Skinny Jeans,$ 39.99
4,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99


In [110]:
data.shape

(36, 4)

In [111]:

#empty dataframe
df_compositions = pd.DataFrame()

#unique columns for all products
aux = []

cols = ['Art. No.',
 'Composition',
 'Fit',
 'Product safety',
 'Size',
 'More sustainable materials']

df_pattern = pd.DataFrame(columns = cols)


for i in range(len(data)):  
    #API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
    print('Product: {}'.format(url))
    
    page = requests.get(url, headers = header)


    #BeautifulSoup object
    soup = BeautifulSoup(page.text, 'html.parser')

    product_list = soup.find_all('a', class_ = 'filter-option miniature active') + soup.find_all('a', class_ = 'filter-option miniature')
    color_item = [p.get('data-color') for p in product_list]

    #product id
    product_id = [p.get('data-articlecode') for p in product_list]

    #creating data frame with product id+color name
    df_color = pd.DataFrame((product_id, color_item)).T
    df_color.columns = ['product_id', 'color_name'] #renaming columns
    
    for j in range(len(df_color)):
        
        #API Requests
        url = 'https://www2.hm.com/en_us/productpage.' + df_color.loc[j, 'product_id'] + '.html'
        print('Color: {}'.format(url))
        page = requests.get(url, headers = header)

        #BeautifulSoup object
        soup = BeautifulSoup(page.text, 'html.parser')
        
        ########################  PRODUCT NAME  ###################################
        product_name = soup.find_all('h1', class_ = 'primary product-item-headline')
        product_name = product_name[0].get_text()
        
        ########################  PRODUCT PRICE  ###################################
        product_price = soup.find_all('div', class_ = 'primary-row product-item-price')
        product_price = re.findall(r'\d+\.?\d+', product_price[0].get_text())[0]
        
        #######################  COMPOSITION  ######################################
    
        product_composition_list = soup.find_all('div', class_ = 'pdp-description-list-item')
        product_composition = [list(filter(None, p.get_text().split("\n"))) for p in product_composition_list]

        #renaming labels
        df_composition = pd.DataFrame(product_composition).T
        df_composition.columns = df_composition.iloc[0]

        #deleting first row
        df_composition = df_composition.iloc[1:].fillna(method = 'ffill')

        #remove pocket lining, shell and lining
        df_composition['Composition'] = df_composition['Composition'].str.replace('Pocket: ', '', regex = True)
        df_composition['Composition'] = df_composition['Composition'].str.replace('Pocket lining: ', '', regex = True)
        df_composition['Composition'] = df_composition['Composition'].str.replace('Shell: ', '', regex = True)
        df_composition['Composition'] = df_composition['Composition'].str.replace('Lining: ', '', regex = True)
        
        #guarantee the same number of columns
        df_composition = pd.concat([df_pattern, df_composition], axis = 0)
        
        #rename columns
        df_composition.columns = ['product_id', 'composition', 'fit', 'product_safety', 'size', 'sustainable_materials']
        df_composition['product_name'] = product_name
        df_composition['product_price'] = product_price
            
        #keep new columns if they show up
        aux = aux + df_composition.columns.tolist() 

        #merge df_color and df_composition
        df_composition = pd.merge( df_composition, df_color, how = 'left', on = 'product_id')

        #all products
        df_compositions = pd.concat([df_compositions, df_composition], axis = 0)
        
# join showroom data + details
df_compositions['style_id'] = df_compositions['product_id'].apply(lambda x: x[:-3])
df_compositions['color_id'] = df_compositions['product_id'].apply(lambda x: x[-3:])

df_compositions['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

Product: https://www2.hm.com/en_us/productpage.1024256001.html
Color: https://www2.hm.com/en_us/productpage.1024256001.html
Color: https://www2.hm.com/en_us/productpage.1024256002.html
Color: https://www2.hm.com/en_us/productpage.1024256003.html
Color: https://www2.hm.com/en_us/productpage.1024256004.html
Color: https://www2.hm.com/en_us/productpage.1024256005.html
Color: https://www2.hm.com/en_us/productpage.1024256006.html
Color: https://www2.hm.com/en_us/productpage.1024256007.html
Product: https://www2.hm.com/en_us/productpage.0690449036.html
Color: https://www2.hm.com/en_us/productpage.0690449036.html
Color: https://www2.hm.com/en_us/productpage.0690449001.html
Color: https://www2.hm.com/en_us/productpage.0690449002.html
Color: https://www2.hm.com/en_us/productpage.0690449006.html
Color: https://www2.hm.com/en_us/productpage.0690449007.html
Color: https://www2.hm.com/en_us/productpage.0690449009.html
Color: https://www2.hm.com/en_us/productpage.0690449011.html


IndexError: list index out of range

In [12]:
df_compositions

Unnamed: 0,product_id,composition,fit,product_safety,size,sustainable_materials,product_name,product_price,color_name,style_id,color_id,scrapy_datetime
0,1024256001,"Polyester 65%, Cotton 35%",Slim fit,,"The model is 185cm/6'1"" and wears a size 31/32",,\n\t\t\t\t\t\t\t Slim Jeans,19.99,Black,1024256,001,2022-01-11 11:59:36
1,1024256001,"Cotton 99%, Spandex 1%",Slim fit,,"The model is 185cm/6'1"" and wears a size 31/32",,\n\t\t\t\t\t\t\t Slim Jeans,19.99,Black,1024256,001,2022-01-11 11:59:36
0,1024256002,"Cotton 99%, Spandex 1%",Slim fit,,,,\n\t\t\t\t\t\t\t Slim Jeans,19.99,Light denim blue,1024256,002,2022-01-11 11:59:36
1,1024256002,"Polyester 65%, Cotton 35%",Slim fit,,,,\n\t\t\t\t\t\t\t Slim Jeans,19.99,Light denim blue,1024256,002,2022-01-11 11:59:36
0,1024256003,"Cotton 99%, Spandex 1%",Slim fit,,"The model is 189cm/6'2"" and wears a size 31/32",,\n\t\t\t\t\t\t\t Slim Jeans,19.99,Light denim blue,1024256,003,2022-01-11 11:59:36
...,...,...,...,...,...,...,...,...,...,...,...,...
0,1004476001,"Cotton 90%, Elasterell-P 8%, Spandex 2%",Slim fit,,"The model is 182cm/6'0"" and wears a size 31/32",,\n\t\t\t\t\t\t\t Freefit® Slim Jeans,28.99,Light denim blue,1004476,001,2022-01-11 11:59:36
0,1004476002,"Cotton 90%, Elasterell-P 8%, Spandex 2%",Slim fit,,,,\n\t\t\t\t\t\t\t Freefit® Slim Jeans,29.99,Light denim blue,1004476,002,2022-01-11 11:59:36
0,1004476003,"Cotton 90%, Elasterell-P 8%, Spandex 2%",Slim fit,,,,\n\t\t\t\t\t\t\t Freefit® Slim Jeans,29.99,Denim blue,1004476,003,2022-01-11 11:59:36
0,1004476004,"Cotton 90%, Elasterell-P 8%, Spandex 2%",Slim fit,,"The model is 182cm/6'0"" and wears a size 31/32",,\n\t\t\t\t\t\t\t Freefit® Slim Jeans,28.99,Dark denim blue,1004476,004,2022-01-11 11:59:36


In [103]:
df_compositions.iloc[483,:]

product_id                                                   1008549006
composition                                         Pocket: Cotton 100%
fit                                                         Regular fit
product_safety                                                      NaN
size                     The model is 188cm/6'2" and wears a size 31/30
sustainable_materials                               Recycled cotton 20%
product_name                            \n\t\t\t\t\t\t\t  Regular Jeans
product_price                                                     24.99
color_name                                                        Black
style_id                                                        1008549
color_id                                                            006
scrapy_datetime                                     2022-01-11 11:59:36
Name: 0, dtype: object

In [22]:
df_color

Unnamed: 0,product_id,color_name
0,1024256001,Black
1,1024256003,Light denim blue
2,1024256004,Denim blue
3,1024256005,Dark blue
4,1024256006,Dark denim blue
5,1024256007,Dark gray


# Data Cleaning

In [8]:
df_compositions.shape

(512, 10)

In [9]:
len(df_compositions['product_id'].unique())

123

In [104]:

#product id
#dropping NaN values
df_data = df_compositions.dropna(subset=['product_id'])
#data['product_id'] = data['product_id'].astype(int)

#product_name
#df_data['product_name'] = df_data['product_name'].str.replace('\n', '')
#df_data['product_name'] = df_data['product_name'].str.replace('\t', '')

df_data['product_name'] = df_data['product_name'].str.strip()
df_data['product_name'] = df_data['product_name'].str.replace(' ', '_').str.lower()
df_data['product_name'] = df_data['product_name'].str.replace('®', '').str.lower()


#product_price
df_data['product_price'] = df_data['product_price'].astype(float)
#scrapy time
#data['scrapy_time'] = pd.to_datetime(data['scrapy_time'], format = '%Y-%m-%d %H:%M:%S')

#color name
df_data['color_name'] = df_data['color_name'].str.replace(' ', '_').str.lower()

#fit
df_data['fit'] = df_data['fit'].apply(lambda x: x.replace(' ', '_').lower() if pd.notnull(x) else x)

#size number
df_data['size_number'] = df_data['size'].apply(lambda x: re.search('\d{3}', x).group(0) if pd.notnull(x) else x)
#df_data['size_number'] = df_data['size_number'].apply(lambda x: re.search('\d+', x).group(0) if pd.notnull(x) else x)

#size model
df_data['size_model'] = df_data['size'].str.extract('(\d+/\\d+)')
df_data['size_model'] = df_data['size_model'].apply(lambda x: x.replace('/', '_') if pd.notnull(x) else x)

#drop duplicated cells
#df_data = df_data.drop_duplicates()
#(subset=['product_id', 'product_category', 'product_name', 'product_price', 'scrapy_time', 'style_id', 'color_id', 'color_name', 'Fit', 'Composition', 'size_number', 'size_model'], keep='last' )

#reset index
#df_data = df_data.reset_index(drop = True)

#break composition by comma
df1 = df_data['composition'].str.split(',', expand = True).reset_index(drop = True)


#creating new df with columns
# cotton | polyester | spandex | elasterell
df_ref = pd.DataFrame(index = np.arange(len(df_data)), columns = ['cotton', 'polyester', 'spandex', 'elasterell'])

############################### COMPOSITION #####################################

# -------------- cotton --------------
df_cotton_0 = df1.loc[df1[0].str.contains('Cotton', na = True), 0]
df_cotton_0.name = 'cotton'
df_cotton_1 = df1.loc[df1[1].str.contains('Cotton', na = True), 1]
df_cotton_1.name = 'cotton'

# combine cotton df's
df_cotton = df_cotton_0.combine_first(df_cotton_1)

df_ref = pd.concat([df_ref, df_cotton], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]


#------------ polyester ---------------
df_polyester_0 = df1.loc[df1[0].str.contains('Polyester', na = True), 0]
df_polyester_0.name = 'polyester'
df_polyester_1 = df1.loc[df1[1].str.contains('Polyester', na = True), 1]
df_polyester_1.name = 'polyester'

df_polyester = df_polyester_0.combine_first(df_polyester_1)
df_ref = pd.concat([df_ref, df_polyester], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]

#----------- spandex --------------------
df_spandex_1 = df1.loc[df1[1].str.contains('Spandex', na = True), 1]
df_spandex_1.name = 'spandex'
df_spandex_2 = df1.loc[df1[2].str.contains('Spandex', na = True), 2]
df_spandex_2.name = 'spandex'
df_spandex_3 = df1.loc[df1[3].str.contains('Spandex', na = True), 3]
df_spandex_3.name = 'spandex'

df_spandex_c2 = df_spandex_1.combine_first(df_spandex_2)
df_spandex = df_spandex_c2.combine_first(df_spandex_3)

df_ref = pd.concat([df_ref, df_spandex], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]


# ---------- elasterell --------------------
df_elasterell = df1.loc[df1[1].str.contains('Elasterell', na = True), 1]
df_elasterell.name = 'elasterell'

df_ref = pd.concat([df_ref, df_elasterell], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]

#combine join with product id 
df_aux = pd.concat([df_data['product_id'].reset_index(drop = True), df_ref], axis = 1)

#format composition data
df_data['cotton']     = df_data['cotton'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
df_data['elasterell'] = df_data['elasterell'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
df_data['spandex']    = df_data['spandex'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
df_data['polyester']  = df_data['polyester'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)

#final join
df_aux = df_aux.groupby('product_id').max().reset_index().fillna(0)
df_data = pd.merge([df_data, df_aux], on = 'product_id', how = 'left')

#drop columns
df_data = df_data.drop(columns = ['size', 'product_safety', 'sustainable_materials', 'composition'])

KeyError: 'cotton'

In [93]:
df_ref

Unnamed: 0,cotton,polyester,spandex,elasterell
0,Cotton 35%,Polyester 65%,,
1,Cotton 99%,,Spandex 1%,
2,Cotton 99%,,Spandex 1%,
3,Cotton 35%,Polyester 65%,,
4,Cotton 99%,,Spandex 1%,
...,...,...,...,...
507,Cotton 90%,,Spandex 2%,Elasterell-P 8%
508,Cotton 90%,,Spandex 2%,Elasterell-P 8%
509,Cotton 90%,,Spandex 2%,Elasterell-P 8%
510,Cotton 90%,,Spandex 2%,Elasterell-P 8%


In [94]:
df1[0].unique()

array(['Polyester 65%', 'Cotton 99%', 'Cotton 100%', 'Cotton 98%',
       'Polyester 100%', 'Polyester 63%', 'Cotton 79%', 'Cotton 77%',
       'Cotton 78%', 'Cotton 80%', 'Cotton 93%', 'Cotton 91%',
       'Cotton 72%', 'Pocket: Cotton 100%', 'Cotton 90%'], dtype=object)

In [97]:
df1[df1[0] == 'Pocket: Cotton 100%']

Unnamed: 0,0,1,2,3
483,Pocket: Cotton 100%,,,


In [78]:
df1[1].unique()

array([' Cotton 35%', ' Spandex 1%', None, ' Spandex 2%', ' Cotton 37%',
       ' Polyester 20%', ' Polyester 21%', ' Polyester 19%',
       ' Polyester 6%', ' Polyester 7%', ' Elasterell-P 8%'], dtype=object)

In [79]:
df1[2].unique()

array([None, ' Spandex 1%', ' Spandex 2%', ' Modal 7%'], dtype=object)

In [80]:
df1[3].unique()

array([None, ' Spandex 1%'], dtype=object)