# Imports

In [47]:
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import pandas as pd
import re

# Data Collection

In [48]:
#creating a useragent
#parameters
header = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}

In [50]:
#H&M site catalog URL
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
#Request to URL
page = requests.get(url, headers = header)
print(page)

# Beautiful soup object
soup = BeautifulSoup( page.text, 'html.parser' )

# ===================== Product Data ============================
products = soup.find( 'ul', class_='products-listing small' )
product_list = products.find_all( 'article', class_='hm-product-item')
                                 
# product id
product_id = [p.get( 'data-articlecode' ) for p in product_list]
                                 
# product category
product_category = [p.get( 'data-category' ) for p in product_list]
                                 
# product name
product_list = products.find_all( 'a', class_='link' )
product_name = [p.get_text() for p in product_list]
                                 
# price
product_list = products.find_all( 'span', class_='price regular' )
product_price = [p.get_text() for p in product_list]
data = pd.DataFrame( [product_id, product_category, product_name, product_price] ).T
                                 
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

<Response [200]>


In [51]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99
1,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99
2,690449036,men_jeans_ripped,Skinny Jeans,$ 39.99
3,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99
4,690449043,men_jeans_ripped,Skinny Jeans,$ 39.99


In [52]:
data.shape

(36, 4)

In [59]:

#empty dataframe
df_compositions = pd.DataFrame()

#unique columns for all products
aux = []

cols = ['Art. No.',
 'Composition',
 'Fit',
 'Product safety',
 'Size',
 'More sustainable materials']

df_pattern = pd.DataFrame(columns = cols)


for i in range(len(data)):  
    #API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
    print('Product: {}'.format(url))
    
    page = requests.get(url, headers = header)


    #BeautifulSoup object
    soup = BeautifulSoup(page.text, 'html.parser')

    product_list = soup.find_all('a', class_ = 'filter-option miniature active') + soup.find_all('a', class_ = 'filter-option miniature')
    color_item = [p.get('data-color') for p in product_list]

    #product id
    product_id = [p.get('data-articlecode') for p in product_list]

    #creating data frame with product id+color name
    df_color = pd.DataFrame((product_id, color_item)).T
    df_color.columns = ['product_id', 'color_name'] #renaming columns
    
    for j in range(len(df_color)):
        
        #API Requests
        url = 'https://www2.hm.com/en_us/productpage.' + df_color.loc[j, 'product_id'] + '.html'
        print('Color: {}'.format(url))
        page = requests.get(url, headers = header)

        #BeautifulSoup object
        soup = BeautifulSoup(page.text, 'html.parser')
        
        ########################  PRODUCT NAME  ###################################
        product_name = soup.find_all('h1', class_ = 'primary product-item-headline')
        product_name = product_name[0].get_text()
        
        ########################  PRODUCT PRICE  ###################################
        product_price = soup.find_all('div', class_ = 'primary-row product-item-price')
        product_price = re.findall(r'\d+\.?\d+', product_price[0].get_text())[0]
        
        #######################  COMPOSITION  ######################################
    
        product_composition_list = soup.find_all('div', class_ = 'pdp-description-list-item')
        product_composition = [list(filter(None, p.get_text().split("\n"))) for p in product_composition_list]

        #renaming labels
        df_composition = pd.DataFrame(product_composition).T
        df_composition.columns = df_composition.iloc[0]

        #deleting first row
        df_composition = df_composition.iloc[1:].fillna(method = 'ffill')

        #remove pocket lining, shell and lining
        df_composition['Composition'] = df_composition['Composition'].str.replace('Pocket lining: ', '', regex = True)
        df_composition['Composition'] = df_composition['Composition'].str.replace('Shell: ', '', regex = True)
        df_composition['Composition'] = df_composition['Composition'].str.replace('Lining: ', '', regex = True)
        
        #guarantee the same number of columns
        df_composition = pd.concat([df_pattern, df_composition], axis = 0)
        
        #rename columns
        df_composition.columns = ['product_id', 'composition', 'fit', 'product_safety', 'size', 'sustainable_materials']

        #keep new columns if they show up
        aux = aux + df_composition.columns.tolist() 

        #merge df_color and df_composition
        df_composition = pd.merge( df_composition, df_color, how = 'left', on = 'product_id')

        #all products
        df_compositions = pd.concat([df_compositions, df_composition], axis = 0)
        
# join showroom data + details
df_compositions['style_id'] = df_compositions['product_id'].apply(lambda x: x[:-3])
df_compositions['color_id'] = df_compositions['product_id'].apply(lambda x: x[-3:])

df_compositions['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

Product: https://www2.hm.com/en_us/productpage.1024256001.html
Color: https://www2.hm.com/en_us/productpage.1024256001.html
Color: https://www2.hm.com/en_us/productpage.1024256003.html
Color: https://www2.hm.com/en_us/productpage.1024256004.html
Color: https://www2.hm.com/en_us/productpage.1024256005.html
Color: https://www2.hm.com/en_us/productpage.1024256006.html
Color: https://www2.hm.com/en_us/productpage.1024256007.html
Product: https://www2.hm.com/en_us/productpage.0985159001.html
Color: https://www2.hm.com/en_us/productpage.0985159001.html
Color: https://www2.hm.com/en_us/productpage.0985159002.html
Color: https://www2.hm.com/en_us/productpage.0985159003.html
Color: https://www2.hm.com/en_us/productpage.0985159004.html
Color: https://www2.hm.com/en_us/productpage.0985159005.html
Color: https://www2.hm.com/en_us/productpage.0985159006.html
Color: https://www2.hm.com/en_us/productpage.0985159007.html
Color: https://www2.hm.com/en_us/productpage.0985159008.html
Product: https://www

Color: https://www2.hm.com/en_us/productpage.0875105017.html
Color: https://www2.hm.com/en_us/productpage.0875105018.html
Color: https://www2.hm.com/en_us/productpage.0875105023.html
Product: https://www2.hm.com/en_us/productpage.0875105023.html
Color: https://www2.hm.com/en_us/productpage.0875105023.html
Color: https://www2.hm.com/en_us/productpage.0875105001.html
Color: https://www2.hm.com/en_us/productpage.0875105002.html
Color: https://www2.hm.com/en_us/productpage.0875105003.html
Color: https://www2.hm.com/en_us/productpage.0875105009.html
Color: https://www2.hm.com/en_us/productpage.0875105011.html
Color: https://www2.hm.com/en_us/productpage.0875105015.html
Color: https://www2.hm.com/en_us/productpage.0875105016.html
Color: https://www2.hm.com/en_us/productpage.0875105017.html
Color: https://www2.hm.com/en_us/productpage.0875105018.html
Color: https://www2.hm.com/en_us/productpage.0875105024.html
Product: https://www2.hm.com/en_us/productpage.1004199005.html
Color: https://www2.

Color: https://www2.hm.com/en_us/productpage.1008549008.html
Product: https://www2.hm.com/en_us/productpage.0985197006.html
Color: https://www2.hm.com/en_us/productpage.0985197006.html
Color: https://www2.hm.com/en_us/productpage.0985197001.html
Color: https://www2.hm.com/en_us/productpage.0985197002.html
Color: https://www2.hm.com/en_us/productpage.0985197003.html
Color: https://www2.hm.com/en_us/productpage.0985197004.html
Color: https://www2.hm.com/en_us/productpage.0985197005.html
Color: https://www2.hm.com/en_us/productpage.0985197007.html
Product: https://www2.hm.com/en_us/productpage.0427159006.html
Color: https://www2.hm.com/en_us/productpage.0427159006.html
Color: https://www2.hm.com/en_us/productpage.0427159001.html
Color: https://www2.hm.com/en_us/productpage.0427159002.html
Color: https://www2.hm.com/en_us/productpage.0427159003.html
Color: https://www2.hm.com/en_us/productpage.0427159004.html
Color: https://www2.hm.com/en_us/productpage.0427159005.html
Color: https://www2.

In [61]:
df_compositions

Unnamed: 0,product_id,composition,fit,product_safety,size,sustainable_materials,color_name,style_id,color_id,scrapy_datetime
0,1024256001,"Polyester 65%, Cotton 35%",Slim fit,,"The model is 185cm/6'1"" and wears a size 31/32",,Black,1024256,001,2022-01-10 13:49:33
1,1024256001,"Cotton 99%, Spandex 1%",Slim fit,,"The model is 185cm/6'1"" and wears a size 31/32",,Black,1024256,001,2022-01-10 13:49:33
0,1024256003,"Cotton 99%, Spandex 1%",Slim fit,,"The model is 189cm/6'2"" and wears a size 31/32",,Light denim blue,1024256,003,2022-01-10 13:49:33
1,1024256003,"Polyester 65%, Cotton 35%",Slim fit,,"The model is 189cm/6'2"" and wears a size 31/32",,Light denim blue,1024256,003,2022-01-10 13:49:33
0,1024256004,"Cotton 99%, Spandex 1%",Slim fit,,,,Denim blue,1024256,004,2022-01-10 13:49:33
...,...,...,...,...,...,...,...,...,...,...
0,1013317005,"Cotton 79%, Polyester 19%, Spandex 2%",Regular fit,,"The model is 188cm/6'2"" and wears a size M",Recycled cotton 20%,Dark gray,1013317,005,2022-01-10 13:49:33
1,1013317005,"Cotton 79%, Polyester 19%, Spandex 2%",Regular fit,,"The model is 188cm/6'2"" and wears a size M",Recycled polyester 19%,Dark gray,1013317,005,2022-01-10 13:49:33
0,1013317006,"Cotton 79%, Polyester 20%, Spandex 1%",Regular fit,,"The model is 187cm/6'2"" and wears a size L",,Black,1013317,006,2022-01-10 13:49:33
0,1013317008,"Cotton 77%, Polyester 21%, Spandex 2%",Regular fit,,"The model is 182cm/6'0"" and wears a size M",,Dark blue,1013317,008,2022-01-10 13:49:33


In [22]:
df_color

Unnamed: 0,product_id,color_name
0,1024256001,Black
1,1024256003,Light denim blue
2,1024256004,Denim blue
3,1024256005,Dark blue
4,1024256006,Dark denim blue
5,1024256007,Dark gray


# Data Collection by Product

# Data Cleaning

In [None]:

#product id
#dropping NaN values
data = data.dropna(subset=['product_id'])
data['product_id'] = data['product_id'].astype(int)

#product_name
data['product_name'] = data['product_name'].apply(lambda x: x.replace(' ', '_').replace('®', '').lower())

#product_price
data['product_price'] = data['product_price'].apply(lambda x: x.replace('$', ''))

#scrapy time
data['scrapy_time'] = pd.to_datetime(data['scrapy_time'], format = '%Y-%m-%d %H:%M:%S')

#style id
data['style_id'] = data['style_id'].astype(int)

#color id
data['color_id'] = data['color_id'].astype(int)

#color name
data['color_name'] = data['color_name'].apply(lambda x: x.replace(' ', '_').replace('/', '_').lower() if pd.notnull(x) else x)

#fit
data['Fit'] = data['Fit'].apply(lambda x: x.replace(' ', '_').lower() if pd.notnull(x) else x)

#size number
data['size_number'] = data['Size'].apply(lambda x: re.search('\d{3}', x).group(0) if pd.notnull(x) else x)


#size model
data['size_model'] = data['Size'].str.extract('(\d+/\\d+)')
data['size_model'] = data['size_model'].apply(lambda x: x.replace('/', '_') if pd.notnull(x) else x)

#composition
data = data[~data['Composition'].str.contains('Shell:', na = False)]
data = data[~data['Composition'].str.contains('Lining:', na = False)]
data = data[~data['Composition'].str.contains('Pocket lining:', na = False)]

#drop duplicated cells
data = data.drop_duplicates()
#(subset=['product_id', 'product_category', 'product_name', 'product_price', 'scrapy_time', 'style_id', 'color_id', 'color_name', 'Fit', 'Composition', 'size_number', 'size_model'], keep='last' )

#reset index
data = data.reset_index(drop = True)


#splitting composition column to further analysis
df1 = data['Composition'].str.split(',', expand = True)

#creating new df with columns
# cotton | polyester | spandex | elasterell
df_ref = pd.DataFrame(index = np.arange(len(data)), columns = ['cotton', 'polyester', 'spandex', 'elasterell'])

# concating df1 with df_ref on the respective items of df1
# cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'
df_ref = pd.concat([df_ref, df_cotton], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]


#polyester
df_polyester = df1.loc[df1[1].str.contains('Polyester', na = True), 1]
df_polyester.name = 'polyester'
df_ref = pd.concat([df_ref, df_polyester], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]
df_ref['polyester'] = df_ref['polyester'].fillna('Polyester 0%')

#spandex
df_spandex = df1.loc[df1[1].str.contains('Spandex', na = True), 1]
df_spandex.name = 'spandex'

#combining spandex from columns 1 and 2
df_spandex = df_spandex.combine_first(df1[2])

df_ref = pd.concat([df_ref, df_spandex], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]

#fill NaN values
#df_ref['spandex'] = df_ref['spandex'].fillna('Spandex 0%')

#elasterell
df_elasterell = df1.loc[df1[1].str.contains('Elasterell', na = True), 1]
df_elasterell.name = 'elasterell'
df_ref = pd.concat([df_ref, df_elasterell], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]
df_ref['elasterell'] = df_ref['elasterell'].fillna('Elasterell 0%')

#final join with all characteristics merged
data = pd.concat([data, df_ref], axis = 1)

#format composition data
data['cotton'] = data['cotton'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['elasterell'] = data['elasterell'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['spandex'] = data['spandex'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['polyester'] = data['polyester'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)


#dropping columns
data = data.drop(columns = ['Size', 'Product safety', 'More sustainable materials', 'Composition'], axis = 1)

data = data.drop_duplicates()
