# Webscrapping 1

In [173]:
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests
from datetime import datetime
import pandas as pd

In [183]:
#creating a useragent
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}

In [184]:
#H&M site catalog
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
# id # product_name # product_type # product_color # price
page = requests.get(url, headers = header)
print(page)

<Response [200]>


In [185]:
soup = BeautifulSoup(page.text, 'html.parser')

products = soup.find('ul', class_= 'products-listing small')

product_list = products.find_all('article', class_ = 'hm-product-item')

product_id = [p.get('data-articlecode') for p in product_list]

product_category = [p.get('data-category') for p in product_list]

product_list = products.find_all('a', class_ = 'link')
product_name = [p.get_text() for p in product_list]

product_list = products.find_all('span', class_ = 'price regular')
product_price = [p.get_text() for p in product_list]

data = pd.DataFrame([product_id, product_category, product_name, product_price]).T  #transposição de coluna para linha
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']
data['scrapy_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')


data = data.iloc[:32]
#data.drop(data.index[34:36], axis = 0, inplace = True) #temporary because of site problem requesting the other rows
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_time
0,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-12-12 20:54:19
1,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-12 20:54:19
2,690449043,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-12 20:54:19
3,751994034,men_jeans_slim,Slim Jeans,$ 29.99,2021-12-12 20:54:19
4,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-12 20:54:19
5,730863033,men_jeans_skinny,Skinny Jeans,$ 29.99,2021-12-12 20:54:19
6,751994031,men_jeans_slim,Slim Jeans,$ 29.99,2021-12-12 20:54:19
7,985197004,men_jeans_slim,Slim Jeans,$ 19.99,2021-12-12 20:54:19
8,985159006,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-12-12 20:54:19
9,811993036,men_jeans_regular,Regular Jeans,$ 29.99,2021-12-12 20:54:19


# Webscrapping 2

In [37]:
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests
from datetime import datetime
import pandas as pd
import numpy as np

In [38]:
#creating a useragent
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
page = requests.get(url, headers = header)
soup = BeautifulSoup(page.text, 'html.parser')

In [39]:
total_items = soup.find_all('h2', class_='load-more-heading')[0].get('data-total')

In [40]:
int(total_items)

62

In [43]:
#how many pages to get all data?
page_number = np.round(int(total_items)/36)
page_number

2.0

In [44]:
url2 = url + '?page-size' + str(int(page_number)*36)
url2

'https://www2.hm.com/en_us/men/products/jeans.html?page-size72'

# Webscrapping 3

In [90]:
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests
from datetime import datetime
import pandas as pd
import numpy as np

In [91]:
#one single product test

In [123]:
#creating a useragent
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}
#API Request
url = 'https://www2.hm.com/en_us/productpage.0985197001.html'
page = requests.get(url, headers = header)
#BeautifulSoup object
soup = BeautifulSoup(page.text, 'html.parser')

In [93]:
########################   color name   ####################################
#including first prod (class filter-option miniature active)

product_list = soup.find_all('a', class_ = 'filter-option miniature active') + soup.find_all('a', class_ = 'filter-option miniature')
color_item = [p.get('data-color') for p in product_list]

#product id
product_id = [p.get('data-articlecode') for p in product_list]

#creating data frame with product id+color name
df_color = pd.DataFrame((product_id, color_item)).T
df_color.columns = ['product_id', 'color_name'] #renaming columns

#generate style id + color id
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

#####################     composition     #################################3
product_composition_list = soup.find_all('div', class_ = 'pdp-description-list-item')

product_composition = [list(filter(None, p.get_text().split("\n"))) for p in product_composition_list]

df_composition = pd.DataFrame(product_composition).T

#renaming labels
df_composition.columns = df_composition.iloc[0]

#deleting first row
df_composition = df_composition.iloc[1:].fillna(method = 'ffill')

#generate style id +color id

df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

#merge df_color and df_composition
df_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how = 'left', on = 'style_id')

In [96]:
df_sku

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition
0,985197001,Black,985197,1,Slim fit,Pocket lining: Cotton 100%
1,985197001,Black,985197,1,Slim fit,"Shell: Cotton 98%, Spandex 2%"
2,985197002,Midnight blue,985197,2,Slim fit,Pocket lining: Cotton 100%
3,985197002,Midnight blue,985197,2,Slim fit,"Shell: Cotton 98%, Spandex 2%"
4,985197003,Denim blue,985197,3,Slim fit,Pocket lining: Cotton 100%
5,985197003,Denim blue,985197,3,Slim fit,"Shell: Cotton 98%, Spandex 2%"
6,985197004,Dark denim blue,985197,4,Slim fit,Pocket lining: Cotton 100%
7,985197004,Dark denim blue,985197,4,Slim fit,"Shell: Cotton 98%, Spandex 2%"
8,985197005,Dark denim blue,985197,5,Slim fit,Pocket lining: Cotton 100%
9,985197005,Dark denim blue,985197,5,Slim fit,"Shell: Cotton 98%, Spandex 2%"


# Webscrapping 4

In [70]:
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests
from datetime import datetime
import pandas as pd
import numpy as np

In [190]:
#creating a useragent
#ua = UserAgent()
#header = {'User-Agent':str(ua.chrome)}

#empty dataframe
df_details = pd.DataFrame()

#unique columns for all products
aux = []

cols = ['Art. No.',
 'Composition',
 'Fit',
 'More sustainable materials',
 'Product safety',
 'Size']

df_pattern = pd.DataFrame(columns = cols)


for i in range(len(data)):  #an item has had a prob in loading
    #API Request
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
    page = requests.get(url, headers = header)
    #print(page)
    
    #BeautifulSoup object
    soup = BeautifulSoup(page.text, 'html.parser')
    
    
    ########################   color name   ####################################
    #including first prod (class filter-option miniature active)

    product_list = soup.find_all('a', class_ = 'filter-option miniature active') + soup.find_all('a', class_ = 'filter-option miniature')
    color_item = [p.get('data-color') for p in product_list]

    #product id
    product_id = [p.get('data-articlecode') for p in product_list]

    #creating data frame with product id+color name
    df_color = pd.DataFrame((product_id, color_item)).T
    df_color.columns = ['product_id', 'color_name'] #renaming columns

    #generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

    #####################     composition     #################################3
    product_composition_list = soup.find_all('div', class_ = 'pdp-description-list-item')

    product_composition = [list(filter(None, p.get_text().split("\n"))) for p in product_composition_list]

    df_composition = pd.DataFrame(product_composition).T

    #renaming labels
    df_composition.columns = df_composition.iloc[0]

    #deleting first row
    df_composition = df_composition.iloc[1:].fillna(method = 'ffill')

    #guarantee the same number of columns
    df_composition = pd.concat([df_pattern, df_composition], axis = 0)
    
    #generate style id +color id

    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

    
    aux = aux + df_composition.columns.tolist() 
    
    #merge df_color and df_composition
    df_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'Size', 'Product safety', 'More sustainable materials']], how = 'left', on = 'style_id')
    
    df_details = pd.concat([df_details, df_sku], axis = 0)

In [191]:
df_details

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition,Size,Product safety,More sustainable materials
0,0985197001,Black,0985197,001,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",,
1,0985197001,Black,0985197,001,Slim fit,"Shell: Cotton 98%, Spandex 2%","The model is 189cm/6'2"" and wears a size 32/32",,
2,0985197002,Midnight blue,0985197,002,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",,
3,0985197002,Midnight blue,0985197,002,Slim fit,"Shell: Cotton 98%, Spandex 2%","The model is 189cm/6'2"" and wears a size 32/32",,
4,0985197003,Denim blue,0985197,003,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",,
...,...,...,...,...,...,...,...,...,...
1,1004476001,Light denim blue,1004476,001,Slim fit,"Cotton 90%, Elasterell-P 8%, Spandex 2%",,,
2,1004476003,Denim blue,1004476,003,Slim fit,"Cotton 90%, Elasterell-P 8%, Spandex 2%",,,
3,1004476004,Dark denim blue,1004476,004,Slim fit,"Cotton 90%, Elasterell-P 8%, Spandex 2%",,,
4,1004476005,Black/No fade black,1004476,005,Slim fit,"Cotton 90%, Elasterell-P 8%, Spandex 2%",,,


In [192]:
# join showroom data + details

data['style_id'] = data['product_id'].apply(lambda x: x[:-3])
data['color_id'] = data['product_id'].apply(lambda x: x[-3:])
data


Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_time,style_id,color_id
0,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-12-12 20:54:19,985197,1
1,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-12 20:54:19,690449,51
2,690449043,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-12 20:54:19,690449,43
3,751994034,men_jeans_slim,Slim Jeans,$ 29.99,2021-12-12 20:54:19,751994,34
4,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-12 20:54:19,690449,22
5,730863033,men_jeans_skinny,Skinny Jeans,$ 29.99,2021-12-12 20:54:19,730863,33
6,751994031,men_jeans_slim,Slim Jeans,$ 29.99,2021-12-12 20:54:19,751994,31
7,985197004,men_jeans_slim,Slim Jeans,$ 19.99,2021-12-12 20:54:19,985197,4
8,985159006,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-12-12 20:54:19,985159,6
9,811993036,men_jeans_regular,Regular Jeans,$ 29.99,2021-12-12 20:54:19,811993,36


In [199]:
data_raw = pd.merge(data, df_details[['style_id', 'color_name', 'Fit', 'Composition', 'Size', 'Product safety', 'More sustainable materials']], how = 'left', on = 'style_id')

# Testing

In [18]:


#creating a useragent
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}

#empty dataframe
df_details = pd.DataFrame()

#unique columns for all products
aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'More sustainable materials', 'Size']
df_pattern = pd.DataFrame(columns = cols)


for i in range(len(data)):
    #API Request
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i,'product_id'] + '.html'
    page = requests.get(url, headers = header)

    #BeautifulSoup object
    soup = BeautifulSoup(page.text, 'html.parser')

    ########################   color name   ####################################
    #including first prod (class filter-option miniature active)

    product_list = soup.find_all('a', class_ = 'filter-option miniature active') + soup.find_all('a', class_ = 'filter-option miniature')
    color_item = [p.get('data-color') for p in product_list]

    #product id
    product_id = [p.get('data-articlecode') for p in product_list]

    #creating data frame with product id+color name
    df_color = pd.DataFrame((product_id, color_item)).T
    df_color.columns = ['product_id', 'color_name'] #renaming columns

    #generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

    #####################     composition     #################################3
    product_composition_list = soup.find_all('div', class_ = 'pdp-description-list-item')

    product_composition = [list(filter(None, p.get_text().split("\n"))) for p in product_composition_list]

    df_composition = pd.DataFrame(product_composition).T

    #renaming labels
    df_composition.columns = df_composition.iloc[0]

    #deleting first row
    df_composition = df_composition.iloc[1:].fillna(method = 'ffill')

    #guarantee the same number of columns
    df_composition = pd.concat([df_pattern, df_composition], axis = 0)
    
    #generate style id +color id

    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

    aux = aux + df_composition.columns.tolist()
    
    #merge df_color and df_composition
    df_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'Product safety', 'Size']], how = 'left', on = 'style_id')
    
    #all details products
    df_details = pd.concat([df_details, df_sku], axis = 0)

IndexError: single positional indexer is out-of-bounds

In [83]:
df_details.head()

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition,Size,Product safety
0,985197001,Black,985197,1,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",
1,985197001,Black,985197,1,Slim fit,"Shell: Cotton 98%, Spandex 2%","The model is 189cm/6'2"" and wears a size 32/32",
2,985197002,Midnight blue,985197,2,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",
3,985197002,Midnight blue,985197,2,Slim fit,"Shell: Cotton 98%, Spandex 2%","The model is 189cm/6'2"" and wears a size 32/32",
4,985197003,Denim blue,985197,3,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",


In [57]:
df_pattern

Unnamed: 0,Art. No.,Composition,Fit,Product safety,More sustainable materials,Size
