# Regex

In [3]:
import re

In [33]:
text01 = 'Rua H-20, 5 - Parque Shalom, São Luís'
text02 = 'Rua Armando Shibata, 97 - Vila Leopoldina, São Paulo'
text03 = 'Av. dos Trabalhadores, 403 - Centro, Mogi Guaçu'
text04 = 'Al. Moraes, 403 - Vila Golmes, Pernambuco'


text = [text01,text02,text03,text04]
#Nome da rua
#Numero
#Bairro
#Cidade

In [19]:
#nome da rua 
regex = '(.+?),'

for t in text:
    print(re.match(regex, t).group(1))

Rua H-20
Rua Armando Shibata
Av. dos Trabalhadores
Al. Moraes


In [15]:
#numero 
regex = '\d+'

for t in text:
    print(re.search(regex, t).group(0))

20
97
403
403


In [31]:
#bairro
regex = '-( \w+.+),'

for t in text:
    print(re.search(regex, t).group(1))

 Parque Shalom
 Vila Leopoldina
 Centro
 Vila Golmes


In [34]:
#bairro
regex = ',(.[a-zA-Z].+)'

for t in text:
    print(re.search(regex, t).group(1))

 São Luís
 São Paulo
 Mogi Guaçu
 Pernambuco


# Cleaning raw data

In [80]:
from bs4 import BeautifulSoup
import numpy as np
import requests
from datetime import datetime
import pandas as pd
import re

In [50]:
data = pd.read_csv('jeans_data_test.csv', encoding='utf-8')
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_time,style_id,color_id,color_name,Fit,Composition,Size,Product safety,More sustainable materials
0,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Black/trashed,Skinny fit,Lining: Polyester 100%,"The model is 184cm/6'0"" and wears a size 31/32",,
1,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Black/trashed,Skinny fit,"Cotton 98%, Spandex 2%","The model is 184cm/6'0"" and wears a size 31/32",,
2,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Light denim blue/trashed,Skinny fit,Lining: Polyester 100%,"The model is 184cm/6'0"" and wears a size 31/32",,
3,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Light denim blue/trashed,Skinny fit,"Cotton 98%, Spandex 2%","The model is 184cm/6'0"" and wears a size 31/32",,
4,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Denim blue,Skinny fit,Lining: Polyester 100%,"The model is 184cm/6'0"" and wears a size 31/32",,


In [3]:
#checking NaN values
data['product_id'].unique()

array([6.90449022e+08, 6.90449043e+08, 6.90449036e+08, 6.90449051e+08,
       9.85197001e+08, 6.36207011e+08, 7.51994034e+08, 1.00447601e+09,
       9.85159005e+08, 4.27159006e+08, 6.90449056e+08, 9.85159002e+08,
       1.02886500e+09, 9.85159001e+08, 8.11993036e+08,            nan,
       9.85159006e+08, 9.85197006e+08, 7.30863033e+08, 7.30863005e+08,
       1.00419900e+09, 9.38875007e+08, 1.01331700e+09, 7.51994024e+08,
       1.00447600e+09, 9.85197003e+08, 9.85159004e+08, 1.00419900e+09,
       1.00447600e+09, 9.74597006e+08, 1.01331701e+09, 9.85197007e+08,
       1.00419900e+09, 1.01331701e+09, 1.00854900e+09, 1.00447600e+09,
       9.93887004e+08])

In [4]:
data.isna().sum()

product_id                      15
product_category                 0
product_name                    15
product_price                   15
scrapy_time                     15
style_id                        15
color_id                        15
color_name                      15
Fit                             15
Composition                     15
Size                           615
Product safety                1809
More sustainable materials    1047
dtype: int64

In [51]:

#product id
#dropping NaN values
data = data.dropna(subset=['product_id'])
data['product_id'] = data['product_id'].astype(int)

#product_name
data['product_name'] = data['product_name'].apply(lambda x: x.replace(' ', '_').replace('®', '').lower())

#product_price
data['product_price'] = data['product_price'].apply(lambda x: x.replace('$', ''))

#scrapy time
data['scrapy_time'] = pd.to_datetime(data['scrapy_time'], format = '%Y-%m-%d %H:%M:%S')

#style id
data['style_id'] = data['style_id'].astype(int)

#color id
data['color_id'] = data['color_id'].astype(int)

#color name
data['color_name'] = data['color_name'].apply(lambda x: x.replace(' ', '_').replace('/', '_').lower() if pd.notnull(x) else x)

#fit
data['Fit'] = data['Fit'].apply(lambda x: x.replace(' ', '_').lower() if pd.notnull(x) else x)

#size number
data['size_number'] = data['Size'].apply(lambda x: re.search('\d{3}', x).group(0) if pd.notnull(x) else x)


#size model
data['size_model'] = data['Size'].str.extract('(\d+/\\d+)')
data['size_model'] = data['size_model'].apply(lambda x: x.replace('/', '_') if pd.notnull(x) else x)

#dropping Size, Product safety, and More sustainable materials columns
data = data.drop(columns = ['Size', 'Product safety', 'More sustainable materials'], axis = 1)




In [52]:
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_time,style_id,color_id,color_name,Fit,Composition,size_number,size_model
0,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,black_trashed,skinny_fit,Lining: Polyester 100%,184,31_32
1,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,black_trashed,skinny_fit,"Cotton 98%, Spandex 2%",184,31_32
2,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,light_denim_blue_trashed,skinny_fit,Lining: Polyester 100%,184,31_32
3,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,light_denim_blue_trashed,skinny_fit,"Cotton 98%, Spandex 2%",184,31_32
4,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,denim_blue,skinny_fit,Lining: Polyester 100%,184,31_32
...,...,...,...,...,...,...,...,...,...,...,...,...
1819,993887004,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887,4,black,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,
1820,993887004,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887,4,denim_blue,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,
1821,993887004,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887,4,denim_blue,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,
1822,993887004,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887,4,denim_blue,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,


In [46]:
data['Composition'].unique()

array(['Lining: Polyester 100%', 'Cotton 98%, Spandex 2%',
       'Pocket lining: Polyester 65%, Cotton 35%',
       'Pocket lining: Cotton 100%', 'Shell: Cotton 98%, Spandex 2%',
       'Shell: Cotton 99%, Spandex 1%',
       'Cotton 89%, Polyester 10%, Spandex 1%', 'Cotton 99%, Spandex 1%',
       'Shell: Cotton 90%, Elasterell-P 8%, Spandex 2%',
       'Cotton 90%, Elasterell-P 8%, Spandex 2%',
       'Cotton 93%, Polyester 6%, Spandex 1%', 'Shell: Cotton 100%',
       'Cotton 78%, Polyester 21%, Spandex 1%',
       'Cotton 79%, Polyester 20%, Spandex 1%',
       'Cotton 77%, Polyester 21%, Spandex 2%'], dtype=object)

In [73]:
#data['Composition'].unique()
#composition
data= data[~data['Composition'].str.contains('Shell:', na = False)]
data = data[~data['Composition'].str.contains('Lining:', na = False)]
data = data[~data['Composition'].str.contains('Pocket lining:', na = False)]


In [75]:
df1 = data['Composition'].str.split(',', expand = True)

In [78]:
df1[1].unique()

array([' Spandex 2%', ' Polyester 10%', ' Spandex 1%', ' Elasterell-P 8%',
       ' Polyester 6%', ' Polyester 21%', ' Polyester 20%'], dtype=object)

In [85]:
#creating new df with columns
# cotton | polyester | elastane | elasterell
f_ref = pd.DataFrame(index = np.arange(len(data)), columns = ['cotton', 'polyester', 'elastane', 'elasterell'])

# concating df1 with df_ref on the respective items of df1
# cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'
df_ref = pd.concat([df_ref, df_cotton], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]

#polyester

In [86]:
df_ref

Unnamed: 0,polyester,elastane,elasterell,cotton
0,,,,
1,,,,Cotton 98%
2,,,,
3,,,,Cotton 98%
4,,,,
...,...,...,...,...
1819,,,,Cotton 79%
1820,,,,Cotton 79%
1821,,,,Cotton 79%
1822,,,,Cotton 79%


In [25]:
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_time,style_id,color_id,color_name,Fit,Composition,size_number,size_model
1,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,black_trashed,skinny_fit,"Cotton 98%, Spandex 2%",184,31_32
3,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,light_denim_blue_trashed,skinny_fit,"Cotton 98%, Spandex 2%",184,31_32
5,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,denim_blue,skinny_fit,"Cotton 98%, Spandex 2%",184,31_32
7,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,black_washed,skinny_fit,"Cotton 98%, Spandex 2%",184,31_32
9,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449,22,light_denim_blue,skinny_fit,"Cotton 98%, Spandex 2%",184,31_32
...,...,...,...,...,...,...,...,...,...,...,...,...
1819,993887004,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887,4,black,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,
1820,993887004,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887,4,denim_blue,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,
1821,993887004,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887,4,denim_blue,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,
1822,993887004,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887,4,denim_blue,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,
