# Regex

In [3]:
import re

In [33]:
text01 = 'Rua H-20, 5 - Parque Shalom, São Luís'
text02 = 'Rua Armando Shibata, 97 - Vila Leopoldina, São Paulo'
text03 = 'Av. dos Trabalhadores, 403 - Centro, Mogi Guaçu'
text04 = 'Al. Moraes, 403 - Vila Golmes, Pernambuco'


text = [text01,text02,text03,text04]
#Nome da rua
#Numero
#Bairro
#Cidade

In [19]:
#nome da rua 
regex = '(.+?),'

for t in text:
    print(re.match(regex, t).group(1))

Rua H-20
Rua Armando Shibata
Av. dos Trabalhadores
Al. Moraes


In [15]:
#numero 
regex = '\d+'

for t in text:
    print(re.search(regex, t).group(0))

20
97
403
403


In [31]:
#bairro
regex = '-( \w+.+),'

for t in text:
    print(re.search(regex, t).group(1))

 Parque Shalom
 Vila Leopoldina
 Centro
 Vila Golmes


In [34]:
#bairro
regex = ',(.[a-zA-Z].+)'

for t in text:
    print(re.search(regex, t).group(1))

 São Luís
 São Paulo
 Mogi Guaçu
 Pernambuco


# Cleaning raw data

In [63]:
import numpy as np
import requests
from datetime import datetime
import pandas as pd
import re

In [64]:
data = pd.read_csv('jeans_data_test.csv', encoding='utf-8')
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_time,style_id,color_id,color_name,Fit,Composition,Size,Product safety,More sustainable materials
0,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Black/trashed,Skinny fit,Lining: Polyester 100%,"The model is 184cm/6'0"" and wears a size 31/32",,
1,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Black/trashed,Skinny fit,"Cotton 98%, Spandex 2%","The model is 184cm/6'0"" and wears a size 31/32",,
2,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Light denim blue/trashed,Skinny fit,Lining: Polyester 100%,"The model is 184cm/6'0"" and wears a size 31/32",,
3,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Light denim blue/trashed,Skinny fit,"Cotton 98%, Spandex 2%","The model is 184cm/6'0"" and wears a size 31/32",,
4,690449022.0,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-12-17 13:58:54,690449.0,22.0,Denim blue,Skinny fit,Lining: Polyester 100%,"The model is 184cm/6'0"" and wears a size 31/32",,


In [65]:

#product id
#dropping NaN values
data = data.dropna(subset=['product_id'])
data['product_id'] = data['product_id'].astype(int)

#product_name
data['product_name'] = data['product_name'].apply(lambda x: x.replace(' ', '_').replace('®', '').lower())

#product_price
data['product_price'] = data['product_price'].apply(lambda x: x.replace('$', ''))

#scrapy time
data['scrapy_time'] = pd.to_datetime(data['scrapy_time'], format = '%Y-%m-%d %H:%M:%S')

#style id
data['style_id'] = data['style_id'].astype(int)

#color id
data['color_id'] = data['color_id'].astype(int)

#color name
data['color_name'] = data['color_name'].apply(lambda x: x.replace(' ', '_').replace('/', '_').lower() if pd.notnull(x) else x)

#fit
data['Fit'] = data['Fit'].apply(lambda x: x.replace(' ', '_').lower() if pd.notnull(x) else x)

#size number
data['size_number'] = data['Size'].apply(lambda x: re.search('\d{3}', x).group(0) if pd.notnull(x) else x)


#size model
data['size_model'] = data['Size'].str.extract('(\d+/\\d+)')
data['size_model'] = data['size_model'].apply(lambda x: x.replace('/', '_') if pd.notnull(x) else x)

#dropping Size, Product safety, and More sustainable materials columns
data = data.drop(columns = ['Size', 'Product safety', 'More sustainable materials'], axis = 1)

#composition
data = data[~data['Composition'].str.contains('Shell:', na = False)]
data = data[~data['Composition'].str.contains('Lining:', na = False)]
data = data[~data['Composition'].str.contains('Pocket lining:', na = False)]

#splitting composition column to further analysis
df1 = data['Composition'].str.split(',', expand = True)

#creating new df with columns
# cotton | polyester | spandex | elasterell
df_ref = pd.DataFrame(index = np.arange(len(data)), columns = ['cotton', 'polyester', 'spandex', 'elasterell'])

# concating df1 with df_ref on the respective items of df1
# cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'
df_ref = pd.concat([df_ref, df_cotton], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]

#polyester
df_polyester = df1.loc[df1[1].str.contains('Polyester', na = True), 1]
df_polyester.name = 'polyester'
df_ref = pd.concat([df_ref, df_polyester], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]

#elasterell
df_elasterell = df1.loc[df1[1].str.contains('Elasterell', na = True), 1]
df_elasterell.name = 'elasterell'
df_ref = pd.concat([df_ref, df_elasterell], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]


#spandex
df_spandex1 = df1.loc[df1[1].str.contains('Spandex', na = True), 1]
df_spandex1.name = 'spandex1'
df_spandex2 = df1.loc[df1[2].str.contains('Spandex', na = True), 2]
df_spandex2.name = 'spandex2'
df_ref = pd.concat([df_ref, df_spandex1, df_spandex2], axis = 1)

df_ref = df_ref.assign(**{
    'spandex': df_ref['spandex1'].fillna(df_ref['spandex2'])})

#dropping repeated columns in df_ref
df_ref.drop(columns = ['spandex1', 'spandex2'], inplace = True)

#final join with all characteristics merged
data = pd.concat([data, df_ref], axis = 1)



#getting null indexes to remove them and creating a data frame
#data_cleaned = data[~data['scrapy_time'].isnull()]
#data_cleaned.reset_index(drop=True, inplace=True)

In [66]:
data['cotton'] = data['cotton'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['elasterell'] = data['elasterell'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['spandex'] = data['spandex'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['polyester'] = data['polyester'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)


In [67]:
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_time,style_id,color_id,color_name,Fit,Composition,size_number,size_model,spandex,cotton,polyester,elasterell
0,,,,,NaT,,,,,,,,,,,
1,690449022.0,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449.0,22.0,black_trashed,skinny_fit,"Cotton 98%, Spandex 2%",184,31_32,0.02,0.98,,
2,,,,,NaT,,,,,,,,,,,
3,690449022.0,men_jeans_ripped,skinny_jeans,39.99,2021-12-17 13:58:54,690449.0,22.0,light_denim_blue_trashed,skinny_fit,"Cotton 98%, Spandex 2%",184,31_32,0.02,0.98,,
4,,,,,NaT,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1819,993887004.0,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887.0,4.0,black,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,,0.01,0.79,0.2,
1820,993887004.0,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887.0,4.0,denim_blue,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,,0.01,0.79,0.2,
1821,993887004.0,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887.0,4.0,denim_blue,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,,0.01,0.79,0.2,
1822,993887004.0,men_jeans_regular,hybrid_regular_denim_joggers,39.99,2021-12-17 13:58:54,993887.0,4.0,denim_blue,regular_fit,"Cotton 79%, Polyester 20%, Spandex 1%",,,0.01,0.79,0.2,
