In [6]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [7]:
# Location of the SQLite database file.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
path = '/home/joao/projetos/dsaodev/projcode/executables/'
database_name = 'database_hm.sqlite'

# SQLAlchemy engine pointing at the local SQLite file (statement echoing disabled).
db_url = f'sqlite:///{path}{database_name}'
conn = create_engine(db_url, echo=False)

In [8]:
# SQL statement that pulls every column and every row of the `vitrine` table.
query = '''
    SELECT * FROM vitrine
    '''

In [9]:
# Run the query through the engine and load the result set into a DataFrame.
df_raw = pd.read_sql(sql=query, con=conn)

In [10]:
# Quick look at the table as loaded from the database.
df_raw

Unnamed: 0,product_id,style_id,color_id,product_name,color_name,fit,product_price,size_number,size_model,cotton,polyester,spandex,elasterell,scrapy_datetime
0,1024256001,1024256,001,slim_jeans,black,slim_fit,19.99,185,31_32,0.99,0.65,0.01,0.0,2022-01-17 13:24:06
1,1024256002,1024256,002,slim_jeans,light_denim_blue,slim_fit,19.99,,,0.99,0.65,0.01,0.0,2022-01-17 13:24:06
2,1024256003,1024256,003,slim_jeans,light_denim_blue,slim_fit,19.99,189,31_32,0.99,0.65,0.01,0.0,2022-01-17 13:24:06
3,1024256004,1024256,004,slim_jeans,denim_blue,slim_fit,19.99,,,0.99,0.65,0.01,0.0,2022-01-17 13:24:06
4,1024256005,1024256,005,slim_jeans,dark_blue,slim_fit,19.99,,,0.99,0.65,0.01,0.0,2022-01-17 13:24:06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,0751994016,0751994,016,slim_jeans,dark_blue,slim_fit,15.99,,,0.98,0.00,0.02,0.0,2022-01-17 13:24:06
99,0751994017,0751994,017,slim_jeans,dark_denim_blue,slim_fit,16.99,,,0.98,0.00,0.02,0.0,2022-01-17 13:24:06
100,0751994018,0751994,018,slim_jeans,denim_blue,slim_fit,20.99,,,0.98,0.00,0.02,0.0,2022-01-17 13:24:06
101,0751994031,0751994,031,slim_jeans,light_blue,slim_fit,14.99,187,31_32,0.99,0.00,0.01,0.0,2022-01-17 13:24:06


In [30]:
# Distinct values present in the `elasterell` column.
df_raw['elasterell'].unique()

array([0.  , 0.08, 0.09])

# Data Dimension

In [12]:
# Work on an independent (deep) copy so that df_raw itself is never modified.
df01 = df_raw.copy(deep=True)

In [13]:
# Report the dimensions of the working frame.
n_rows, n_cols = df01.shape
print(f'Number of rows {n_rows}')
print(f'Number of columns {n_cols}')

Number of rows 103
Number of columns 14


# Data Types

In [14]:
# Parse the scrape timestamp into a pandas datetime column (overwrites the column on df01).
df01['scrapy_datetime'] = pd.to_datetime(df01['scrapy_datetime'])

In [15]:
# Inspect the dtype of every column.
df01.dtypes

product_id                 object
style_id                   object
color_id                   object
product_name               object
color_name                 object
fit                        object
product_price             float64
size_number                object
size_model                 object
cotton                    float64
polyester                 float64
spandex                   float64
elasterell                float64
scrapy_datetime    datetime64[ns]
dtype: object

# NaN Identification

In [16]:
# Number of missing values per column.
df01.isna().sum()

product_id          0
style_id            0
color_id            0
product_name        0
color_name          0
fit                 0
product_price       0
size_number        59
size_model         65
cotton              0
polyester           0
spandex             0
elasterell          0
scrapy_datetime     0
dtype: int64

In [17]:
# Fraction of missing values per column: the mean of the boolean NaN mask
# equals the NaN count divided by the number of rows.
df01.isna().mean()

product_id         0.000000
style_id           0.000000
color_id           0.000000
product_name       0.000000
color_name         0.000000
fit                0.000000
product_price      0.000000
size_number        0.572816
size_model         0.631068
cotton             0.000000
polyester          0.000000
spandex            0.000000
elasterell         0.000000
scrapy_datetime    0.000000
dtype: float64

# Missing Values Substitution

In [18]:
# Discard the two size columns, then drop any row that still contains a NaN.
size_cols = ['size_number', 'size_model']
df_aux01 = df01.drop(columns=size_cols).dropna()

# Resulting (rows, columns).
df_aux01.shape

(103, 12)

# Data Description

In [19]:
# Sanity check on the type of the object bound to df01.
type(df01)

pandas.core.frame.DataFrame

In [20]:
# Split the columns by dtype for the descriptive analysis.
# Numerical attributes: integer and float columns.
num_attributes = df01.select_dtypes(include=['int64', 'float64'])

# Categorical attributes: every column that is neither numeric nor datetime.
# This must use `exclude`; passing the same list to `include` would select
# the numeric and datetime columns instead of the categorical ones.
cat_attributes = df01.select_dtypes(exclude=['int64', 'float64', 'datetime64[ns]'])

In [31]:
# Central tendency and dispersion of the numerical attributes.
d1 = num_attributes.describe()

# Each extra statistic becomes a single-row frame (row label 0) so it can be
# stacked underneath the describe() output.
d2 = (num_attributes.max() - num_attributes.min()).to_frame().T  # value range
d3 = num_attributes.apply(pd.Series.skew).to_frame().T
d4 = num_attributes.apply(pd.Series.kurtosis).to_frame().T

# Stack everything, flip to one row per attribute, then label the columns.
# The labels follow describe()'s row order, then range, skew, kurtosis.
summary_columns = [
    'attributes',
    'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max',
    'range', 'skew', 'kurtosis',
]
m1 = pd.concat([d1, d2, d3, d4]).T.reset_index()
m1.columns = summary_columns

In [32]:
# Show the summary table (one row per numerical attribute).
m1

Unnamed: 0,attributes,count,mean,std,min,25%,50%,75%,max,range,skew,kurtosis
0,product_price,103.0,28.048252,10.543863,7.99,19.99,24.99,34.99,59.99,52.0,0.777812,0.699877
1,cotton,103.0,0.975243,0.054536,0.77,0.98,0.99,1.0,1.0,0.23,-3.120949,8.492849
2,polyester,103.0,0.288641,0.339024,0.0,0.0,0.0,0.65,1.0,1.0,0.517481,-1.411393
3,spandex,103.0,0.012427,0.006783,0.0,0.01,0.01,0.02,0.02,0.02,-0.341054,-0.81188
4,elasterell,103.0,0.002427,0.014105,0.0,0.0,0.0,0.0,0.09,0.09,5.713861,31.385139
