In [9]:
# required libraries
import os
from pathlib import Path

import pandas as pd

# Directory holding the landing-zone CSV exports. It defaults to the location the
# notebook was originally written against and can be overridden with the
# LANDING_DIR environment variable, so the notebook is not tied to one home directory.
LANDING_DIR = Path(os.environ.get(
    'LANDING_DIR',
    '/home/robson/repositorios/bike-manufacturing-company/data-lake/landing',
))

# full path of the sales order header export, kept as a plain string
path = str(LANDING_DIR / 'Sales.SalesOrderHeader.csv')

# the file is semicolon-separated; read_csv already returns a DataFrame
# NOTE(review): df.dtypes further down reports SubTotal, TaxAmt, Freight and TotalDue
# as object rather than float; presumably the file uses a decimal comma. Confirm, and
# if so consider passing decimal=',' here (the stored outputs would then need a re-run).
loaded_csv = pd.read_csv(path, sep=';')

# working dataframe; no extra pd.DataFrame(...) wrapper is needed around read_csv's result
df = loaded_csv

#### Análise estrutural

In [10]:
# check the dataframe's dimensions as a (rows, columns) tuple
df.shape

(31465, 26)

In [11]:
# inspect the dtype pandas inferred for each column
df.dtypes

SalesOrderID                int64
RevisionNumber              int64
OrderDate                  object
DueDate                    object
ShipDate                   object
Status                      int64
OnlineOrderFlag             int64
SalesOrderNumber           object
PurchaseOrderNumber        object
AccountNumber              object
CustomerID                  int64
SalesPersonID             float64
TerritoryID                 int64
BillToAddressID             int64
ShipToAddressID             int64
ShipMethodID                int64
CreditCardID              float64
CreditCardApprovalCode     object
CurrencyRateID            float64
SubTotal                   object
TaxAmt                     object
Freight                    object
TotalDue                   object
Comment                   float64
rowguid                    object
ModifiedDate               object
dtype: object

In [12]:
# summarise every non-numeric column: count, number of unique values,
# most frequent value (top) and how often it occurs (freq)
df.describe(exclude='number')

Unnamed: 0,OrderDate,DueDate,ShipDate,SalesOrderNumber,PurchaseOrderNumber,AccountNumber,CreditCardApprovalCode,SubTotal,TaxAmt,Freight,TotalDue,rowguid,ModifiedDate
count,31465,31465,31465,31465,3806,31465,30334,31465,31465,31465,31465,31465,31465
unique,1124,1124,1124,31465,3806,19119,30334,4747,4745,4744,4754,31465,1124
top,2014-03-31 00:00:00.000,2014-04-12 00:00:00.000,2014-04-07 00:00:00.000,SO43659,PO522145787,10-4030-011176,105041Vi84182,357827,2862616,894568,39539884,79B65321-39CA-4115-9CBA-8FE0903E12E6,2014-04-07 00:00:00.000
freq,271,273,273,1,1,28,1,1551,1551,1551,1551,1,273


In [13]:
# summary statistics of the numeric columns: count, mean, std, min, quartiles and max
# (this call does not report unique values)
df.describe(include='number')

Unnamed: 0,SalesOrderID,RevisionNumber,Status,OnlineOrderFlag,CustomerID,SalesPersonID,TerritoryID,BillToAddressID,ShipToAddressID,ShipMethodID,CreditCardID,CurrencyRateID,Comment
count,31465.0,31465.0,31465.0,31465.0,31465.0,3806.0,31465.0,31465.0,31465.0,31465.0,30334.0,13976.0,0.0
mean,59391.0,8.000953,5.0,0.87904,20170.175687,280.607987,6.090768,18263.154426,18249.192563,1.483839,9684.100448,9191.499571,
std,9083.307446,0.030864,0.0,0.326086,6261.72896,4.846965,2.958119,8210.069158,8218.429263,1.304343,5566.299591,2945.170095,
min,43659.0,8.0,5.0,0.0,11000.0,274.0,1.0,405.0,9.0,1.0,1.0,2.0,
25%,51525.0,8.0,5.0,1.0,14432.0,277.0,4.0,14080.0,14063.0,1.0,4894.25,8510.0,
50%,59391.0,8.0,5.0,1.0,19452.0,279.0,6.0,19449.0,19438.0,1.0,9719.5,10074.0,
75%,67257.0,8.0,5.0,1.0,25994.0,284.0,9.0,24678.0,24672.0,1.0,14510.75,11282.0,
max,75123.0,9.0,5.0,1.0,30118.0,290.0,10.0,29883.0,29883.0,5.0,19237.0,12431.0,


#### Análise qualitativa

In [14]:
# count the missing (null/NaN) entries in each column
df.isna().sum()

SalesOrderID                  0
RevisionNumber                0
OrderDate                     0
DueDate                       0
ShipDate                      0
Status                        0
OnlineOrderFlag               0
SalesOrderNumber              0
PurchaseOrderNumber       27659
AccountNumber                 0
CustomerID                    0
SalesPersonID             27659
TerritoryID                   0
BillToAddressID               0
ShipToAddressID               0
ShipMethodID                  0
CreditCardID               1131
CreditCardApprovalCode     1131
CurrencyRateID            17489
SubTotal                      0
TaxAmt                        0
Freight                       0
TotalDue                      0
Comment                   31465
rowguid                       0
ModifiedDate                  0
dtype: int64

### Conclusões:

1- Coluna "Comment" com todos os campos vazios. Logo, será excluída.

2- As colunas "CreditCardID" e "CreditCardApprovalCode" podem conter informações sensíveis; portanto, ambas serão excluídas. As outras colunas com dados nulos ("PurchaseOrderNumber", "SalesPersonID" e "CurrencyRateID") serão preenchidas com o valor "uninformed".

3- As colunas do tipo "int" que não contêm dados quantitativos terão o tipo convertido para "string".

4- As colunas do tipo "object" que contêm datas ("OrderDate", "DueDate", "ShipDate" e "ModifiedDate") serão convertidas para o tipo "datetime".