# **Análise de Dados - Shoply**

## 1. ENTENDIMENTO DOS DADOS

### 1.1 Configuração inicial

In [1]:
# Bibliotecas principais
import pandas as pd              # Manipulação de dados
import numpy as np               # Operações numéricas
import matplotlib.pyplot as plt  # Visualizações básicas
import seaborn as sns            # Visualizações estatísticas
from google.colab import drive   # Upload de arquivos do drive
from datetime import datetime
from datetime import timedelta
from pathlib import Path

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, RocCurveDisplay
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import lightgbm as lgb
import shap

In [2]:
# Display configuration
pd.set_option("display.max_columns", None)               # never truncate columns when a frame is displayed
pd.set_option("display.float_format", "{:,.2f}".format)  # floats shown with thousands separators and 2 decimals
# sns.set() is only an alias of sns.set_theme(); the seaborn docs name set_theme()
# as the preferred interface and note the alias may be removed, so call it directly.
sns.set_theme(style="whitegrid", palette="viridis")      # default look for every plot

### 1.2 Carregamento dos dados

In [4]:
# Attach Google Drive to the Colab filesystem; the data folder is read from under this mount point
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# Folder that holds the input files.
# Kept as a plain string ending in "/" so a file name can be appended to it as-is.
DATA_PATH = "/content/drive/MyDrive/Shoply - Desafio de dados/"

In [6]:
# Load the raw orders table.
# The path is joined with pathlib instead of string concatenation, so the load
# works whether or not DATA_PATH ends with a slash.
orders_raw = pd.read_csv(Path(DATA_PATH) / "orders_dirty_final.csv")

### 1.3 Exploração inicial dos dados

In [7]:
# Size of the raw orders table as (rows, columns)
orders_raw.shape

(501500, 13)

In [8]:
# Preview the first rows of the raw orders table
orders_raw.head()

Unnamed: 0,order_id,customer_id,order_date,order_status,order_value,discount_value,sku_count,order_category,campaign_id,payment_method,delivery_state,delivered_at,estimated_delivery_date
0,1,1,2018-01-21,delivered,44.26,1.78,4,acessorios,,boleto,PR,2018-01-27,2018-01-26
1,2,2,2018-01-28,delivered,82.06,0.0,2,acessorios,,credit_card,RJ,2018-01-30,2018-01-30
2,3,3,2018-01-17,delivered,501.8,18.16,1,eletro,,boleto,CE,2018-01-27,2018-01-27
3,4,4,2018-01-02,delivered,85.76,-3.3,2,acessorios,,credit_card,SP,2018-01-04,2018-01-04
4,5,5,2018-01-29,delivered,43.77,0.0,3,acessorios,,credit_card,SP,2018-01-31,2018-01-31


In [9]:
# Column-level summary: dtype and non-null count for every column, plus memory usage.
# NOTE(review): in the recorded output order_value and sku_count load as object even
# though the preview shows numeric values, and the date columns load as object too —
# presumably dirty entries; confirm before casting.
orders_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501500 entries, 0 to 501499
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   order_id                 501500 non-null  int64  
 1   customer_id              501500 non-null  int64  
 2   order_date               501500 non-null  object 
 3   order_status             501500 non-null  object 
 4   order_value              501500 non-null  object 
 5   discount_value           501498 non-null  float64
 6   sku_count                501303 non-null  object 
 7   order_category           501500 non-null  object 
 8   campaign_id              15408 non-null   object 
 9   payment_method           500492 non-null  object 
 10  delivery_state           501500 non-null  object 
 11  delivered_at             469122 non-null  object 
 12  estimated_delivery_date  501440 non-null  object 
dtypes: float64(1), int64(2), object(10)
memory usage: 49.7+ MB
