In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns                       #visualisation
import matplotlib.pyplot as plt             #visualisation
from IPython.display import display, Markdown, display_html   # Para mostrar bonitas las cosas
%matplotlib inline

In [3]:
"""
Funciones de utilidad para al carga y transformacion del dataframe inicial
"""

# Montaje del archivo
def load_dataset(data_set):
  try:
    # montar el drive
    result = pd.read_csv(data_set, encoding='ISO-8859-1',delimiter=';')
    return result
  except Exception as e:
    # Error
    print(f"An unexpected error occurred: {e}")
    result = None


# Convert Series to Markdown table
def series_to_markdown(series):
  # Agradecimientos: Inteligencia artificial generativa
  markdown = "| Index | Value |\n|-------|-------|\n"
  for idx, value in series.items():
    markdown += f"| {idx} | {value} |\n"
  return markdown

# Columnas como string
def column_items_to_str(data_frame):
  # Convert Index to a formatted Markdown string
  str_cols = str(list(data_frame.columns)).strip("[]")[1:-1]
  return str_cols

# Renombra las columnas del data_frame para que tengan nombres validos
def rename_df_columns(data_frame):
  data_frame.columns = data_frame.columns.str.lower()           # Minusculas
  data_frame.columns = data_frame.columns.str.replace(' ','_')  # ' ' -> '_'
  data_frame.columns = data_frame.columns.str.replace('-','_')  # '-' por '_'

# EDA

## 0. Pasos previos

## 1. Carga y previsualizacion de los datos

> **Dataset**: Superstore Dataset [[link]](https://www.kaggle.com/datasets/vivek468/superstore-dataset-final)

Los atributos el dataset se muestran a continuación:
1. **Row ID**: Unique ID for each row.
2. **Order ID**: ID del pedido para cada cliente.
3. **Order Date**: Fecha del pedido del producto.
4. **Ship Date**:  Fecha de envío del producto
5. **Ship Mode**: Modo de envío especificado por el cliente.
6. **Customer ID**: ID del cliente.
7.  **Customer Name**: Nombre del cliente.
8.  **Segment**: Segmento al que pertenece el cliente.
9.  **Country**: País de residencia del cliente
10. **City**: Ciudad de residencia del cliente.
11. **State**: Estado de residencia del cliente.
12. **Postal Code**:  Código postal de cada cliente.
13. **Region**: Región a la que pertenece el cliente.
14. **Product ID**: ID del producto.
15. **Category**: Categoría del producto pedido.
16. **Sub-Category**: Subcategoría del producto pedido.
17. **Product Name**: Nombre del producto.
18. **Sales**: Ventas del producto.
19. **Quantity**: Cantidad del producto.
20. **Discount**: Descuento proporcionado.
21. **Profit**: Profit.

 - [ ] Funciones que retornen los valores asociados a los datos categoricos.
 - [ ] Producto mas vendido
 - [ ] ...


### 1.1. Carga del dataset

In [4]:
url_data = "https://github.com/tigarto/tarea-fundamentos/raw/main/data_market.csv"
df_market_original = load_dataset(data_set = url_data)

### 1.2. Previsualizacion de los datos

In [5]:
df_market_original.head(2)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2.0,0.0,419.136
1,2,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0,0.0,219.582


### 1.3. Información básica

In [9]:
# Informacion basica del dataframe cargado
display(Markdown("### ----------- Info dataframe -----------"))
# Tamaño
(f,c) = df_market_original.shape
display(Markdown(f"* **Filas**: {f}"))
display(Markdown(f"* **Columnas**: {c}"))

# Columnas
str_cols = column_items_to_str(df_market_original)
markdown_str_cols = Markdown(f"* **Columnas:** {str_cols}")
display(markdown_str_cols)

# Tipos de datos
display(Markdown("### ----------- Tipos de datos -----------"))
display(df_market_original.dtypes)

# Datos faltantes
num_faltantes = df_market_original.isnull().any(axis=1).sum()


# Conclusiones iniciales
markdown_str = "### ----------- Conclusiones -----------\n" + \
               f"- El tipo de dato **Postal Code** debe ser categorico. \n" + \
               f"- La columna **Row ID** puede ser empleada como index del dataframe. \n" + \
               f"- Es necesario cambiar los nombres de las columnas para que sean validos. \n" + \
               f"- Hay {num_faltantes} filas con datos faltantes, de deben analizar para ver si justifica borrarlos."
display(Markdown(markdown_str))

### ----------- Info dataframe -----------

* **Filas**: 9994

* **Columnas**: 21

* **Columnas:** Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode', 'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit

### ----------- Tipos de datos -----------

Row ID             int64
Order ID          object
Order Date        object
Ship Date         object
Ship Mode         object
Customer ID       object
Customer Name     object
Segment           object
Country           object
City              object
State             object
Postal Code        int64
Region            object
Product ID        object
Category          object
Sub-Category      object
Product Name      object
Sales             object
Quantity         float64
Discount         float64
Profit            object
dtype: object

### ----------- Conclusiones -----------
- El tipo de dato **Postal Code** debe ser categorico. 
- La columna **Row ID** puede ser empleada como index del dataframe. 
- Es necesario cambiar los nombres de las columnas para que sean validos. 
- Hay 6 filas con datos faltantes, de deben analizar para ver si justifica borrarlos.

In [11]:
# Datos nulos por columna
df_market_original.isna().sum()

Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            6
Quantity         6
Discount         6
Profit           6
dtype: int64

In [13]:
"""
Dataframe modificado de acuerdo a las conclusiones anteriormente arrojadas
"""
# Renombrado de columnas
df_market = df_market_original.copy()
rename_df_columns(df_market)

# Fijando el nuevo index
df_market.set_index('row_id',inplace= True)

# Cambio del tipo de dato asociado al codigo postal
df_market[['postal_code']] = df_market[['postal_code']].astype('object')

display(Markdown("### ----------- Tipos de datos -----------"))
display(df_market.dtypes)

# Revision de las columnas con datos faltantes
display(Markdown("### ----------- Columnas con datos faltantes -----------"))
display(df_market[df_market.quantity.isna()])
display(Markdown("\n * **Conclusión**: Si justifica eliminar las columnas pues son muy pocos datos del total"))
df_market.dropna(axis=0, inplace=True)

# Informacion despues de hacer los cambios
display(Markdown("### ----------- Actualizacion de la filas -----------"))
display(Markdown(f"\n * **Numero de columnas con datos vacios**: {df_market.isnull().any(axis=1).sum()}"))
display(Markdown(f"\n * **Filas**: {df_market.shape[0]}"))
display(Markdown(f"\n * **Columnas**: {df_market.shape[1]}"))

### ----------- Tipos de datos -----------

order_id          object
order_date        object
ship_date         object
ship_mode         object
customer_id       object
customer_name     object
segment           object
country           object
city              object
state             object
postal_code       object
region            object
product_id        object
category          object
sub_category      object
product_name      object
sales             object
quantity         float64
discount         float64
profit            object
dtype: object

### ----------- Columnas con datos faltantes -----------

Unnamed: 0_level_0,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
182,CA-2014-166191,12/05/2014,12/09/2014,Second Class,DK-13150,David Kendrick,Corporate,United States,Decatur,Illinois,62521,Central,TEC-AC-10004659,Technology,Accessories,Imation Secure+ Hardware Encrypted USB 2.0 Fla...,,,,
431,US-2016-123750,4/15/2016,4/21/2016,Standard Class,RB-19795,Ross Baird,Home Office,United States,Gastonia,North Carolina,28052,South,TEC-AC-10004659,Technology,Accessories,Imation Secure+ Hardware Encrypted USB 2.0 Fla...,,,,
432,US-2016-123750,4/15/2016,4/21/2016,Standard Class,RB-19795,Ross Baird,Home Office,United States,Gastonia,North Carolina,28052,South,TEC-AC-10004659,Technology,Accessories,Imation Secure+ Hardware Encrypted USB 2.0 Fla...,,,,
1407,US-2014-118486,4/06/2014,4/08/2014,First Class,SD-20485,Shirley Daniels,Home Office,United States,Philadelphia,Pennsylvania,19143,East,TEC-AC-10004659,Technology,Accessories,Imation Secure+ Hardware Encrypted USB 2.0 Fla...,,,,
1970,CA-2017-117485,9/23/2017,9/29/2017,Standard Class,BD-11320,Bill Donatelli,Consumer,United States,Tulsa,Oklahoma,74133,Central,TEC-AC-10004659,Technology,Accessories,Imation Secure+ Hardware Encrypted USB 2.0 Fla...,,,,
1972,CA-2017-140242,5/06/2017,5/11/2017,Standard Class,ML-17755,Max Ludwig,Home Office,United States,Chicago,Illinois,60623,Central,TEC-AC-10004659,Technology,Accessories,Imation Secure+ Hardware Encrypted USB 2.0 Fla...,,,,



 * **Conclusión**: Si justifica eliminar las columnas pues son muy pocos datos del total

### ----------- Actualizacion de la filas -----------


 * **Numero de columnas con datos vacios**: 0


 * **Filas**: 9988


 * **Columnas**: 20

In [14]:
# Despliegue del Dataframe actualizado
display(Markdown("### ----------- Dataframe actualizado -----------"))
display(df_market.head(3))
display(Markdown("> <br>**Conclusion**: <br>Podemos decir que el dataframe ya esta listo \
                  para ser actualizado para sacar informacion relevante. El dataframe que\
                  contendra esta informacion se llamara `df_market`"))


### ----------- Dataframe actualizado -----------

Unnamed: 0_level_0,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2.0,0.0,419.136
2,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0,0.0,219.582
3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0,0.0,68.714


> <br>**Conclusion**: <br>Podemos decir que el dataframe ya esta listo                   para ser actualizado para sacar informacion relevante. El dataframe que                  contendra esta informacion se llamara `df_market`

In [15]:
# Guardando el archivo para actualizar
url_data_to_analize = "https://github.com/tigarto/tarea-fundamentos/raw/main/data_market2.csv"
df_market = pd.read_csv(url_data_to_analize)
df_market.set_index('row_id',inplace= True)
df_market.head(2)

Unnamed: 0_level_0,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2.0,0.0,419.136
2,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0,0.0,219.582


## 2. Analisis de los datos

In [65]:

df_market = pd.read_csv('data_market2.csv')
df_market[['postal_code']] = df_market[['postal_code']].astype('object')
df_market.set_index('row_id',inplace= True)
df_market.head(2)

Unnamed: 0_level_0,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2.0,0.0,419.136
2,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0,0.0,219.582


### Tareas

-  [x] Mostrar una tabla con ....
-  [ ] Elemento de lista



In [67]:
# Funciones de utidad para el analisis sobre los datos del dataframe a analizar
# To do...

from pandas.api.types import is_object_dtype, is_numeric_dtype, is_bool_dtype


# Obtiene el numero de items y los items de las columnas categoricas
def get_unique_column_items(df, col_name):
    if is_object_dtype(df[col_name]):
        items = df_market[col_name].unique()
        num_items = df_market[col_name].nunique()
        return [num_items, items.tolist()]
    else:
        return None


[1334,
 ['11/11/2016',
  '6/16/2016',
  '10/18/2015',
  '6/14/2014',
  '4/20/2017',
  '12/10/2016',
  '11/26/2015',
  '11/18/2014',
  '5/15/2014',
  '9/01/2014',
  '12/13/2016',
  '7/18/2017',
  '9/30/2015',
  '1/20/2016',
  '9/21/2015',
  '10/23/2017',
  '12/31/2015',
  '9/15/2017',
  '7/22/2016',
  '9/23/2017',
  '3/13/2016',
  '10/25/2014',
  '6/25/2016',
  '4/22/2015',
  '12/17/2016',
  '6/18/2016',
  '11/30/2015',
  '5/05/2015',
  '12/10/2014',
  '6/06/2016',
  '9/23/2016',
  '9/17/2017',
  '5/02/2015',
  '12/11/2017',
  '12/01/2014',
  '6/15/2016',
  '10/16/2014',
  '9/08/2015',
  '11/16/2017',
  '5/30/2017',
  '11/02/2017',
  '4/10/2016',
  '9/22/2016',
  '2/05/2015',
  '11/12/2017',
  '11/11/2017',
  '6/20/2017',
  '9/11/2016',
  '9/02/2016',
  '12/04/2016',
  '11/17/2015',
  '11/28/2017',
  '10/20/2015',
  '12/30/2017',
  '11/10/2016',
  '8/27/2014',
  '3/06/2015',
  '4/10/2015',
  '12/28/2014',
  '9/25/2014',
  '2/05/2017',
  '10/19/2016',
  '9/07/2016',
  '12/27/2017',
  '9/

## Funciones de resumen de los datos limpiados 

### Forma del dataframe

In [30]:
# Forma del dataframe
df_market.head(2)

Unnamed: 0_level_0,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2.0,0.0,419.136
2,CA-2016-152156,11/08/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0,0.0,219.582


### Informacion 

In [31]:
# Informacion sobre el dataframe
df_market.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9988 entries, 1 to 9994
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       9988 non-null   object 
 1   order_date     9988 non-null   object 
 2   ship_date      9988 non-null   object 
 3   ship_mode      9988 non-null   object 
 4   customer_id    9988 non-null   object 
 5   customer_name  9988 non-null   object 
 6   segment        9988 non-null   object 
 7   country        9988 non-null   object 
 8   city           9988 non-null   object 
 9   state          9988 non-null   object 
 10  postal_code    9988 non-null   object 
 11  region         9988 non-null   object 
 12  product_id     9988 non-null   object 
 13  category       9988 non-null   object 
 14  sub_category   9988 non-null   object 
 15  product_name   9988 non-null   object 
 16  sales          9988 non-null   object 
 17  quantity       9988 non-null   float64
 18  discount

### Columnas

In [35]:
df_market.columns

Index(['order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_id',
       'customer_name', 'segment', 'country', 'city', 'state', 'postal_code',
       'region', 'product_id', 'category', 'sub_category', 'product_name',
       'sales', 'quantity', 'discount', 'profit'],
      dtype='object')

### Columnas con datos numericos

In [37]:
df_market.describe()

Unnamed: 0,quantity,discount
count,9988.0,9988.0
mean,3.788646,0.156196
std,2.224976,0.206506
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.2
75%,5.0,0.2
max,14.0,0.8


### Columnas con datos categoricos

In [38]:
df_market.describe(include = 'object')

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub_category,product_name,sales,profit
count,9988,9988,9988,9988,9988,9988,9988,9988,9988,9988,9988,9988,9988,9988,9988,9988,9988.0,9988
unique,5008,1237,1334,4,793,793,3,1,531,49,631,4,1861,3,17,1849,5822.0,7172
top,CA-2017-100111,9/05/2016,12/16/2015,Standard Class,WB-21850,William Brown,Consumer,United States,New York City,California,10035,West,OFF-PA-10001970,Office Supplies,Binders,Staple envelope,12.96,0
freq,14,38,35,5964,37,37,5190,9988,915,2001,263,3203,19,6026,1523,48,56.0,65


ship mode (**Hacer una funcion**)

In [71]:
# Informacion columnas

cat_cols = df_market.select_dtypes(include=['object']).columns.tolist()

for col in cat_cols:
    (n_items, items) = get_column_items(df_market, col)
    display(Markdown(f"* **{col}**: {n_items}"))



* **order_id**: 5008

* **order_date**: 1237

* **ship_date**: 1334

* **ship_mode**: 4

* **customer_id**: 793

* **customer_name**: 793

* **segment**: 3

* **country**: 1

* **city**: 531

* **state**: 49

* **postal_code**: 631

* **region**: 4

* **product_id**: 1861

* **category**: 3

* **sub_category**: 17

* **product_name**: 1849

* **sales**: 5822

* **profit**: 7172

In [74]:
# 
df_market.state.unique()

array(['Kentucky', 'California', 'Florida', 'North Carolina',
       'Washington', 'Texas', 'Wisconsin', 'Utah', 'Nebraska',
       'Pennsylvania', 'Illinois', 'Minnesota', 'Michigan', 'Delaware',
       'Indiana', 'New York', 'Arizona', 'Virginia', 'Tennessee',
       'Alabama', 'South Carolina', 'Oregon', 'Colorado', 'Iowa', 'Ohio',
       'Missouri', 'Oklahoma', 'New Mexico', 'Louisiana', 'Connecticut',
       'New Jersey', 'Massachusetts', 'Georgia', 'Nevada', 'Rhode Island',
       'Mississippi', 'Arkansas', 'Montana', 'New Hampshire', 'Maryland',
       'District of Columbia', 'Kansas', 'Vermont', 'Maine',
       'South Dakota', 'Idaho', 'North Dakota', 'Wyoming',
       'West Virginia'], dtype=object)

In [75]:
# customer_id
df_market.groupby('state')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020290306E80>

In [79]:
type(df_market.sales.sum())

str

In [77]:
df_market.groupby('state')[['sales']].sum()

Unnamed: 0_level_0,sales
state,Unnamed: 1_level_1
Alabama,208.1616.74979.9522.7512.48152.767.271819.8670...
Arizona,1.113.024167.9682.388243.992157.92203.1848.161...
Arkansas,105.426.24699.9322.9638.66.6323.341067.9422.28...
California,14.6248.867.28907.15218.504114.91.706.184911.4...
Colorado,238.896102.3636.882218.752.6300.416230.352218....
Connecticut,7.1623.27.36104.791043.9222.211.34552.5627.462...
Delaware,4521.847.0430.84226.56115.0268.049.9426.187.32...
District of Columbia,22.741267.531379.9219.449.6412.741.3740.0837.6...
Florida,9.575.77522.36895.616233.866.206.1455.328258.0...
Georgia,647.8420.72.748.3446.746354.95392.9412.39279.9...


In [23]:
# customer_name
df_market.customer_name.nunique()

793

In [24]:
# order_date
df_market[['order_date']].nunique()

order_date    1237
dtype: int64

In [None]:
# ship_date
df_market[['ship_date']].nunique()

In [None]:
# segment
df_market[['segment']].nunique()

In [None]:
df_market.segment.unique()

In [None]:
# country
df_market.country.nunique()

In [None]:
df_market.country.unique()

In [None]:
df_market.state.unique()

In [None]:
# city
df_market.city.nunique()

In [None]:
df_market.region.unique()

In [None]:
# product_id
df_market.product_id.nunique()

In [None]:
# category
df_market.category.unique()

In [None]:
# sub_category
df_market.sub_category.unique()

In [None]:
# product_name
df_market.product_name.unique()

In [None]:
# product_id
df_market.product_id.unique()

## Conclusiones del trabajo

* Conclusion 1
* Conclusion 2
* Conclusion 3
* ...

## Referencias consultadas

