## Reshaping con objetos de DataFrame

In [4]:
import numpy as np
import pandas as pd
import pandas._testing as tm

In [5]:
def unpivot(frame):
    """Reshape a wide DataFrame into long (date, variable, value) records.

    Every cell of ``frame`` becomes one output row: ``date`` holds the row
    label, ``variable`` the column label and ``value`` the cell content.
    Rows are emitted column by column (all rows of the first column, then
    all rows of the second, and so on).

    Parameters
    ----------
    frame : pandas.DataFrame
        Wide table to reshape.

    Returns
    -------
    pandas.DataFrame
        Long table with columns ``["date", "variable", "value"]``.
    """
    n_rows, n_cols = frame.shape
    # Column-major ("F") flattening walks the values one column at a time,
    # which matches the repeat/tile layout of the two label arrays below.
    values = frame.to_numpy().ravel("F")
    variables = np.asarray(frame.columns).repeat(n_rows)
    dates = np.tile(np.asarray(frame.index), n_cols)
    return pd.DataFrame(
        {"date": dates, "variable": variables, "value": values},
        columns=["date", "variable", "value"],
    )


In [6]:
# Build the wide input explicitly instead of relying on the private helper
# pandas._testing.makeTimeDataFrame, which is not available in recent pandas
# releases: 3 business days starting 2000-01-03, columns A-D, random values.
df = unpivot(
    pd.DataFrame(
        np.random.randn(3, 4),
        index=pd.date_range("2000-01-03", periods=3, freq="B"),
        columns=list("ABCD"),
    )
)

In [7]:
df

Unnamed: 0,date,variable,value
0,2000-01-03,A,0.30346
1,2000-01-04,A,0.223136
2,2000-01-05,A,0.536628
3,2000-01-03,B,-0.964065
4,2000-01-04,B,0.59861
5,2000-01-05,B,1.082319
6,2000-01-03,C,0.98364
7,2000-01-04,C,-0.896869
8,2000-01-05,C,0.148936
9,2000-01-03,D,0.232305


In [8]:
# Para seleccionar todo para la variable A, podríamos hacer
filtered = df[df["variable"] == "A"]

In [9]:
filtered

Unnamed: 0,date,variable,value
0,2000-01-03,A,0.30346
1,2000-01-04,A,0.223136
2,2000-01-05,A,0.536628


### DataFrame.pivot()

#### Para remodelar los datos en esta forma, usamos el método DataFrame.pivot() (también implementado como una función de nivel superior, pivot()).

In [12]:
# Cuando transformamos un data frame a tabla dinámica debemos definir de dónde proviene la información:
# index
# columns
# values

pivoted = df.pivot(index="date", columns="variable", values="value")

In [11]:
pivoted

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,0.30346,-0.964065,0.98364,0.232305
2000-01-04,0.223136,0.59861,-0.896869,-0.026508
2000-01-05,0.536628,1.082319,0.148936,-0.945854


In [13]:
# Si no definen los valores, automáticamente será creada una tabla jerárquica
df["value2"] = df["value"] * 2

In [14]:
pivoted = df.pivot(index="date", columns="variable")

In [15]:
pivoted

Unnamed: 0_level_0,value,value,value,value,value2,value2,value2,value2
variable,A,B,C,D,A,B,C,D
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2000-01-03,0.30346,-0.964065,0.98364,0.232305,0.60692,-1.92813,1.967281,0.46461
2000-01-04,0.223136,0.59861,-0.896869,-0.026508,0.446272,1.197219,-1.793739,-0.053016
2000-01-05,0.536628,1.082319,0.148936,-0.945854,1.073256,2.164638,0.297872,-1.891707


In [16]:
# Después de aplicar pivot a la tabla, podremos hacer selecciones como de costumbre
pivoted["value2"]

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,0.60692,-1.92813,1.967281,0.46461
2000-01-04,0.446272,1.197219,-1.793739,-0.053016
2000-01-05,1.073256,2.164638,0.297872,-1.891707


## Reshaping apilando y desapilando (by stacking and unstacking) - stack() and unstack() 

#### Estrechamente relacionados con el método pivot() están los métodos relacionados stack() y unstack() disponibles en Series y DataFrame. Estos métodos están diseñados para funcionar junto con objetos MultiIndex.

#### stack(): "pivot" un nivel de las etiquetas de columna (posiblemente jerárquicas), devolviendo un DataFrame con un índice con un nuevo nivel más interno de etiquetas de fila.

#### unstack(): (operación inversa de stack()) "pivot" un nivel del índice de fila (posiblemente jerárquico) al eje de la columna, produciendo un DataFrame remodelado con un nuevo nivel más interno de etiquetas de columna

In [17]:
# (first, second) label pairs: every outer label combined with "one"/"two",
# in the same order as zipping the two parallel label lists.
tuples = [
    (outer, inner)
    for outer in ("bar", "baz", "foo", "qux")
    for inner in ("one", "two")
]

In [18]:
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

In [19]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])

In [22]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.852334,-1.263781
bar,two,-0.015823,0.192393
baz,one,0.870592,1.213986
baz,two,0.320067,1.052152
foo,one,1.137447,1.382694
foo,two,1.316248,-0.235494
qux,one,0.224005,-0.126731
qux,two,1.031316,-0.514244


In [20]:
df2 = df[:4]

In [21]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.852334,-1.263781
bar,two,-0.015823,0.192393
baz,one,0.870592,1.213986
baz,two,0.320067,1.052152


In [23]:
# La función stack() "comprime" un nivel en las columnas de DataFrame
# Si las columnas tienen un índice múltiple, puede elegir qué nivel apilar. 
# El nivel apilado se convierte en el nuevo nivel más interno del índice de filas (MultiIndex)
stacked = df2.stack()

In [24]:
stacked

first  second   
bar    one     A   -0.852334
               B   -1.263781
       two     A   -0.015823
               B    0.192393
baz    one     A    0.870592
               B    1.213986
       two     A    0.320067
               B    1.052152
dtype: float64

In [25]:
# Para descomprimir o volver al estado original utilizamos unstack()
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.852334,-1.263781
bar,two,-0.015823,0.192393
baz,one,0.870592,1.213986
baz,two,0.320067,1.052152


In [26]:
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.852334,-0.015823
bar,B,-1.263781,0.192393
baz,A,0.870592,0.320067
baz,B,1.213986,1.052152


In [27]:
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.852334,0.870592
one,B,-1.263781,1.213986
two,A,-0.015823,0.320067
two,B,0.192393,1.052152


In [28]:
# Si los índices tienen nombres, podemos utilizarlos para aplicar la función
stacked.unstack("second")

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.852334,-0.015823
bar,B,-1.263781,0.192393
baz,A,0.870592,0.320067
baz,B,1.213986,1.052152


## Apilar y desapilar con Múltiples Niveles

#### También puede apilar o desapilar más de un nivel a la vez pasando una lista de niveles, en cuyo caso el resultado final es como si cada nivel de la lista se procesara individualmente.

In [29]:
# Three-level column index (exp / animal / hair_length), given one list of
# labels per level instead of one tuple per column.
columns = pd.MultiIndex.from_arrays(
    [
        ["A", "B", "A", "B"],
        ["cat", "cat", "dog", "dog"],
        ["long", "long", "short", "short"],
    ],
    names=["exp", "animal", "hair_length"],
)

In [30]:
df = pd.DataFrame(np.random.randn(4, 4), columns=columns)

In [31]:
df

exp,A,B,A,B
animal,cat,cat,dog,dog
hair_length,long,long,short,short
0,0.445209,-0.1115,2.434675,-0.259929
1,1.111994,1.519075,1.113558,-1.55858
2,-1.010088,-0.99328,-0.843242,0.078998
3,0.480633,2.198066,0.01942,0.698096


In [32]:
df.stack(level=["animal", "hair_length"])

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
Unnamed: 0_level_1,animal,hair_length,Unnamed: 3_level_1,Unnamed: 4_level_1
0,cat,long,0.445209,-0.1115
0,dog,short,2.434675,-0.259929
1,cat,long,1.111994,1.519075
1,dog,short,1.113558,-1.55858
2,cat,long,-1.010088,-0.99328
2,dog,short,-0.843242,0.078998
3,cat,long,0.480633,2.198066
3,dog,short,0.01942,0.698096


In [33]:
# También podemos identificar los niveles por valores numéricos
df.stack(level=[1, 2])

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
Unnamed: 0_level_1,animal,hair_length,Unnamed: 3_level_1,Unnamed: 4_level_1
0,cat,long,0.445209,-0.1115
0,dog,short,2.434675,-0.259929
1,cat,long,1.111994,1.519075
1,dog,short,1.113558,-1.55858
2,cat,long,-1.010088,-0.99328
2,dog,short,-0.843242,0.078998
3,cat,long,0.480633,2.198066
3,dog,short,0.01942,0.698096


## Reshaping con melt - Convertir columnas a filas

La función de nivel superior **melt()** y el método correspondiente **DataFrame.melt()** son útiles para transformar un DataFrame a un formato en el que una o más columnas son variables identificadoras, mientras que todas las demás columnas, consideradas variables medidas, se "despivotan" hacia el eje de filas, dejando solo dos columnas que no son identificadoras: "variable" y "value". Los nombres de esas columnas se pueden personalizar mediante los parámetros var_name y value_name.

In [34]:
cheese = pd.DataFrame(
    {
        "first": ["John", "Mary"],
        "last": ["Doe", "Bo"],
        "height": [5.5, 6.0],
        "weight": [130, 150],
    }
)

In [35]:
cheese

Unnamed: 0,first,last,height,weight
0,John,Doe,5.5,130
1,Mary,Bo,6.0,150


In [36]:
# Ahora aplicaremos melt(): "first" y "last" se conservan como columnas identificadoras y las demás ("height", "weight") se despivotan en las columnas "variable" y "value"
cheese.melt(id_vars=["first", "last"])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [37]:
cheese.melt(id_vars=["first", "last"], var_name="quantity")

Unnamed: 0,first,last,quantity,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [41]:
# Veamos ahora un ejemplo, con un multiindice
index = pd.MultiIndex.from_tuples([("person", "A"), ("person", "B")])

In [42]:
cheese = pd.DataFrame(
    {
        "first": ["John", "Mary"],
        "last": ["Doe", "Bo"],
        "height": [5.5, 6.0],
        "weight": [130, 150],
    },
    index=index,
)

In [43]:
cheese

Unnamed: 0,Unnamed: 1,first,last,height,weight
person,A,John,Doe,5.5,130
person,B,Mary,Bo,6.0,150


In [44]:
cheese.melt(id_vars=["first", "last"])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [45]:
# Para conservar los índices originales, debemos usar el argumento "ignore_index=False"
cheese.melt(id_vars=["first", "last"], ignore_index=False)

Unnamed: 0,Unnamed: 1,first,last,variable,value
person,A,John,Doe,height,5.5
person,B,Mary,Bo,height,6.0
person,A,John,Doe,weight,130.0
person,B,Mary,Bo,weight,150.0


### wide_to_long() similar a melt()

In [46]:
dft = pd.DataFrame(
    {
        "A1970": {0: "a", 1: "b", 2: "c"},
        "A1980": {0: "d", 1: "e", 2: "f"},
        "B1970": {0: 2.5, 1: 1.2, 2: 0.7},
        "B1980": {0: 3.2, 1: 1.3, 2: 0.1},
        "X": dict(zip(range(3), np.random.randn(3))),
    }
)

In [47]:
dft["id"] = dft.index

In [48]:
dft

Unnamed: 0,A1970,A1980,B1970,B1980,X,id
0,a,d,2.5,3.2,0.19594,0
1,b,e,1.2,1.3,0.588591,1
2,c,f,0.7,0.1,0.150961,2


In [49]:
pd.wide_to_long(dft, ["A", "B"], i="id", j="year")

Unnamed: 0_level_0,Unnamed: 1_level_0,X,A,B
id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1970,0.19594,a,2.5
1,1970,0.588591,b,1.2
2,1970,0.150961,c,0.7
0,1980,0.19594,d,3.2
1,1980,0.588591,e,1.3
2,1980,0.150961,f,0.1


## Combinando con estadísticas y GroupBy

#### No debería sorprender que la combinación de pivot() / stack() / unstack() con GroupBy y las funciones estadísticas básicas de Series y DataFrame pueda producir algunas manipulaciones de datos muy expresivas y rápidas.

In [54]:
columns = pd.MultiIndex.from_tuples(
    [
        ("A", "cat"),
        ("B", "dog"),
        ("B", "cat"),
        ("A", "dog"),
    ],
    names=["exp", "animal"],
)

In [56]:
index = pd.MultiIndex.from_product(
    [("bar", "baz", "foo", "qux"), ("one", "two")], names=["first", "second"]
)

In [58]:
df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns)

In [59]:
df

Unnamed: 0_level_0,exp,A,B,B,A
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,0.341072,-0.613694,-0.458615,1.400419
bar,two,2.228768,-0.137789,0.339085,-1.807528
baz,one,0.354889,-0.143549,0.590079,0.701782
baz,two,-0.28601,0.093535,1.459081,-0.34441
foo,one,0.513846,-1.427431,-0.691387,-1.117198
foo,two,-0.768869,0.007087,-0.324393,-0.202964
qux,one,-1.826545,0.811314,1.138981,1.897945
qux,two,-1.208721,0.181832,0.256169,1.643016


In [60]:
# Podemos ver, para cada fila, la media por animal (promediando los valores de exp)
df.stack().mean(1).unstack()

Unnamed: 0_level_0,animal,cat,dog
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.058772,0.393363
bar,two,1.283926,-0.972658
baz,one,0.472484,0.279116
baz,two,0.586535,-0.125437
foo,one,-0.088771,-1.272315
foo,two,-0.546631,-0.097939
qux,one,-0.343782,1.35463
qux,two,-0.476276,0.912424


In [61]:
# Average the columns that share the same "animal" label (column level 1).
# The grouping runs on the transposed frame because groupby(axis=1) is
# deprecated in recent pandas; transposing back restores the original layout.
df.T.groupby(level=1).mean().T

Unnamed: 0_level_0,animal,cat,dog
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.058772,0.393363
bar,two,1.283926,-0.972658
baz,one,0.472484,0.279116
baz,two,0.586535,-0.125437
foo,one,-0.088771,-1.272315
foo,two,-0.546631,-0.097939
qux,one,-0.343782,1.35463
qux,two,-0.476276,0.912424


In [62]:
df.stack().groupby(level=1).mean()

exp,A,B
second,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0.283276,-0.099288
two,-0.09334,0.234326


In [63]:
df.mean().unstack(0)

exp,A,B
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,-0.081446,0.288625
dog,0.271383,-0.153587


## Tablas dinámicas - Pivot Tables - pivot_table() 

#### La función pivot_table() se puede usar para crear tablas dinámicas al estilo de una hoja de cálculo. Tiene los siguientes argumentos

#### 1. data: a DataFrame object.
#### 2. values: a column or a list of columns to aggregate.
#### 3. index: a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table index.
#### 4. columns: a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table column.
#### 5. aggfunc: function to use for aggregation, defaulting to numpy.mean.

In [64]:
import datetime

In [65]:
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 6,
        "B": ["A", "B", "C"] * 8,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
        "D": np.random.randn(24),
        "E": np.random.randn(24),
        "F": [datetime.datetime(2013, i, 1) for i in range(1, 13)]
        + [datetime.datetime(2013, i, 15) for i in range(1, 13)],
    }
)

In [67]:
df.head(10)

Unnamed: 0,A,B,C,D,E,F
0,one,A,foo,0.334915,-1.473241,2013-01-01
1,one,B,foo,-0.89928,-0.485105,2013-02-01
2,two,C,foo,1.199145,-0.765235,2013-03-01
3,three,A,bar,-0.150724,0.569081,2013-04-01
4,one,B,bar,1.655458,-0.273584,2013-05-01
5,one,C,bar,-0.104854,-0.4004,2013-06-01
6,two,A,foo,1.650828,-1.456718,2013-07-01
7,three,B,foo,-1.561268,3.919242,2013-08-01
8,one,C,foo,0.220828,0.588927,2013-09-01
9,one,A,bar,-0.764537,-0.648978,2013-10-01


In [68]:
# Vamos a crear una tabla dinámica
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.253236,0.826218
one,B,0.769927,-0.848733
one,C,-0.823565,0.385051
three,A,0.346411,
three,B,,-0.598831
three,C,0.872067,
two,A,,0.640759
two,B,-0.279023,
two,C,,-0.129406


In [69]:
# Sum of D for each B row and (A, C) column pair. The aggregation is named
# by string: passing the NumPy callable np.sum is deprecated in recent pandas,
# while "sum" selects the same pandas aggregation on every version.
pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc="sum")

A,one,one,three,three,two,two
C,bar,foo,bar,foo,bar,foo
B,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,-0.506472,1.652436,0.692821,,,1.281518
B,1.539855,-1.697466,,-1.197661,-0.558045,
C,-1.647131,0.770101,1.744135,,,-0.258813


In [70]:
# Same pivot as above but aggregating both D and E. "sum" is passed as a
# string because NumPy callables such as np.sum are deprecated as aggfunc in
# recent pandas; the string selects the same pandas aggregation.
pd.pivot_table(
    df,
    values=["D", "E"],
    index=["B"],
    columns=["A", "C"],
    aggfunc="sum",
)

Unnamed: 0_level_0,D,D,D,D,D,D,E,E,E,E,E,E
A,one,one,three,three,two,two,one,one,three,three,two,two
C,bar,foo,bar,foo,bar,foo,bar,foo,bar,foo,bar,foo
B,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
A,-0.506472,1.652436,0.692821,,,1.281518,-2.22374,0.145691,-0.489828,,,0.204539
B,1.539855,-1.697466,,-1.197661,-0.558045,,1.281528,1.245934,,4.425121,0.938225,
C,-1.647131,0.770101,1.744135,,,-0.258813,1.354066,0.3841,-0.065588,,,-0.626744


In [71]:
pd.pivot_table(df[["A", "B", "C", "D", "E"]], index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,E,E
Unnamed: 0_level_1,C,bar,foo,bar,foo
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
one,A,-0.253236,0.826218,-1.11187,0.072846
one,B,0.769927,-0.848733,0.640764,0.622967
one,C,-0.823565,0.385051,0.677033,0.19205
three,A,0.346411,,-0.244914,
three,B,,-0.598831,,2.21256
three,C,0.872067,,-0.032794,
two,A,,0.640759,,0.10227
two,B,-0.279023,,0.469112,
two,C,,-0.129406,,-0.313372


In [73]:
# Además, puede usar Grouper para palabras clave de índice y columnas
# Group the F dates into month-end bins. The frequency is given as an offset
# object rather than the "M" string alias, which is deprecated in recent
# pandas; the offset object is accepted by old and new versions alike.
pd.pivot_table(
    df,
    values="D",
    index=pd.Grouper(freq=pd.offsets.MonthEnd(), key="F"),
    columns="C",
)

C,bar,foo
F,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-31,,0.826218
2013-02-28,,-0.848733
2013-03-31,,-0.129406
2013-04-30,0.346411,
2013-05-31,0.769927,
2013-06-30,-0.823565,
2013-07-31,,0.640759
2013-08-31,,-0.598831
2013-09-30,,0.385051
2013-10-31,-0.253236,


In [74]:
# Standard deviation of D and E per (A, B) row and C column; margins=True
# adds the "All" row and column. "std" is passed as a string: np.std as
# aggfunc is deprecated in recent pandas, and the string keeps the pandas
# (sample, ddof=1) standard deviation that the callable currently maps to.
table = df.pivot_table(
    index=["A", "B"],
    columns="C",
    values=["D", "E"],
    margins=True,
    aggfunc="std",
)

In [75]:
table

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,D,E,E,E
Unnamed: 0_level_1,C,bar,foo,All,bar,foo,All
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
one,A,0.723088,0.694807,0.850654,0.654628,2.186497,1.484684
one,B,1.25233,0.071485,1.1823,1.293083,1.567051,1.173034
one,C,1.016412,0.232246,0.921554,1.52372,0.561268,0.978426
three,A,0.703055,,0.703055,1.151163,,1.151163
three,B,,1.361092,1.361092,,2.413613,2.413613
three,C,1.529483,,1.529483,0.332196,,0.332196
two,A,,1.428453,1.428453,,2.204741,2.204741
two,B,1.262802,,1.262802,0.757354,,0.757354
two,C,,1.878855,1.878855,,0.63903,0.63903
All,,1.044737,1.070856,1.013278,1.006862,1.55891,1.273425


In [78]:
table.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,D,E
A,B,C,Unnamed: 3_level_1,Unnamed: 4_level_1
one,A,All,0.850654,1.484684
one,A,bar,0.723088,0.654628
one,A,foo,0.694807,2.186497
one,B,All,1.1823,1.173034
one,B,bar,1.25233,1.293083
one,B,foo,0.071485,1.567051
one,C,All,0.921554,0.978426
one,C,bar,1.016412,1.52372
one,C,foo,0.232246,0.561268
three,A,All,0.703055,1.151163


## Normalización

#### Las tablas de frecuencia también se pueden normalizar para mostrar porcentajes en lugar de recuentos mediante el argumento normalize de crosstab()

In [79]:
pd.crosstab(df["A"], df["B"], normalize=True)

B,A,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0.166667,0.166667,0.166667
three,0.083333,0.083333,0.083333
two,0.083333,0.083333,0.083333


In [80]:
pd.crosstab(df["A"], df["B"], normalize="columns")

B,A,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0.5,0.5,0.5
three,0.25,0.25,0.25
two,0.25,0.25,0.25


In [81]:
# Cross-tabulate A against B, aggregating the C strings of each cell with
# "sum" (string concatenation for text values). The aggregation is named by
# string because np.sum as aggfunc is deprecated in recent pandas.
pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc="sum")

B,A,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,foobarfoobar,foobarfoobar,barfoobarfoo
three,barbar,foofoo,barbar
two,foofoo,barbar,foofoo


## Tiling - función cut()

#### La función cut() calcula agrupaciones para los valores de la matriz de entrada y, a menudo, se usa para transformar variables continuas en variables discretas o categóricas

In [82]:
ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])

In [83]:
pd.cut(ages, bins=3)

[(9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (26.667, 43.333], (43.333, 60.0], (43.333, 60.0]]
Categories (3, interval[float64, right]): [(9.95, 26.667] < (26.667, 43.333] < (43.333, 60.0]]

In [84]:
c = pd.cut(ages, bins=[0, 18, 35, 70])

In [85]:
c

[(0, 18], (0, 18], (0, 18], (0, 18], (18, 35], (18, 35], (18, 35], (35, 70], (35, 70]]
Categories (3, interval[int64, right]): [(0, 18] < (18, 35] < (35, 70]]

## Cálculo de variables indicadoras / ficticias - Pasar variables categóricas a 0 y 1 con get_dummies()

#### Para convertir una variable categórica en un DataFrame "ficticio" o "indicador", por ejemplo, una columna en un DataFrame (una Serie) que tiene k valores distintos, puede derivar un DataFrame que contiene k columnas de 1 y 0 usando get_dummies()

In [86]:
df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)})

In [87]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [88]:
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [89]:
# En algunas ocasiones, es recomendable utilizar un prefijo
dummies = pd.get_dummies(df["key"], prefix="key")

In [90]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [91]:
df[["data1"]].join(dummies)

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [93]:
# Ejemplo con Data Frame
df = pd.DataFrame({"A": ["a", "b", "a"], "B": ["c", "c", "b"], "C": [1, 2, 3]})

In [94]:
df

Unnamed: 0,A,B,C
0,a,c,1
1,b,c,2
2,a,b,3


In [95]:
pd.get_dummies(df)

Unnamed: 0,C,A_a,A_b,B_b,B_c
0,1,1,0,0,1
1,2,0,1,0,1
2,3,1,0,1,0


In [96]:
df = pd.DataFrame({"A": list("aaaaa"), "B": list("ababc")})

In [97]:
pd.get_dummies(df)

Unnamed: 0,A_a,B_a,B_b,B_c
0,1,1,0,0
1,1,0,1,0
2,1,1,0,0
3,1,0,1,0
4,1,0,0,1


In [98]:
pd.get_dummies(df, drop_first=True)

Unnamed: 0,B_b,B_c
0,0,0
1,1,0
2,0,0
3,1,0
4,0,1


## Ejemplos

In [99]:
np.random.seed([3, 1415])

In [100]:
n = 20

In [101]:
cols = np.array(["key", "row", "item", "col"])

In [102]:
df = cols + pd.DataFrame(
    (np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str)
)

In [103]:
df.columns = cols

In [104]:
df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix("val"))

In [106]:
df.head(10)

Unnamed: 0,key,row,item,col,val0,val1
0,key0,row3,item1,col3,0.81,0.04
1,key1,row2,item1,col2,0.44,0.07
2,key1,row0,item1,col0,0.77,0.01
3,key0,row4,item0,col2,0.15,0.59
4,key1,row0,item2,col1,0.81,0.64
5,key1,row2,item2,col4,0.13,0.88
6,key2,row4,item1,col3,0.88,0.39
7,key1,row4,item1,col1,0.1,0.07
8,key1,row0,item2,col4,0.65,0.02
9,key1,row2,item0,col2,0.35,0.61


### Pivotar con agregaciones individuales

In [107]:
# Vamos a crear una pivot table en base a la tabla anterior
df.pivot_table(values="val0", index="row", columns="col", aggfunc="mean")

col,col0,col1,col2,col3,col4
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
row0,0.77,0.605,,0.86,0.65
row2,0.13,,0.395,0.5,0.25
row3,,0.31,,0.545,
row4,,0.1,0.395,0.76,0.24


In [108]:
# Podemos utilizar el argumento "fill_value", para reemplazar todos los valores NA
df.pivot_table(
    values="val0",
    index="row",
    columns="col",
    aggfunc="mean",
    fill_value=0,
)

col,col0,col1,col2,col3,col4
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
row0,0.77,0.605,0.0,0.86,0.65
row2,0.13,0.0,0.395,0.5,0.25
row3,0.0,0.31,0.0,0.545,0.0
row4,0.0,0.1,0.395,0.76,0.24


In [109]:
df.pivot_table(
    values="val0",
    index="row",
    columns="col",
    aggfunc="sum",
    fill_value=0,
)

col,col0,col1,col2,col3,col4
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
row0,0.77,1.21,0.0,0.86,0.65
row2,0.13,0.0,0.79,0.5,0.5
row3,0.0,0.31,0.0,1.09,0.0
row4,0.0,0.1,0.79,1.52,0.24


In [110]:
df.pivot_table(index="row", columns="col", fill_value=0, aggfunc="size")

col,col0,col1,col2,col3,col4
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
row0,1,2,0,1,1
row2,1,0,2,1,2
row3,0,1,0,2,0
row4,0,1,2,2,1


## Pivotar con múltiples agregaciones

#### También podemos realizar múltiples agregaciones. Por ejemplo, para realizar una suma y una media, podemos pasar una lista al argumento aggfunc.

In [111]:
df.pivot_table(
    values="val0",
    index="row",
    columns="col",
    aggfunc=["mean", "sum"],
)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,sum,sum,sum,sum,sum
col,col0,col1,col2,col3,col4,col0,col1,col2,col3,col4
row,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
row0,0.77,0.605,,0.86,0.65,0.77,1.21,,0.86,0.65
row2,0.13,,0.395,0.5,0.25,0.13,,0.79,0.5,0.5
row3,,0.31,,0.545,,,0.31,,1.09,
row4,,0.1,0.395,0.76,0.24,,0.1,0.79,1.52,0.24


In [112]:
df.pivot_table(
    values=["val0", "val1"],
    index="row",
    columns="col",
    aggfunc=["mean"],
)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,val0,val0,val0,val0,val0,val1,val1,val1,val1,val1
col,col0,col1,col2,col3,col4,col0,col1,col2,col3,col4
row,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
row0,0.77,0.605,,0.86,0.65,0.01,0.745,,0.01,0.02
row2,0.13,,0.395,0.5,0.25,0.45,,0.34,0.44,0.79
row3,,0.31,,0.545,,,0.23,,0.075,
row4,,0.1,0.395,0.76,0.24,,0.07,0.42,0.3,0.46


In [113]:
df.pivot_table(
    values=["val0"],
    index="row",
    columns=["item", "col"],
    aggfunc=["mean"],
)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,val0,val0,val0,val0,val0,val0,val0,val0,val0,val0,val0,val0
item,item0,item0,item0,item1,item1,item1,item1,item1,item2,item2,item2,item2
col,col2,col3,col4,col0,col1,col2,col3,col4,col0,col1,col3,col4
row,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
row0,,,,0.77,,,,,,0.605,0.86,0.65
row2,0.35,,0.37,,,0.44,,,0.13,,0.5,0.13
row3,,,,,0.31,,0.81,,,,0.28,
row4,0.15,0.64,,,0.1,0.64,0.88,0.24,,,,


## Pasar de una lista a una columna de datos - explode()

In [114]:
keys = ["panda1", "panda2", "panda3"]

In [115]:
values = [["eats", "shoots"], ["shoots", "leaves"], ["eats", "leaves"]]

In [116]:
df = pd.DataFrame({"keys": keys, "values": values})

In [117]:
df

Unnamed: 0,keys,values
0,panda1,"[eats, shoots]"
1,panda2,"[shoots, leaves]"
2,panda3,"[eats, leaves]"


In [118]:
# Para poder separar los valores en una columna, utilizaremos "explode()"
df["values"].explode()

0      eats
0    shoots
1    shoots
1    leaves
2      eats
2    leaves
Name: values, dtype: object

In [119]:
df.explode("values")

Unnamed: 0,keys,values
0,panda1,eats
0,panda1,shoots
1,panda2,shoots
1,panda2,leaves
2,panda3,eats
2,panda3,leaves


In [120]:
df = pd.DataFrame([{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}])

In [121]:
df

Unnamed: 0,var1,var2
0,"a,b,c",1
1,"d,e,f",2


In [122]:
df.assign(var1=df.var1.str.split(",")).explode("var1")

Unnamed: 0,var1,var2
0,a,1
0,b,1
0,c,1
1,d,2
1,e,2
1,f,2
