# Ejercicios de Pandas 

<img src="../images/pandas_animal.jpg">

In [19]:
! pip install pandas



In [20]:
import numpy as np
import pandas as pd

## Estructuras de datos

### Series
- Las series son una matriz etiquetada unidimensional capaz de contener cualquier tipo de datos (enteros, cadenas, números de punto flotante, objetos de Python, etc.)
- Las etiquetas de los ejes se denominan colectivamente índice. 

El método básico para crear una serie es llamar a:
```
s = pd.Series (datos, índice = índice) 
```
Aquí, los datos pueden ser muchas cosas diferentes: 
    - un dict de Python 
    - un ndarray 
    - un valor escalar (como 5) 

El índice pasado es una lista de etiquetas de eje.

#### Series desde un np.array

In [21]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

a   -0.937518
b    1.754940
c   -0.010112
d   -1.769732
e   -1.167569
dtype: float64

In [22]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

Si no indicamos nada en los índices, por defecto se creará una secuencia numérica

In [23]:
pd.Series(np.random.randn(5))

0    0.710221
1    1.448506
2   -0.061820
3    0.386662
4   -0.070793
dtype: float64

#### Series desde un diccionario

In [24]:
d = {'b': 1, 'a': 0, 'c': 2}
pd.Series(d)

b    1
a    0
c    2
dtype: int64

#### Series desde un valor escalar

In [25]:
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

#### Usos de las series
Se pueden usar como un array:

In [26]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [27]:
s[0: 3]

a   -1.794983
b    1.162205
c   -0.263513
dtype: float64

In [28]:
s[s > s.median()]

b    1.162205
d    0.664064
dtype: float64

In [29]:
s.to_numpy()


array([-1.79498287,  1.16220535, -0.26351335,  0.66406399, -0.91814387])

O como un diccionario:

In [30]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

a    0.853985
b    0.323230
c   -1.408038
d    2.321071
e   -0.780502
dtype: float64

In [31]:
s['e'] = 12
s

a     0.853985
b     0.323230
c    -1.408038
d     2.321071
e    12.000000
dtype: float64

## DataFrames

- Es una estructura de datos bidimensional etiquetada con columnas de tipos potencialmente diferentes. 
- Se puede considerar como una hoja de cálculo o una tabla SQL. 
- Generalmente es el objeto pandas más utilizado.
- Al igual que Series, DataFrame acepta muchos tipos diferentes de entrada: diccionario de arrays de 1D, listas, dicts, series u otros DataFrame
- Junto con los datos, opcionalmente puede pasar el índice (etiquetas de fila) y los nombres de las columnas. 
- Si no se pasan las etiquetas de los ejes, se construirán a partir de los datos de entrada basados en reglas propias.

#### DataFrames desde diccionarios de Series o diccionarios normales

In [32]:
import pandas as pd
d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [33]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [34]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


#### DataFrames desde diccionarios de ndarrays / lists

In [35]:
d = {'one': [1., 2., 3., 4.],
     'two': [4., 3., 2., 1.]}
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [36]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [37]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [38]:
pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

Unnamed: 0,A,B,C,D
2013-01-01,-3.353151,-0.014407,-0.004781,0.724638
2013-01-02,-0.96702,0.334202,0.77763,1.098115
2013-01-03,0.499839,-0.504918,-0.385097,0.156298
2013-01-04,1.929412,-1.111304,0.885713,-0.940772
2013-01-05,-0.946143,-0.166227,0.164693,1.693442
2013-01-06,-0.560412,1.599882,-0.262207,-0.747903


In [39]:
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [40]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

#### DataFrames desde listas de diccionarios

In [41]:
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(data2)


Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


#### DataFrames desde diccionarios de tuplas (multiindex!)

In [42]:
print(pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
              ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
              ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
              ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
              ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
     )

       a              b      
       b    a    c    a     b
A B  1.0  4.0  5.0  8.0  10.0
  C  2.0  3.0  6.0  7.0   NaN
  D  NaN  NaN  NaN  NaN   9.0


### Visualización de los datos

In [43]:
import numpy as np
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df 

Unnamed: 0,A,B,C,D
2013-01-01,-1.210454,1.53276,0.971092,1.649423
2013-01-02,-2.113341,-0.103699,0.529289,-0.490216
2013-01-03,0.78032,-1.83878,-1.308783,-1.624337
2013-01-04,0.957576,-1.796472,0.418392,-1.14989
2013-01-05,-0.73693,-0.531873,-0.084335,0.398129
2013-01-06,0.314571,0.852188,-0.346146,1.645365


In [44]:
df.head(10)

Unnamed: 0,A,B,C,D
2013-01-01,-1.210454,1.53276,0.971092,1.649423
2013-01-02,-2.113341,-0.103699,0.529289,-0.490216
2013-01-03,0.78032,-1.83878,-1.308783,-1.624337
2013-01-04,0.957576,-1.796472,0.418392,-1.14989
2013-01-05,-0.73693,-0.531873,-0.084335,0.398129
2013-01-06,0.314571,0.852188,-0.346146,1.645365


In [45]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.957576,-1.796472,0.418392,-1.14989
2013-01-05,-0.73693,-0.531873,-0.084335,0.398129
2013-01-06,0.314571,0.852188,-0.346146,1.645365


In [46]:
df.to_numpy()

array([[-1.21045427,  1.53275972,  0.97109237,  1.64942323],
       [-2.11334113, -0.10369858,  0.52928913, -0.49021623],
       [ 0.78032027, -1.83878036, -1.30878281, -1.62433661],
       [ 0.95757599, -1.79647233,  0.41839186, -1.14989018],
       [-0.73692956, -0.53187292, -0.08433501,  0.39812904],
       [ 0.31457118,  0.8521883 , -0.34614631,  1.64536468]])

In [47]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [48]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.33471,-0.314313,0.029918,0.071412
std,1.218778,1.370029,0.804076,1.396908
min,-2.113341,-1.83878,-1.308783,-1.624337
25%,-1.092073,-1.480322,-0.280693,-0.984972
50%,-0.211179,-0.317786,0.167028,-0.046044
75%,0.663883,0.613217,0.501565,1.333556
max,0.957576,1.53276,0.971092,1.649423


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [50]:
print(f'{df}\n\n')
df.T

                   A         B         C         D
2013-01-01 -1.210454  1.532760  0.971092  1.649423
2013-01-02 -2.113341 -0.103699  0.529289 -0.490216
2013-01-03  0.780320 -1.838780 -1.308783 -1.624337
2013-01-04  0.957576 -1.796472  0.418392 -1.149890
2013-01-05 -0.736930 -0.531873 -0.084335  0.398129
2013-01-06  0.314571  0.852188 -0.346146  1.645365




Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.210454,-2.113341,0.78032,0.957576,-0.73693,0.314571
B,1.53276,-0.103699,-1.83878,-1.796472,-0.531873,0.852188
C,0.971092,0.529289,-1.308783,0.418392,-0.084335,-0.346146
D,1.649423,-0.490216,-1.624337,-1.14989,0.398129,1.645365


In [54]:
print(f'{df}\n\n')

df.sort_index(axis=1, ascending=False)



                   A         B         C         D
2013-01-01 -1.210454  1.532760  0.971092  1.649423
2013-01-02 -2.113341 -0.103699  0.529289 -0.490216
2013-01-03  0.780320 -1.838780 -1.308783 -1.624337
2013-01-04  0.957576 -1.796472  0.418392 -1.149890
2013-01-05 -0.736930 -0.531873 -0.084335  0.398129
2013-01-06  0.314571  0.852188 -0.346146  1.645365




Unnamed: 0,D,C,B,A
2013-01-01,1.649423,0.971092,1.53276,-1.210454
2013-01-02,-0.490216,0.529289,-0.103699,-2.113341
2013-01-03,-1.624337,-1.308783,-1.83878,0.78032
2013-01-04,-1.14989,0.418392,-1.796472,0.957576
2013-01-05,0.398129,-0.084335,-0.531873,-0.73693
2013-01-06,1.645365,-0.346146,0.852188,0.314571


In [61]:
print(f'{df}\n\n')
df.sort_values(by='B')

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




Unnamed: 0,A,B,C,D
2013-01-01,-0.146988,-1.475498,-0.11487,-1.347381
2013-01-05,1.029033,-0.390519,-1.302474,2.111136
2013-01-02,-1.794238,0.037753,1.45593,-1.791382
2013-01-06,-0.244644,0.845592,0.617941,0.059702
2013-01-04,-0.206344,1.045239,-0.641316,-0.903017
2013-01-03,0.143859,1.284888,-0.401138,0.010571


Si seleccionamos una columna, nos va a devolver una Serie:

In [64]:
print(f'{df}\n\n')
df['A']

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




2013-01-01   -0.146988
2013-01-02   -1.794238
2013-01-03    0.143859
2013-01-04   -0.206344
2013-01-05    1.029033
2013-01-06   -0.244644
Freq: D, Name: A, dtype: float64

Si seleccionamos elementos a través de [], nos devolverá las filas:

In [55]:
print(f'{df}\n\n')
df[1:3]

                   A         B         C         D
2013-01-01 -1.210454  1.532760  0.971092  1.649423
2013-01-02 -2.113341 -0.103699  0.529289 -0.490216
2013-01-03  0.780320 -1.838780 -1.308783 -1.624337
2013-01-04  0.957576 -1.796472  0.418392 -1.149890
2013-01-05 -0.736930 -0.531873 -0.084335  0.398129
2013-01-06  0.314571  0.852188 -0.346146  1.645365




Unnamed: 0,A,B,C,D
2013-01-02,-2.113341,-0.103699,0.529289,-0.490216
2013-01-03,0.78032,-1.83878,-1.308783,-1.624337


In [66]:
print(f'{df}\n\n')
df['20130102':'20130104']

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




Unnamed: 0,A,B,C,D
2013-01-02,-1.794238,0.037753,1.45593,-1.791382
2013-01-03,0.143859,1.284888,-0.401138,0.010571
2013-01-04,-0.206344,1.045239,-0.641316,-0.903017


### Filtrado

#### Selección por etiqueta

In [56]:
print(f'{df}\n\n')
print(df.loc[dates[0]])

                   A         B         C         D
2013-01-01 -1.210454  1.532760  0.971092  1.649423
2013-01-02 -2.113341 -0.103699  0.529289 -0.490216
2013-01-03  0.780320 -1.838780 -1.308783 -1.624337
2013-01-04  0.957576 -1.796472  0.418392 -1.149890
2013-01-05 -0.736930 -0.531873 -0.084335  0.398129
2013-01-06  0.314571  0.852188 -0.346146  1.645365


A   -1.210454
B    1.532760
C    0.971092
D    1.649423
Name: 2013-01-01 00:00:00, dtype: float64


In [58]:
print(f'{df}\n\n')
print(df.loc[:, ['A', 'B']])

                   A         B         C         D
2013-01-01 -1.210454  1.532760  0.971092  1.649423
2013-01-02 -2.113341 -0.103699  0.529289 -0.490216
2013-01-03  0.780320 -1.838780 -1.308783 -1.624337
2013-01-04  0.957576 -1.796472  0.418392 -1.149890
2013-01-05 -0.736930 -0.531873 -0.084335  0.398129
2013-01-06  0.314571  0.852188 -0.346146  1.645365


                   A         B
2013-01-01 -1.210454  1.532760
2013-01-02 -2.113341 -0.103699
2013-01-03  0.780320 -1.838780
2013-01-04  0.957576 -1.796472
2013-01-05 -0.736930 -0.531873
2013-01-06  0.314571  0.852188


In [68]:
print(f'{df}\n\n')
df.loc['20130102':'20130104', ['A', 'B']]

                   A         B         C         D
2013-01-01 -1.210454  1.532760  0.971092  1.649423
2013-01-02 -2.113341 -0.103699  0.529289 -0.490216
2013-01-03  0.780320 -1.838780 -1.308783 -1.624337
2013-01-04  0.957576 -1.796472  0.418392 -1.149890
2013-01-05 -0.736930 -0.531873 -0.084335  0.398129
2013-01-06  0.314571  0.852188 -0.346146  1.645365




Unnamed: 0,A,B
2013-01-02,-2.113341,-0.103699
2013-01-03,0.78032,-1.83878
2013-01-04,0.957576,-1.796472


In [70]:
print(f'{df}\n\n')
df.at[dates[0], 'A']

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




-0.14698768898630427

#### Selección por posición

In [71]:
print(f'{df}\n\n')
df.iloc[3]

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




A   -0.206344
B    1.045239
C   -0.641316
D   -0.903017
Name: 2013-01-04 00:00:00, dtype: float64

In [72]:
print(f'{df}\n\n')
df.iloc[3:5, 0:2]

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




Unnamed: 0,A,B
2013-01-04,-0.206344,1.045239
2013-01-05,1.029033,-0.390519


In [73]:
print(f'{df}\n\n')
df.iloc[[1, 2, 4], [0, 2]]

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




Unnamed: 0,A,C
2013-01-02,-1.794238,1.45593
2013-01-03,0.143859,-0.401138
2013-01-05,1.029033,-1.302474


In [74]:
print(f'{df}\n\n')
df.iat[1, 1]

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




0.03775276553916763

#### Selección por condición (boolean indexing)

In [76]:
print(f'{df}\n\n')
df[df.A > 0]

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




Unnamed: 0,A,B,C,D
2013-01-03,0.143859,1.284888,-0.401138,0.010571
2013-01-05,1.029033,-0.390519,-1.302474,2.111136


In [77]:
print(f'{df}\n\n')
df[df > 0]

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,0.037753,1.45593,
2013-01-03,0.143859,1.284888,,0.010571
2013-01-04,,1.045239,,
2013-01-05,1.029033,,,2.111136
2013-01-06,,0.845592,0.617941,0.059702


Podemos usar el operador isin() para filtrar de una lista de valores

In [78]:
print(f'{df}\n\n')
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




Unnamed: 0,A,B,C,D,E
2013-01-01,-0.146988,-1.475498,-0.11487,-1.347381,one
2013-01-02,-1.794238,0.037753,1.45593,-1.791382,one
2013-01-03,0.143859,1.284888,-0.401138,0.010571,two
2013-01-04,-0.206344,1.045239,-0.641316,-0.903017,three
2013-01-05,1.029033,-0.390519,-1.302474,2.111136,four
2013-01-06,-0.244644,0.845592,0.617941,0.059702,three


In [79]:
print(f'{df}\n\n')
df2[df2['E'].isin(['two', 'four'])]

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




Unnamed: 0,A,B,C,D,E
2013-01-03,0.143859,1.284888,-0.401138,0.010571,two
2013-01-05,1.029033,-0.390519,-1.302474,2.111136,four


### Operaciones básicas

In [80]:
print(f'{df}\n\n')
df.mean()

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




A   -0.203220
B    0.224576
C   -0.064321
D   -0.310062
dtype: float64

In [81]:
df.mean(1)

2013-01-01   -0.771184
2013-01-02   -0.522984
2013-01-03    0.259545
2013-01-04   -0.176360
2013-01-05    0.361794
2013-01-06    0.319648
Freq: D, dtype: float64

In [85]:
# Return the cumulative sum of the elements along a given axis

print(f'{df}\n\n')
df.apply(np.cumsum)

                   A         B         C         D
2013-01-01 -0.146988 -1.475498 -0.114870 -1.347381
2013-01-02 -1.794238  0.037753  1.455930 -1.791382
2013-01-03  0.143859  1.284888 -0.401138  0.010571
2013-01-04 -0.206344  1.045239 -0.641316 -0.903017
2013-01-05  1.029033 -0.390519 -1.302474  2.111136
2013-01-06 -0.244644  0.845592  0.617941  0.059702




Unnamed: 0,A,B,C,D
2013-01-01,-0.146988,-1.475498,-0.11487,-1.347381
2013-01-02,-1.941226,-1.437745,1.34106,-3.138763
2013-01-03,-1.797367,-0.152857,0.939922,-3.128192
2013-01-04,-2.003711,0.892382,0.298606,-4.031209
2013-01-05,-0.974678,0.501862,-1.003868,-1.920074
2013-01-06,-1.219322,1.347454,-0.385926,-1.860372


#### Histogramas

In [83]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    5
1    1
2    3
3    5
4    3
5    5
6    1
7    2
8    6
9    1
dtype: int64

In [86]:
s.value_counts()

5    3
1    3
3    2
6    1
2    1
dtype: int64

#### Agrupar dataframes
Pandas proporciona varias facilidades para combinar fácilmente los objetos Series, DataFrame y Panel con varios tipos de lógica de conjunto para los índices y la funcionalidad de álgebra relacional en el caso de operaciones de join / merge

In [87]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.127023,-1.07721,-1.495688,0.296253
1,0.351437,-0.628223,-0.167878,-0.74192
2,1.337147,1.527646,-1.73648,-0.086348
3,-0.035537,0.462223,1.195604,0.858935
4,-2.458707,0.896013,-0.382886,-0.508811
5,0.014639,0.187263,1.225029,-0.066607
6,-0.661612,-0.37558,0.397379,1.281453
7,-0.312275,-0.213588,2.548251,1.17208
8,-0.729266,-0.483419,0.72079,0.541362
9,1.18129,0.606431,0.645601,0.201969


In [89]:
print(f'{df}\n\n')
pieces = [df[:3], df[3:7], df[7:]]
pieces

          0         1         2         3
0  0.127023 -1.077210 -1.495688  0.296253
1  0.351437 -0.628223 -0.167878 -0.741920
2  1.337147  1.527646 -1.736480 -0.086348
3 -0.035537  0.462223  1.195604  0.858935
4 -2.458707  0.896013 -0.382886 -0.508811
5  0.014639  0.187263  1.225029 -0.066607
6 -0.661612 -0.375580  0.397379  1.281453
7 -0.312275 -0.213588  2.548251  1.172080
8 -0.729266 -0.483419  0.720790  0.541362
9  1.181290  0.606431  0.645601  0.201969




[          0         1         2         3
 0  0.127023 -1.077210 -1.495688  0.296253
 1  0.351437 -0.628223 -0.167878 -0.741920
 2  1.337147  1.527646 -1.736480 -0.086348,
           0         1         2         3
 3 -0.035537  0.462223  1.195604  0.858935
 4 -2.458707  0.896013 -0.382886 -0.508811
 5  0.014639  0.187263  1.225029 -0.066607
 6 -0.661612 -0.375580  0.397379  1.281453,
           0         1         2         3
 7 -0.312275 -0.213588  2.548251  1.172080
 8 -0.729266 -0.483419  0.720790  0.541362
 9  1.181290  0.606431  0.645601  0.201969]

In [90]:
print(f'{pieces}\n\n')
pd.concat(pieces)

[          0         1         2         3
0  0.127023 -1.077210 -1.495688  0.296253
1  0.351437 -0.628223 -0.167878 -0.741920
2  1.337147  1.527646 -1.736480 -0.086348,           0         1         2         3
3 -0.035537  0.462223  1.195604  0.858935
4 -2.458707  0.896013 -0.382886 -0.508811
5  0.014639  0.187263  1.225029 -0.066607
6 -0.661612 -0.375580  0.397379  1.281453,           0         1         2         3
7 -0.312275 -0.213588  2.548251  1.172080
8 -0.729266 -0.483419  0.720790  0.541362
9  1.181290  0.606431  0.645601  0.201969]




Unnamed: 0,0,1,2,3
0,0.127023,-1.07721,-1.495688,0.296253
1,0.351437,-0.628223,-0.167878,-0.74192
2,1.337147,1.527646,-1.73648,-0.086348
3,-0.035537,0.462223,1.195604,0.858935
4,-2.458707,0.896013,-0.382886,-0.508811
5,0.014639,0.187263,1.225029,-0.066607
6,-0.661612,-0.37558,0.397379,1.281453
7,-0.312275,-0.213588,2.548251,1.17208
8,-0.729266,-0.483419,0.72079,0.541362
9,1.18129,0.606431,0.645601,0.201969


In [92]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print(f'{left} \n\n')
print(right)

   key  lval
0  foo     1
1  bar     2 


   key  rval
0  foo     4
1  bar     5


In [93]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [94]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,-0.548861,-0.111712,0.40242,0.31077
1,-0.641012,0.881862,0.340285,-0.051068
2,-2.833875,-0.428328,-0.105955,2.4912
3,0.999763,0.664696,-2.579138,-0.862724
4,1.085667,0.827074,0.860993,-2.288346
5,-1.286304,-0.212147,0.592805,1.131484
6,1.141034,0.601052,0.496447,-0.461918
7,0.648167,1.055235,-1.048979,0.103533


In [95]:
s = df.iloc[3]
df.append(s, ignore_index=True)

Unnamed: 0,A,B,C,D
0,-0.548861,-0.111712,0.40242,0.31077
1,-0.641012,0.881862,0.340285,-0.051068
2,-2.833875,-0.428328,-0.105955,2.4912
3,0.999763,0.664696,-2.579138,-0.862724
4,1.085667,0.827074,0.860993,-2.288346
5,-1.286304,-0.212147,0.592805,1.131484
6,1.141034,0.601052,0.496447,-0.461918
7,0.648167,1.055235,-1.048979,0.103533
8,0.999763,0.664696,-2.579138,-0.862724


#### Group by

In [96]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.501056,-2.924574
1,bar,one,-0.549976,0.017807
2,foo,two,1.014823,0.018131
3,bar,three,0.733106,1.056786
4,foo,two,0.004953,1.806528
5,bar,two,-1.63198,1.502073
6,foo,one,1.494127,-0.304147
7,foo,three,-0.589815,-0.690538


In [97]:
print(f'{df}\n\n')
df.groupby('A').sum()

     A      B         C         D
0  foo    one -0.501056 -2.924574
1  bar    one -0.549976  0.017807
2  foo    two  1.014823  0.018131
3  bar  three  0.733106  1.056786
4  foo    two  0.004953  1.806528
5  bar    two -1.631980  1.502073
6  foo    one  1.494127 -0.304147
7  foo  three -0.589815 -0.690538




Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.448851,2.576665
foo,1.423032,-2.0946


In [98]:
print(f'{df}\n\n')
df.groupby(['A', 'B']).sum()

     A      B         C         D
0  foo    one -0.501056 -2.924574
1  bar    one -0.549976  0.017807
2  foo    two  1.014823  0.018131
3  bar  three  0.733106  1.056786
4  foo    two  0.004953  1.806528
5  bar    two -1.631980  1.502073
6  foo    one  1.494127 -0.304147
7  foo  three -0.589815 -0.690538




Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.549976,0.017807
bar,three,0.733106,1.056786
bar,two,-1.63198,1.502073
foo,one,0.993071,-3.228722
foo,three,-0.589815,-0.690538
foo,two,1.019776,1.82466
