In [2]:
# Pandas es el Excel de Python.
# Obtenido de: https://pandas.pydata.org/docs/user_guide/10min.html

import pandas as pd
import numpy as np

In [3]:
# A single pandas column is called a Series; every element has an index label.
# np.nan stands for a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [4]:
# The index can be made of dates: six consecutive days (daily frequency).
fechas = pd.date_range(start="20130101", periods=6, freq="D")
print(fechas)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


In [5]:
# A Series labelled with the dates in `fechas` and carrying the name "datos".
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
    index=fechas,
    name="datos",
)
print(s)

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, Name: datos, dtype: float64


In [6]:
# A DataFrame is a set of columns.
# Here it is built from a 6x4 NumPy array of standard-normal random numbers,
# using the dates in `fechas` as row labels and A-D as column names.
# NOTE(review): no random seed is set, so the values change on every run.
df = pd.DataFrame(
    data=np.random.standard_normal(size=(6, 4)),
    index=fechas,
    columns=["A", "B", "C", "D"],
)
print(df)

                   A         B         C         D
2013-01-01 -0.849531 -1.218918 -0.782622  0.340801
2013-01-02  0.900871  0.312723 -0.151387 -0.000137
2013-01-03 -0.389075 -1.201781  0.320880  1.703846
2013-01-04  0.275994 -0.308872 -1.123060 -0.235346
2013-01-05 -0.706344 -1.444777 -2.783718 -0.068565
2013-01-06 -1.565061  1.288668  1.714380 -0.865340


In [7]:
# head() and tail() show the top and the bottom of the DataFrame;
# five rows are requested from each end.
print(df.head(n=5))
print(df.tail(n=5))

                   A         B         C         D
2013-01-01 -0.849531 -1.218918 -0.782622  0.340801
2013-01-02  0.900871  0.312723 -0.151387 -0.000137
2013-01-03 -0.389075 -1.201781  0.320880  1.703846
2013-01-04  0.275994 -0.308872 -1.123060 -0.235346
2013-01-05 -0.706344 -1.444777 -2.783718 -0.068565
                   A         B         C         D
2013-01-02  0.900871  0.312723 -0.151387 -0.000137
2013-01-03 -0.389075 -1.201781  0.320880  1.703846
2013-01-04  0.275994 -0.308872 -1.123060 -0.235346
2013-01-05 -0.706344 -1.444777 -2.783718 -0.068565
2013-01-06 -1.565061  1.288668  1.714380 -0.865340


In [8]:
# Both the row labels (index) and the column labels can be accessed directly.
print(df.index, df.columns, sep="\n")

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [9]:
# Transpose: rows become columns and columns become rows.
print(df.transpose())

   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A   -0.849531    0.900871   -0.389075    0.275994   -0.706344   -1.565061
B   -1.218918    0.312723   -1.201781   -0.308872   -1.444777    1.288668
C   -0.782622   -0.151387    0.320880   -1.123060   -2.783718    1.714380
D    0.340801   -0.000137    1.703846   -0.235346   -0.068565   -0.865340


In [10]:
# Convert the DataFrame values to a NumPy array.
arreglo = df.to_numpy()
print(arreglo)

[[-8.49530715e-01 -1.21891776e+00 -7.82622448e-01  3.40800951e-01]
 [ 9.00871372e-01  3.12722775e-01 -1.51387499e-01 -1.36875919e-04]
 [-3.89075337e-01 -1.20178059e+00  3.20879696e-01  1.70384634e+00]
 [ 2.75993670e-01 -3.08871628e-01 -1.12305956e+00 -2.35345679e-01]
 [-7.06344377e-01 -1.44477676e+00 -2.78371839e+00 -6.85648204e-02]
 [-1.56506137e+00  1.28866829e+00  1.71437970e+00 -8.65340440e-01]]


In [11]:
# Descriptive statistics per column: count, mean, std, min, quartiles and max.
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.388858 -0.428826 -0.467588  0.145877
std    0.871998  1.074051  1.508376  0.860286
min   -1.565061 -1.444777 -2.783718 -0.865340
25%   -0.813734 -1.214633 -1.037950 -0.193650
50%   -0.547710 -0.755326 -0.467005 -0.034351
75%    0.109726  0.157324  0.202813  0.255566
max    0.900871  1.288668  1.714380  1.703846


In [12]:
# Sorting data.
# By index (row labels), in descending order.
print(df.sort_index(axis=0, ascending=False))

                   A         B         C         D
2013-01-06 -1.565061  1.288668  1.714380 -0.865340
2013-01-05 -0.706344 -1.444777 -2.783718 -0.068565
2013-01-04  0.275994 -0.308872 -1.123060 -0.235346
2013-01-03 -0.389075 -1.201781  0.320880  1.703846
2013-01-02  0.900871  0.312723 -0.151387 -0.000137
2013-01-01 -0.849531 -1.218918 -0.782622  0.340801


In [13]:
# By the values of a column, in ascending order.
print(df.sort_values(by=["B"], ascending=True))

                   A         B         C         D
2013-01-05 -0.706344 -1.444777 -2.783718 -0.068565
2013-01-01 -0.849531 -1.218918 -0.782622  0.340801
2013-01-03 -0.389075 -1.201781  0.320880  1.703846
2013-01-04  0.275994 -0.308872 -1.123060 -0.235346
2013-01-02  0.900871  0.312723 -0.151387 -0.000137
2013-01-06 -1.565061  1.288668  1.714380 -0.865340


In [14]:
# Select a single column; the result is a Series.
print(df.loc[:, "A"])

2013-01-01   -0.849531
2013-01-02    0.900871
2013-01-03   -0.389075
2013-01-04    0.275994
2013-01-05   -0.706344
2013-01-06   -1.565061
Freq: D, Name: A, dtype: float64


In [15]:
# Select one row by its index label (here, the first label of the index).
primera_etiqueta = df.index[0]
print(df.loc[primera_etiqueta])

A   -0.849531
B   -1.218918
C   -0.782622
D    0.340801
Name: 2013-01-01 00:00:00, dtype: float64


In [16]:
# Select one row by its integer position.
print(df.iloc[0, :])

A   -0.849531
B   -1.218918
C   -0.782622
D    0.340801
Name: 2013-01-01 00:00:00, dtype: float64


In [17]:
# Select a single value by row label and column label.
print(df.at[df.index[2], "B"])

-1.2017805855427672


In [18]:
# Select a single value by row position and column position.
print(df.iat[2, 1])

-1.2017805855427672


In [19]:
# Select a rectangular subsection of the DataFrame by position:
# rows 2-3 and columns 1-2 (slice ends are exclusive).
filas = slice(2, 4)
columnas = slice(1, 3)
df_2 = df.iloc[filas, columnas]
print(df_2)

                   B        C
2013-01-03 -1.201781  0.32088
2013-01-04 -0.308872 -1.12306


In [20]:
# Rows at positions 2-3 with every column.
df_2 = df.iloc[2:4]
print(df_2)

# NOTE(review): the original remark stated that copies and views work just as
# in NumPy. pandas does not guarantee whether a selection is a view or a copy,
# so call .copy() when an independent DataFrame is needed.

                   A         B        C         D
2013-01-03 -0.389075 -1.201781  0.32088  1.703846
2013-01-04  0.275994 -0.308872 -1.12306 -0.235346


In [21]:
# Filtering by condition.
# On one column: keep the rows whose "A" value is positive.
print(df.loc[df["A"].gt(0)])

                   A         B         C         D
2013-01-02  0.900871  0.312723 -0.151387 -0.000137
2013-01-04  0.275994 -0.308872 -1.123060 -0.235346


In [22]:
# On the whole DataFrame: values that fail the condition become NaN.
print(df.where(df > 0))

                   A         B        C         D
2013-01-01       NaN       NaN      NaN  0.340801
2013-01-02  0.900871  0.312723      NaN       NaN
2013-01-03       NaN       NaN  0.32088  1.703846
2013-01-04  0.275994       NaN      NaN       NaN
2013-01-05       NaN       NaN      NaN       NaN
2013-01-06       NaN  1.288668  1.71438       NaN


In [23]:
# Adding columns.
# All elements of one column share a type,
# while different columns may hold different types.
etiquetas = ["uno", "uno", "dos", "tres", "cuatro", "tres"]
df["E"] = etiquetas
print(df)

                   A         B         C         D       E
2013-01-01 -0.849531 -1.218918 -0.782622  0.340801     uno
2013-01-02  0.900871  0.312723 -0.151387 -0.000137     uno
2013-01-03 -0.389075 -1.201781  0.320880  1.703846     dos
2013-01-04  0.275994 -0.308872 -1.123060 -0.235346    tres
2013-01-05 -0.706344 -1.444777 -2.783718 -0.068565  cuatro
2013-01-06 -1.565061  1.288668  1.714380 -0.865340    tres


In [24]:
# A new column can also come from an array: six evenly spaced values from 1 to 5.
df["F"] = np.linspace(start=1, stop=5, num=6)
print(df.head(n=5))

                   A         B         C         D       E    F
2013-01-01 -0.849531 -1.218918 -0.782622  0.340801     uno  1.0
2013-01-02  0.900871  0.312723 -0.151387 -0.000137     uno  1.8
2013-01-03 -0.389075 -1.201781  0.320880  1.703846     dos  2.6
2013-01-04  0.275994 -0.308872 -1.123060 -0.235346    tres  3.4
2013-01-05 -0.706344 -1.444777 -2.783718 -0.068565  cuatro  4.2


In [25]:
# Look up rows whose column value is one of a given set, using isin().
coincide = df["E"].isin(["dos", "cuatro"])
print(df.loc[coincide])

                   A         B         C         D       E    F
2013-01-03 -0.389075 -1.201781  0.320880  1.703846     dos  2.6
2013-01-05 -0.706344 -1.444777 -2.783718 -0.068565  cuatro  4.2


In [26]:
# A specific cell can be overwritten (row position 0, column position 1).
df.iat[0, 1] = 10
print(df)

                   A          B         C         D       E    F
2013-01-01 -0.849531  10.000000 -0.782622  0.340801     uno  1.0
2013-01-02  0.900871   0.312723 -0.151387 -0.000137     uno  1.8
2013-01-03 -0.389075  -1.201781  0.320880  1.703846     dos  2.6
2013-01-04  0.275994  -0.308872 -1.123060 -0.235346    tres  3.4
2013-01-05 -0.706344  -1.444777 -2.783718 -0.068565  cuatro  4.2
2013-01-06 -1.565061   1.288668  1.714380 -0.865340    tres  5.0


In [27]:
# Remove a column; drop() returns a new DataFrame that is bound back to `df`.
df = df.drop(columns=["E"])
print(df)

                   A          B         C         D    F
2013-01-01 -0.849531  10.000000 -0.782622  0.340801  1.0
2013-01-02  0.900871   0.312723 -0.151387 -0.000137  1.8
2013-01-03 -0.389075  -1.201781  0.320880  1.703846  2.6
2013-01-04  0.275994  -0.308872 -1.123060 -0.235346  3.4
2013-01-05 -0.706344  -1.444777 -2.783718 -0.068565  4.2
2013-01-06 -1.565061   1.288668  1.714380 -0.865340  5.0


In [28]:
# Assign through a condition: every negative value is replaced by 0.
df[df.lt(0)] = 0
print(df)

                   A          B        C         D    F
2013-01-01  0.000000  10.000000  0.00000  0.340801  1.0
2013-01-02  0.900871   0.312723  0.00000  0.000000  1.8
2013-01-03  0.000000   0.000000  0.32088  1.703846  2.6
2013-01-04  0.275994   0.000000  0.00000  0.000000  3.4
2013-01-05  0.000000   0.000000  0.00000  0.000000  4.2
2013-01-06  0.000000   1.288668  1.71438  0.000000  5.0


In [29]:
# Introduce some missing values: everything greater than 2 becomes NaN.
df[df.gt(2)] = np.nan
print(df)

                   A         B        C         D    F
2013-01-01  0.000000       NaN  0.00000  0.340801  1.0
2013-01-02  0.900871  0.312723  0.00000  0.000000  1.8
2013-01-03  0.000000  0.000000  0.32088  1.703846  NaN
2013-01-04  0.275994  0.000000  0.00000  0.000000  NaN
2013-01-05  0.000000  0.000000  0.00000  0.000000  NaN
2013-01-06  0.000000  1.288668  1.71438  0.000000  NaN


In [30]:
# Drop every row that contains at least one missing value.
print(df.dropna(axis=0, how="any"))

# Replace missing values with a fill value.
print(df.fillna(value=-100))

# Boolean mask marking where the missing values are.
print(df.isna())

                   A         B    C    D    F
2013-01-02  0.900871  0.312723  0.0  0.0  1.8
                   A           B        C         D      F
2013-01-01  0.000000 -100.000000  0.00000  0.340801    1.0
2013-01-02  0.900871    0.312723  0.00000  0.000000    1.8
2013-01-03  0.000000    0.000000  0.32088  1.703846 -100.0
2013-01-04  0.275994    0.000000  0.00000  0.000000 -100.0
2013-01-05  0.000000    0.000000  0.00000  0.000000 -100.0
2013-01-06  0.000000    1.288668  1.71438  0.000000 -100.0
                A      B      C      D      F
2013-01-01  False   True  False  False  False
2013-01-02  False  False  False  False  False
2013-01-03  False  False  False  False   True
2013-01-04  False  False  False  False   True
2013-01-05  False  False  False  False   True
2013-01-06  False  False  False  False   True


In [31]:
# Operations.

# Between columns: element-wise sum of A and C stored as a new column G.
df["G"] = df["A"].add(df["C"])
print(df)

# Mean of every column, then mean of every row.
print(df.mean(axis=0))
print(df.mean(axis=1))

                   A         B        C         D    F         G
2013-01-01  0.000000       NaN  0.00000  0.340801  1.0  0.000000
2013-01-02  0.900871  0.312723  0.00000  0.000000  1.8  0.900871
2013-01-03  0.000000  0.000000  0.32088  1.703846  NaN  0.320880
2013-01-04  0.275994  0.000000  0.00000  0.000000  NaN  0.275994
2013-01-05  0.000000  0.000000  0.00000  0.000000  NaN  0.000000
2013-01-06  0.000000  1.288668  1.71438  0.000000  NaN  1.714380
A    0.196144
B    0.320278
C    0.339210
D    0.340775
F    1.400000
G    0.535354
dtype: float64
2013-01-01    0.268160
2013-01-02    0.652411
2013-01-03    0.469121
2013-01-04    0.110397
2013-01-05    0.000000
2013-01-06    0.943486
Freq: D, dtype: float64


In [32]:
# Apply a function to each column of the DataFrame.
# np.cumsum() computes a cumulative sum.
# NOTE(review): pandas also provides DataFrame.cumsum() for this specific case.
df.apply(np.cumsum, axis=0)

Unnamed: 0,A,B,C,D,F,G
2013-01-01,0.0,,0.0,0.340801,1.0,0.0
2013-01-02,0.900871,0.312723,0.0,0.340801,2.8,0.900871
2013-01-03,0.900871,0.312723,0.32088,2.044647,,1.221751
2013-01-04,1.176865,0.312723,0.32088,2.044647,,1.497745
2013-01-05,1.176865,0.312723,0.32088,2.044647,,1.497745
2013-01-06,1.176865,1.601391,2.035259,2.044647,,3.212124


In [33]:
# A histogram can be obtained from a Series.
# Start with ten random integers drawn from 0 to 6 (upper bound 7 is exclusive).
s = pd.Series(np.random.randint(low=0, high=7, size=10))
print(s)


0    1
1    1
2    4
3    5
4    5
5    1
6    1
7    0
8    0
9    6
dtype: int64


In [34]:
# Count how many times each distinct value occurs.
conteos = s.value_counts()
print(conteos)

1    4
5    2
0    2
4    1
6    1
dtype: int64


In [35]:
# Grouping data.
# Rebuild `fechas` with 200 consecutive days and `df` with 200x4
# standard-normal random values (no seed is set, so values vary per run).
fechas = pd.date_range(start="20130101", periods=200, freq="D")
df = pd.DataFrame(
    data=np.random.standard_normal(size=(200, 4)),
    index=fechas,
    columns=["A", "B", "C", "D"],
)
print(df)

                   A         B         C         D
2013-01-01  0.708496  0.769959 -0.267129 -1.360763
2013-01-02  0.783382  0.295696 -0.085690  0.429883
2013-01-03  0.590373  0.420901 -0.595486 -1.002284
2013-01-04  0.519364  0.585781  1.501747 -0.619531
2013-01-05 -0.217893 -0.769042 -0.331876  1.173758
...              ...       ...       ...       ...
2013-07-15  0.194318 -1.387961 -0.360975 -1.589907
2013-07-16 -0.300580  0.405295  0.755332  0.723973
2013-07-17 -1.414695  0.646161 -1.631316 -0.551761
2013-07-18 -0.100474  1.121226 -1.011722 -0.745656
2013-07-19 -0.093251  0.169300 -1.333324 -0.295179

[200 rows x 4 columns]


In [36]:
# Group the rows by the month of their index date and sum each group.
meses = df.index.month
df.groupby(by=meses).sum()

Unnamed: 0,A,B,C,D
1,3.829662,-1.909864,7.531364,-7.410341
2,2.445226,-3.60535,-3.073325,7.861396
3,2.303539,-7.676941,4.28195,4.680042
4,-3.803042,1.92327,-0.754348,-3.807958
5,1.658795,-6.067036,6.651172,-1.002043
6,0.627518,3.087443,-3.335207,-7.023783
7,-3.248636,0.928429,-5.516293,-4.706233


In [37]:
# Group the rows by the day of the month of their index date and average each group.
dias = df.index.day
df.groupby(by=dias).mean()

Unnamed: 0,A,B,C,D
1,0.062025,-0.810052,0.036109,-0.787425
2,0.090926,-0.328685,-0.13217,-0.057975
3,0.414145,0.190042,-0.250442,-0.244959
4,0.444868,0.069236,0.408478,-0.317428
5,-0.523164,-0.0912,-0.53708,0.323205
6,0.127317,-0.874091,-0.199101,0.155689
7,-0.033239,0.096372,0.232898,0.4315
8,0.056326,-0.313328,0.342732,0.318079
9,0.193156,0.502172,-0.252693,-0.265346
10,0.201453,-0.465677,-0.028036,0.364933


In [38]:
# pandas is very flexible when handling dates for time series.
# Build 200 timestamps one second apart and attach a random integer
# in [0, 500) to each of them.
# "s" is the current frequency alias for seconds; the uppercase "S" is
# deprecated since pandas 2.2 and emits a FutureWarning there.
rango = pd.date_range("1/1/2012", periods = 200, freq = "s")
ts = pd.Series(np.random.randint( 0, 500, len(rango) ), index = rango)

print(ts)

2012-01-01 00:00:00    439
2012-01-01 00:00:01     29
2012-01-01 00:00:02     39
2012-01-01 00:00:03    477
2012-01-01 00:00:04    227
                      ... 
2012-01-01 00:03:15    243
2012-01-01 00:03:16     62
2012-01-01 00:03:17    239
2012-01-01 00:03:18    465
2012-01-01 00:03:19    226
Freq: S, Length: 200, dtype: int64


In [39]:
# Resample: regroup the series into one-minute bins and sum each bin.
por_minuto = ts.resample("1Min")
print(por_minuto.sum())

2012-01-01 00:00:00    15309
2012-01-01 00:01:00    13791
2012-01-01 00:02:00    15216
2012-01-01 00:03:00     3856
Freq: T, dtype: int64


In [41]:
# Reading and writing files is straightforward with the CSV format,
# which Excel can open as well.

# Write: label the index and the values, then save the series to disk.
ts.index.name = "Timestamp"
ts.name = "Datos"
ts.to_csv(path_or_buf="time_series.csv")


In [44]:
# Read the file back, using the "Timestamp" column as the index.
# No date parsing is requested here, so the index is read as plain strings.
ts_2 = pd.read_csv(
    filepath_or_buffer="time_series.csv",
    index_col="Timestamp",
)
print(ts_2)

               Datos
Timestamp           
01/01/12 0:00    439
01/01/12 0:00     29
01/01/12 0:00     39
01/01/12 0:00    477
01/01/12 0:00    227
...              ...
01/01/12 0:03    243
01/01/12 0:03     62
01/01/12 0:03    239
01/01/12 0:03    465
01/01/12 0:03    226

[200 rows x 1 columns]


In [46]:
# Inspect the index of the frame that was read from disk.
indice_leido = ts_2.index
print(indice_leido)

Index(['01/01/12 0:00', '01/01/12 0:00', '01/01/12 0:00', '01/01/12 0:00',
       '01/01/12 0:00', '01/01/12 0:00', '01/01/12 0:00', '01/01/12 0:00',
       '01/01/12 0:00', '01/01/12 0:00',
       ...
       '01/01/12 0:03', '01/01/12 0:03', '01/01/12 0:03', '01/01/12 0:03',
       '01/01/12 0:03', '01/01/12 0:03', '01/01/12 0:03', '01/01/12 0:03',
       '01/01/12 0:03', '01/01/12 0:03'],
      dtype='object', name='Timestamp', length=200)


In [50]:
# Convert the string index into a DatetimeIndex.
# NOTE(review): no explicit format is given, so pandas infers it; for strings
# such as '01/01/12 0:00' the day/month order is ambiguous - confirm and pass
# format= if needed. The recorded output also shows minute resolution only,
# so the parsed index contains repeated timestamps.
indice_fechas = pd.to_datetime(ts_2.index)
ts_2.index = indice_fechas
print(ts_2.index)

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:00',
               '2012-01-01 00:00:00', '2012-01-01 00:00:00',
               '2012-01-01 00:00:00', '2012-01-01 00:00:00',
               '2012-01-01 00:00:00', '2012-01-01 00:00:00',
               '2012-01-01 00:00:00', '2012-01-01 00:00:00',
               ...
               '2012-01-01 00:03:00', '2012-01-01 00:03:00',
               '2012-01-01 00:03:00', '2012-01-01 00:03:00',
               '2012-01-01 00:03:00', '2012-01-01 00:03:00',
               '2012-01-01 00:03:00', '2012-01-01 00:03:00',
               '2012-01-01 00:03:00', '2012-01-01 00:03:00'],
              dtype='datetime64[ns]', name='Timestamp', length=200, freq=None)


In [None]:
# Cargar el archivo Electrico_residencial_2010_2017
# Selecciona las entradas correspondientes a Baja California.
# Ordena los datos de menor a mayor consumo en 2010.
# Selecciona los municipios con una cantidad de usuarios en 2012 menor a 100.
# Encuentra el municipio con mayor consumo en tarifa DAC.