In [2]:
# Pandas es el Excel de python.
# Obtenido de: https://pandas.pydata.org/docs/user_guide/10min.html

import pandas as pd
import numpy as np

In [3]:
# A single pandas column is called a Series; every element has an index label.
# np.nan marks a missing value.

s = pd.Series(data = [1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [4]:
# The index can be made of dates: six consecutive calendar days.

fechas = pd.date_range(start = "20130101", periods = 6, freq = "D")

print(fechas)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


In [5]:
# Same values as before, now labelled by the date index and given a name.
s = pd.Series(
    data = [1, 3, 5, np.nan, 6, 8],
    index = fechas,
    name = "datos",
)

print(s)

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, Name: datos, dtype: float64


In [6]:
# A DataFrame is a set of columns that share one index.

# Built here from a 6x4 NumPy array of random draws, labelled with the
# date index and four column names.
df = pd.DataFrame(
    data = np.random.randn(6, 4),
    index = fechas,
    columns = ["A", "B", "C", "D"],
)

print(df)

                   A         B         C         D
2013-01-01 -0.688920  0.706461  0.038464 -0.557300
2013-01-02  0.024111  2.587698  1.121242  0.468241
2013-01-03 -1.350958  0.550008  0.007322  0.723728
2013-01-04 -0.861917 -0.803071  0.453286  1.802179
2013-01-05  1.007735  0.759993 -1.356845  0.107955
2013-01-06  1.684444  2.478518 -0.120551 -1.314997


In [7]:
# head() and tail() return the top and bottom rows of the DataFrame.

print(df.head(), df.tail(), sep = "\n")

                   A         B         C         D
2013-01-01 -0.688920  0.706461  0.038464 -0.557300
2013-01-02  0.024111  2.587698  1.121242  0.468241
2013-01-03 -1.350958  0.550008  0.007322  0.723728
2013-01-04 -0.861917 -0.803071  0.453286  1.802179
2013-01-05  1.007735  0.759993 -1.356845  0.107955
                   A         B         C         D
2013-01-02  0.024111  2.587698  1.121242  0.468241
2013-01-03 -1.350958  0.550008  0.007322  0.723728
2013-01-04 -0.861917 -0.803071  0.453286  1.802179
2013-01-05  1.007735  0.759993 -1.356845  0.107955
2013-01-06  1.684444  2.478518 -0.120551 -1.314997


In [8]:
# The row index and the column labels are both accessible as attributes.

print(df.index, df.columns, sep = "\n")

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [9]:
# Transpose: rows become columns and columns become rows.

print(df.transpose())

   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A   -0.688920    0.024111   -1.350958   -0.861917    1.007735    1.684444
B    0.706461    2.587698    0.550008   -0.803071    0.759993    2.478518
C    0.038464    1.121242    0.007322    0.453286   -1.356845   -0.120551
D   -0.557300    0.468241    0.723728    1.802179    0.107955   -1.314997


In [10]:
# Convert to a NumPy array; the printed array carries the values only,
# without the index or column labels.

print( df.to_numpy() )

[[-0.68891986  0.70646124  0.03846365 -0.55730041]
 [ 0.02411094  2.58769818  1.12124162  0.46824078]
 [-1.35095843  0.55000789  0.00732162  0.72372777]
 [-0.86191674 -0.80307105  0.45328616  1.80217899]
 [ 1.00773544  0.75999268 -1.3568453   0.1079546 ]
 [ 1.68444441  2.47851801 -0.12055138 -1.31499682]]


In [11]:
# Descriptive statistics per column: count, mean, std, min, quartiles, max.

print( df.describe() )

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.030917  1.046601  0.023819  0.204967
std    1.173911  1.287719  0.814413  1.076129
min   -1.350958 -0.803071 -1.356845 -1.314997
25%   -0.818668  0.589121 -0.088583 -0.390987
50%   -0.332404  0.733227  0.022893  0.288098
75%    0.761829  2.048887  0.349581  0.659856
max    1.684444  2.587698  1.121242  1.802179


In [12]:
# Sorting data.

# By index label, in descending order (latest date first).

print(df.sort_index(axis = 0, ascending = False))

                   A         B         C         D
2013-01-06  1.684444  2.478518 -0.120551 -1.314997
2013-01-05  1.007735  0.759993 -1.356845  0.107955
2013-01-04 -0.861917 -0.803071  0.453286  1.802179
2013-01-03 -1.350958  0.550008  0.007322  0.723728
2013-01-02  0.024111  2.587698  1.121242  0.468241
2013-01-01 -0.688920  0.706461  0.038464 -0.557300


In [13]:
# By the values of one column (ascending).
print(df.sort_values("B"))

                   A         B         C         D
2013-01-04 -0.861917 -0.803071  0.453286  1.802179
2013-01-03 -1.350958  0.550008  0.007322  0.723728
2013-01-01 -0.688920  0.706461  0.038464 -0.557300
2013-01-05  1.007735  0.759993 -1.356845  0.107955
2013-01-06  1.684444  2.478518 -0.120551 -1.314997
2013-01-02  0.024111  2.587698  1.121242  0.468241


In [14]:
# Select a single column; the result is a Series.

print(df.loc[:, "A"])

2013-01-01   -0.688920
2013-01-02    0.024111
2013-01-03   -1.350958
2013-01-04   -0.861917
2013-01-05    1.007735
2013-01-06    1.684444
Freq: D, Name: A, dtype: float64


In [15]:
# Select one row by its index label (here, the first label of the index).

print( df.loc[df.index[0]] )

A   -0.688920
B    0.706461
C    0.038464
D   -0.557300
Name: 2013-01-01 00:00:00, dtype: float64


In [16]:
# Select one row by its integer position.

print( df.iloc[0] )

A   -0.688920
B    0.706461
C    0.038464
D   -0.557300
Name: 2013-01-01 00:00:00, dtype: float64


In [17]:
# Select a single value by row label and column label.

print(df.at[df.index[2], "B"])

0.5500078887549639


In [18]:
# Select a single value by row position and column position.

print(df.iat[2, 1])

0.5500078887549639


In [19]:
# Select a sub-section of the DataFrame by position:
# rows 2-3 and columns 1-2 (the end of each slice is excluded).

df_2 = df.iloc[2:4, 1:3]
print( df_2 )

                   B         C
2013-01-03  0.550008  0.007322
2013-01-04 -0.803071  0.453286


In [20]:
# All columns of the rows at positions 2 and 3.
# The explicit .copy() makes df_2 independent of df. Unlike NumPy slicing,
# pandas does not guarantee whether a selection is a view or a copy, so
# code should not rely on view semantics.
df_2 = df.iloc[2:4, :].copy()
print( df_2 )

                   A         B         C         D
2013-01-03 -1.350958  0.550008  0.007322  0.723728
2013-01-04 -0.861917 -0.803071  0.453286  1.802179


In [21]:
# Searching by condition.

# On one column: keep the rows where column A is positive.
print(df.loc[df["A"] > 0])

                   A         B         C         D
2013-01-02  0.024111  2.587698  1.121242  0.468241
2013-01-05  1.007735  0.759993 -1.356845  0.107955
2013-01-06  1.684444  2.478518 -0.120551 -1.314997


In [22]:
# On the whole DataFrame: entries that fail the condition become NaN.

print(df.where(df > 0))

                   A         B         C         D
2013-01-01       NaN  0.706461  0.038464       NaN
2013-01-02  0.024111  2.587698  1.121242  0.468241
2013-01-03       NaN  0.550008  0.007322  0.723728
2013-01-04       NaN       NaN  0.453286  1.802179
2013-01-05  1.007735  0.759993       NaN  0.107955
2013-01-06  1.684444  2.478518       NaN       NaN


In [23]:
# Add a column.
# Within one column every element shares the same type,
# but different columns may hold different types.

df["E"] = ["uno", "uno", "dos", "tres", "cuatro", "tres"]

print(df)

                   A         B         C         D       E
2013-01-01 -0.688920  0.706461  0.038464 -0.557300     uno
2013-01-02  0.024111  2.587698  1.121242  0.468241     uno
2013-01-03 -1.350958  0.550008  0.007322  0.723728     dos
2013-01-04 -0.861917 -0.803071  0.453286  1.802179    tres
2013-01-05  1.007735  0.759993 -1.356845  0.107955  cuatro
2013-01-06  1.684444  2.478518 -0.120551 -1.314997    tres


In [24]:
# New column from a NumPy array: six evenly spaced values from 1 to 5.

df["F"] = np.linspace(start = 1, stop = 5, num = 6)
print(df.head(n = 5))

                   A         B         C         D       E    F
2013-01-01 -0.688920  0.706461  0.038464 -0.557300     uno  1.0
2013-01-02  0.024111  2.587698  1.121242  0.468241     uno  1.8
2013-01-03 -1.350958  0.550008  0.007322  0.723728     dos  2.6
2013-01-04 -0.861917 -0.803071  0.453286  1.802179    tres  3.4
2013-01-05  1.007735  0.759993 -1.356845  0.107955  cuatro  4.2


In [25]:
# Look up specific values with isin(): keep rows whose E is in the list.

print(df.loc[df["E"].isin(["dos", "cuatro"])])

                   A         B         C         D       E    F
2013-01-03 -1.350958  0.550008  0.007322  0.723728     dos  2.6
2013-01-05  1.007735  0.759993 -1.356845  0.107955  cuatro  4.2


In [26]:
# A single value can be overwritten in place
# (row position 0, column position 1, i.e. column B).

df.iloc[0, 1] = 10
print(df)

                   A          B         C         D       E    F
2013-01-01 -0.688920  10.000000  0.038464 -0.557300     uno  1.0
2013-01-02  0.024111   2.587698  1.121242  0.468241     uno  1.8
2013-01-03 -1.350958   0.550008  0.007322  0.723728     dos  2.6
2013-01-04 -0.861917  -0.803071  0.453286  1.802179    tres  3.4
2013-01-05  1.007735   0.759993 -1.356845  0.107955  cuatro  4.2
2013-01-06  1.684444   2.478518 -0.120551 -1.314997    tres  5.0


In [27]:
# Remove a column; drop() returns a new DataFrame, which is rebound to df.

df = df.drop(columns = "E")
print(df)

                   A          B         C         D    F
2013-01-01 -0.688920  10.000000  0.038464 -0.557300  1.0
2013-01-02  0.024111   2.587698  1.121242  0.468241  1.8
2013-01-03 -1.350958   0.550008  0.007322  0.723728  2.6
2013-01-04 -0.861917  -0.803071  0.453286  1.802179  3.4
2013-01-05  1.007735   0.759993 -1.356845  0.107955  4.2
2013-01-06  1.684444   2.478518 -0.120551 -1.314997  5.0


In [28]:
# Assign through a boolean condition: every negative entry becomes 0.

df[df<0] = 0
print(df)

                   A          B         C         D    F
2013-01-01  0.000000  10.000000  0.038464  0.000000  1.0
2013-01-02  0.024111   2.587698  1.121242  0.468241  1.8
2013-01-03  0.000000   0.550008  0.007322  0.723728  2.6
2013-01-04  0.000000   0.000000  0.453286  1.802179  3.4
2013-01-05  1.007735   0.759993  0.000000  0.107955  4.2
2013-01-06  1.684444   2.478518  0.000000  0.000000  5.0


In [29]:
# Introduce some missing values: every entry greater than 2 becomes NaN.

df[df>2] = np.nan
print(df)

                   A         B         C         D    F
2013-01-01  0.000000       NaN  0.038464  0.000000  1.0
2013-01-02  0.024111       NaN  1.121242  0.468241  1.8
2013-01-03  0.000000  0.550008  0.007322  0.723728  NaN
2013-01-04  0.000000  0.000000  0.453286  1.802179  NaN
2013-01-05  1.007735  0.759993  0.000000  0.107955  NaN
2013-01-06  1.684444       NaN  0.000000  0.000000  NaN


In [30]:
# Drop every row that contains at least one missing value.
print(df.dropna(axis = 0, how = "any"))

# Replace the missing values with a fill value.
print(df.fillna(value = -100))

# Boolean mask that is True where a value is missing.
print(df.isnull())

Empty DataFrame
Columns: [A, B, C, D, F]
Index: []
                   A           B         C         D      F
2013-01-01  0.000000 -100.000000  0.038464  0.000000    1.0
2013-01-02  0.024111 -100.000000  1.121242  0.468241    1.8
2013-01-03  0.000000    0.550008  0.007322  0.723728 -100.0
2013-01-04  0.000000    0.000000  0.453286  1.802179 -100.0
2013-01-05  1.007735    0.759993  0.000000  0.107955 -100.0
2013-01-06  1.684444 -100.000000  0.000000  0.000000 -100.0
                A      B      C      D      F
2013-01-01  False   True  False  False  False
2013-01-02  False   True  False  False  False
2013-01-03  False  False  False  False   True
2013-01-04  False  False  False  False   True
2013-01-05  False  False  False  False   True
2013-01-06  False   True  False  False   True


In [31]:
# Operations.

# Element-wise sum of two columns, stored as a new column.
df["G"] = df["A"].add(df["C"])

print(df)

# Mean of every column, then mean of every row.
print(df.mean(axis = 0))
print(df.mean(axis = 1))

                   A         B         C         D    F         G
2013-01-01  0.000000       NaN  0.038464  0.000000  1.0  0.038464
2013-01-02  0.024111       NaN  1.121242  0.468241  1.8  1.145353
2013-01-03  0.000000  0.550008  0.007322  0.723728  NaN  0.007322
2013-01-04  0.000000  0.000000  0.453286  1.802179  NaN  0.453286
2013-01-05  1.007735  0.759993  0.000000  0.107955  NaN  1.007735
2013-01-06  1.684444       NaN  0.000000  0.000000  NaN  1.684444
A    0.452715
B    0.436667
C    0.270052
D    0.517017
F    1.400000
G    0.722767
dtype: float64
2013-01-01    0.215385
2013-01-02    0.911789
2013-01-03    0.257676
2013-01-04    0.541750
2013-01-05    0.576684
2013-01-06    0.842222
Freq: D, dtype: float64


In [32]:
# Cumulative sum down every column.
# DataFrame.cumsum() is the built-in for this, so there is no need to pass
# np.cumsum through apply().
df.cumsum()

Unnamed: 0,A,B,C,D,F,G
2013-01-01,0.0,,0.038464,0.0,1.0,0.038464
2013-01-02,0.024111,,1.159705,0.468241,2.8,1.183816
2013-01-03,0.024111,0.550008,1.167027,1.191969,,1.191138
2013-01-04,0.024111,0.550008,1.620313,2.994148,,1.644424
2013-01-05,1.031846,1.310001,1.620313,3.102102,,2.652159
2013-01-06,2.716291,,1.620313,3.102102,,4.336604


In [33]:
# Value frequencies (a histogram) can be obtained from a Series.
# Sample data: ten random integers between 0 and 6 inclusive.

s = pd.Series(data = np.random.randint(low = 0, high = 7, size = 10))
print(s)


0    1
1    4
2    1
3    1
4    6
5    6
6    0
7    3
8    5
9    5
dtype: int64


In [34]:
# Count how many times each distinct value occurs.
print(s.value_counts())

1    3
6    2
5    2
4    1
0    1
3    1
dtype: int64


In [35]:
# Grouping data.

# 200 consecutive days of random values in four columns.
fechas = pd.date_range(start = "20130101", periods = 200, freq = "D")
df = pd.DataFrame(
    data = np.random.randn(200, 4),
    index = fechas,
    columns = ["A", "B", "C", "D"],
)

print(df)

                   A         B         C         D
2013-01-01 -0.236662  0.550306 -2.634410  0.671860
2013-01-02 -0.031187 -0.081049 -1.256877 -1.134545
2013-01-03  1.101908 -0.852672 -0.028781  0.402657
2013-01-04 -0.938619  1.108729 -1.428704 -0.321900
2013-01-05  1.889525  1.225235 -1.942665 -0.986916
...              ...       ...       ...       ...
2013-07-15  1.077508  0.737979  1.250247 -1.219586
2013-07-16 -0.854513  1.749456 -0.863483  0.199379
2013-07-17 -0.453848  0.161105 -0.332168 -0.046481
2013-07-18  1.747913 -1.275610  1.675633  0.669138
2013-07-19  0.507014 -0.384201  0.104155 -0.408654

[200 rows x 4 columns]


In [36]:
# Group the rows by the month of their date and add up each column.
df.groupby(by = df.index.month).sum()

Unnamed: 0,A,B,C,D
1,6.151396,3.865367,-6.672204,5.396473
2,-2.838412,4.191532,2.110218,-2.150074
3,5.743556,-6.034221,0.301843,-8.499961
4,4.611471,2.748181,4.848537,6.928591
5,4.257319,3.746627,0.833503,0.241398
6,-4.223978,5.506082,8.727856,3.162316
7,-3.358485,-2.444933,2.767849,0.238474


In [37]:
# Group the rows by day of the month and average each column.
df.groupby(df.index.day).mean()

Unnamed: 0,A,B,C,D
1,-0.182482,-0.20509,0.246542,0.5237
2,-0.61086,0.052774,0.09521,-0.172404
3,-0.16743,0.390377,-0.106098,0.546969
4,0.082571,0.107658,-0.781847,-0.049781
5,0.602043,-0.046503,-0.033703,-0.179123
6,0.775882,-0.093834,0.355353,-0.128347
7,0.004927,-0.117809,0.550041,0.052623
8,-0.735894,-0.069245,0.241292,-0.136868
9,0.714511,-0.303218,0.239807,0.053803
10,-0.466276,0.200722,0.058272,-0.130807


In [38]:
# Dates are handled flexibly, which is convenient for time series.

# 200 timestamps one second apart. The lowercase alias "s" is used because
# the uppercase "S" spelling is deprecated in recent pandas releases.
rango = pd.date_range("1/1/2012", periods = 200, freq = "s")
# One random integer in [0, 500) per timestamp.
ts = pd.Series(np.random.randint( 0, 500, len(rango) ), index = rango)

print(ts)

2012-01-01 00:00:00    317
2012-01-01 00:00:01    139
2012-01-01 00:00:02     23
2012-01-01 00:00:03    293
2012-01-01 00:00:04     94
                      ... 
2012-01-01 00:03:15      5
2012-01-01 00:03:16     27
2012-01-01 00:03:17    370
2012-01-01 00:03:18    494
2012-01-01 00:03:19    341
Freq: S, Length: 200, dtype: int64


In [39]:
# Resample: add up the per-second values that fall in each one-minute bin.
# "min" is the documented offset alias for minutes.

print( ts.resample("1min").sum() )

2012-01-01 00:00:00    14728
2012-01-01 00:01:00    14415
2012-01-01 00:02:00    16713
2012-01-01 00:03:00     4854
Freq: T, dtype: int64


In [40]:
# Reading and writing files is straightforward with the CSV format,
# which Excel can also open.

# Write: name the series and its index first; these names show up as the
# column headers when the file is read back.
ts.name = "Datos"
ts.index.name = "Timestamp"
ts.to_csv("time_series.csv")


In [41]:
# Read the file back, using the "Timestamp" column as the index.
# The result is a one-column DataFrame whose index labels are plain strings.

ts_2 = pd.read_csv("time_series.csv", index_col = "Timestamp")

print(ts_2)

                     Datos
Timestamp                 
2012-01-01 00:00:00    317
2012-01-01 00:00:01    139
2012-01-01 00:00:02     23
2012-01-01 00:00:03    293
2012-01-01 00:00:04     94
...                    ...
2012-01-01 00:03:15      5
2012-01-01 00:03:16     27
2012-01-01 00:03:17    370
2012-01-01 00:03:18    494
2012-01-01 00:03:19    341

[200 rows x 1 columns]


In [42]:
# The index read from the CSV holds strings (dtype object), not timestamps.
print( ts_2.index )

Index(['2012-01-01 00:00:00', '2012-01-01 00:00:01', '2012-01-01 00:00:02',
       '2012-01-01 00:00:03', '2012-01-01 00:00:04', '2012-01-01 00:00:05',
       '2012-01-01 00:00:06', '2012-01-01 00:00:07', '2012-01-01 00:00:08',
       '2012-01-01 00:00:09',
       ...
       '2012-01-01 00:03:10', '2012-01-01 00:03:11', '2012-01-01 00:03:12',
       '2012-01-01 00:03:13', '2012-01-01 00:03:14', '2012-01-01 00:03:15',
       '2012-01-01 00:03:16', '2012-01-01 00:03:17', '2012-01-01 00:03:18',
       '2012-01-01 00:03:19'],
      dtype='object', name='Timestamp', length=200)


In [43]:
# Parse the string labels into timestamps, which yields a DatetimeIndex.
ts_2.index = pd.to_datetime( ts_2.index )

print(ts_2.index)

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               ...
               '2012-01-01 00:03:10', '2012-01-01 00:03:11',
               '2012-01-01 00:03:12', '2012-01-01 00:03:13',
               '2012-01-01 00:03:14', '2012-01-01 00:03:15',
               '2012-01-01 00:03:16', '2012-01-01 00:03:17',
               '2012-01-01 00:03:18', '2012-01-01 00:03:19'],
              dtype='datetime64[ns]', name='Timestamp', length=200, freq=None)


In [44]:
# Correlation and covariance.

# Random DataFrame: 1000 rows in two columns.
df = pd.DataFrame(
    data = np.random.randn(1000, 2),
    columns = ["Col_1", "Col_2"],
)

# Each table is followed by one blank line.
print(df.head(), end = "\n\n")

# Correlation matrix.
print(df.corr(), end = "\n\n")

# Covariance matrix.
print(df.cov(), end = "\n\n")

      Col_1     Col_2
0  0.664954  0.121131
1 -0.573261  0.600070
2  0.343497  0.512212
3  0.586276 -0.448339
4  1.929466  0.386596

          Col_1     Col_2
Col_1  1.000000  0.016645
Col_2  0.016645  1.000000

          Col_1     Col_2
Col_1  1.056700  0.016504
Col_2  0.016504  0.930372



In [51]:
# Concatenate DataFrames row-wise (one stacked below the other).

# Two random DataFrames with the same columns.
df_1 = pd.DataFrame(np.random.randn(100,2), columns = ["Col_1", "Col_2"])
df_2 = pd.DataFrame(np.random.randn(100,2), columns = ["Col_1", "Col_2"])

print(df_1.head())
print()
print(df_2.head())
print()

# ignore_index = True discards the two original 0..99 indexes and numbers
# the result 0..199, so a separate reset_index(drop = True) is not needed.
df = pd.concat([df_1, df_2], axis = 0, ignore_index = True)

print(df)

      Col_1     Col_2
0 -1.516916 -0.383068
1  0.765016 -0.258222
2  0.393079  0.301042
3  1.090665 -0.262019
4 -0.679981 -0.723984

      Col_1     Col_2
0 -0.615278  0.847995
1  0.091538  0.136089
2 -0.019375 -0.174542
3 -0.351426  0.100549
4  0.930820  0.947435

        Col_1     Col_2
0   -1.516916 -0.383068
1    0.765016 -0.258222
2    0.393079  0.301042
3    1.090665 -0.262019
4   -0.679981 -0.723984
..        ...       ...
195  0.311306  0.782338
196  2.283423 -1.018743
197 -0.586801  1.134172
198 -1.741755  1.216342
199  1.203148 -0.254748

[200 rows x 2 columns]


In [52]:
# Concatenate DataFrames column-wise (side by side).

# Two random DataFrames with different column names.
df_1 = pd.DataFrame(data = np.random.randn(100, 2), columns = ["Col_1", "Col_2"])
df_2 = pd.DataFrame(data = np.random.randn(100, 2), columns = ["Col_3", "Col_4"])

# Each preview is followed by one blank line.
print(df_1.head(), end = "\n\n")
print(df_2.head(), end = "\n\n")

# Rows are matched on the index; the columns of both frames are kept.
df = pd.concat([df_1, df_2], axis = "columns")

print(df)

      Col_1     Col_2
0 -0.047523 -1.654938
1 -1.057579 -0.166459
2  0.850526 -0.010457
3 -0.961197  1.165588
4 -0.019126 -0.642064

      Col_3     Col_4
0  0.678249  0.240590
1  0.485131 -0.217273
2  1.424911 -0.092623
3 -0.237672 -0.405159
4 -2.015896  0.817554

       Col_1     Col_2     Col_3     Col_4
0  -0.047523 -1.654938  0.678249  0.240590
1  -1.057579 -0.166459  0.485131 -0.217273
2   0.850526 -0.010457  1.424911 -0.092623
3  -0.961197  1.165588 -0.237672 -0.405159
4  -0.019126 -0.642064 -2.015896  0.817554
..       ...       ...       ...       ...
95 -1.076402 -1.029765  1.329433  0.693936
96 -0.727035 -0.824210 -1.585571  0.337386
97 -0.137586 -1.190115  0.616823  1.070981
98  0.643492 -0.113891  0.285760 -0.870517
99 -0.218098  1.349117 -0.861003  0.599605

[100 rows x 4 columns]


In [46]:
# Cargar el archivo Electrico_residencial_2010_2017
# Selecciona las entradas correspondientes a Baja California.
# Ordena los datos de menor a mayor consumo en 2010.
# Selecciona los municipios con una cantidad de usuarios en 2012 menor a 100
# Encuentra el municipio con mayor consumo en tarifa DAC.
# Selecciona solo la tarifa DAC.
# Encuentra la correlación entre el consumo de 2012 y 2013