In [76]:
!pip install pandas



In [77]:
# pandas is the Excel of Python.
# Adapted from: https://pandas.pydata.org/docs/user_guide/10min.html

import pandas as pd
import numpy as np

In [78]:
# A single pandas column is called a Series.
# np.nan marks a missing value.
# Every element is paired with an index label.
valores = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(valores)
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [79]:
# The index can also be made of dates: six consecutive days
# starting on 2013-01-01.
fechas = pd.date_range(start="20130101", periods=6, freq="D")
print(fechas)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


In [80]:
# Rebuild the Series, this time labelled by the dates and given a name.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8], index=fechas, name="datos")
print(s)

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, Name: datos, dtype: float64


In [81]:
# A DataFrame is a collection of columns.
# Here it is built from a NumPy array of standard-normal draws
# (6 rows x 4 columns), indexed by the dates. No seed is set in this
# cell, so the values change on every run.
nombres_columnas = ["A", "B", "C", "D"]
valores_aleatorios = np.random.randn(6, 4)
df = pd.DataFrame(valores_aleatorios, index=fechas, columns=nombres_columnas)
print(df)

                   A         B         C         D
2013-01-01  0.405006  0.748404  0.244871 -1.181110
2013-01-02 -1.817821 -0.288873 -0.719807  0.170888
2013-01-03  1.075171 -0.969841 -0.120263  1.199907
2013-01-04  0.393199 -0.174549 -0.809589  1.432332
2013-01-05  0.210182  0.855749 -0.382265  0.634273
2013-01-06 -1.629181  0.510364  1.159254  0.563412


In [82]:
# head() returns the top rows of the DataFrame and tail() the bottom rows.
primeras = df.head()
ultimas = df.tail()
# The empty string between them prints the blank separator line.
print(primeras, "", ultimas, sep="\n")

                   A         B         C         D
2013-01-01  0.405006  0.748404  0.244871 -1.181110
2013-01-02 -1.817821 -0.288873 -0.719807  0.170888
2013-01-03  1.075171 -0.969841 -0.120263  1.199907
2013-01-04  0.393199 -0.174549 -0.809589  1.432332
2013-01-05  0.210182  0.855749 -0.382265  0.634273

                   A         B         C         D
2013-01-02 -1.817821 -0.288873 -0.719807  0.170888
2013-01-03  1.075171 -0.969841 -0.120263  1.199907
2013-01-04  0.393199 -0.174549 -0.809589  1.432332
2013-01-05  0.210182  0.855749 -0.382265  0.634273
2013-01-06 -1.629181  0.510364  1.159254  0.563412


In [83]:
# Both the row labels (index) and the column labels are accessible.
etiquetas_filas = df.index
etiquetas_columnas = df.columns
print(etiquetas_filas)
print()
print(etiquetas_columnas)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

Index(['A', 'B', 'C', 'D'], dtype='object')


In [84]:
# Transpose: rows become columns and vice versa.
traspuesta = df.T
print(traspuesta)

   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A    0.405006   -1.817821    1.075171    0.393199    0.210182   -1.629181
B    0.748404   -0.288873   -0.969841   -0.174549    0.855749    0.510364
C    0.244871   -0.719807   -0.120263   -0.809589   -0.382265    1.159254
D   -1.181110    0.170888    1.199907    1.432332    0.634273    0.563412


In [85]:
# Convert the DataFrame values to a NumPy array (labels are dropped).
arreglo = df.to_numpy()
print(arreglo)

[[ 0.40500581  0.748404    0.24487114 -1.18111028]
 [-1.81782124 -0.28887308 -0.71980652  0.17088785]
 [ 1.0751714  -0.9698415  -0.12026286  1.19990699]
 [ 0.39319856 -0.17454856 -0.80958924  1.43233223]
 [ 0.21018161  0.85574876 -0.38226455  0.63427349]
 [-1.62918135  0.51036378  1.15925417  0.56341246]]


In [86]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
resumen = df.describe()
print(resumen)

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.227241  0.113542 -0.104633  0.469950
std    1.197304  0.711343  0.731066  0.928460
min   -1.817821 -0.969841 -0.809589 -1.181110
25%   -1.169341 -0.260292 -0.635421  0.269019
50%    0.301690  0.167908 -0.251264  0.598843
75%    0.402054  0.688894  0.153588  1.058499
max    1.075171  0.855749  1.159254  1.432332


In [87]:
# Sorting data.

# By index, newest date first. sort_index returns a new frame;
# df itself keeps its original order.
por_indice_desc = df.sort_index(axis=0, ascending=False)
print(por_indice_desc)

                   A         B         C         D
2013-01-06 -1.629181  0.510364  1.159254  0.563412
2013-01-05  0.210182  0.855749 -0.382265  0.634273
2013-01-04  0.393199 -0.174549 -0.809589  1.432332
2013-01-03  1.075171 -0.969841 -0.120263  1.199907
2013-01-02 -1.817821 -0.288873 -0.719807  0.170888
2013-01-01  0.405006  0.748404  0.244871 -1.181110


In [88]:
# By the values of one column (B), smallest first.
por_columna_b = df.sort_values("B", ascending=True)
print(por_columna_b)

                   A         B         C         D
2013-01-03  1.075171 -0.969841 -0.120263  1.199907
2013-01-02 -1.817821 -0.288873 -0.719807  0.170888
2013-01-04  0.393199 -0.174549 -0.809589  1.432332
2013-01-06 -1.629181  0.510364  1.159254  0.563412
2013-01-01  0.405006  0.748404  0.244871 -1.181110
2013-01-05  0.210182  0.855749 -0.382265  0.634273


In [89]:
# Select a single column; the result is a Series named after the column.
columna_a = df["A"]
print(columna_a)

2013-01-01    0.405006
2013-01-02   -1.817821
2013-01-03    1.075171
2013-01-04    0.393199
2013-01-05    0.210182
2013-01-06   -1.629181
Freq: D, Name: A, dtype: float64


In [90]:
# Select a row by its index label (.loc is label-based).
primera_etiqueta = df.index[0]
print(primera_etiqueta)
print(df.loc[primera_etiqueta])

2013-01-01 00:00:00
A    0.405006
B    0.748404
C    0.244871
D   -1.181110
Name: 2013-01-01 00:00:00, dtype: float64


In [91]:
# Select a row by its integer position (.iloc is position-based).
primera_fila = df.iloc[0]
print(primera_fila)

A    0.405006
B    0.748404
C    0.244871
D   -1.181110
Name: 2013-01-01 00:00:00, dtype: float64


In [92]:
# Select a single value by row label and column name.
etiqueta_fila = df.index[2]
print(df.loc[etiqueta_fila, "B"])

-0.9698414969728085


In [93]:
# Select a single value by row position and column position.
# Position (2, 1) is the third row of column B.
posicion = (2, 1)
print(df.iloc[posicion])

-0.9698414969728085


In [94]:
# Select a rectangular sub-section of the DataFrame:
# rows at positions 2-3 and columns at positions 1-2 (end is exclusive).
filas = slice(2, 4)
columnas = slice(1, 3)
df_2 = df.iloc[filas, columnas]
print(df_2)

                   B         C
2013-01-03 -0.969841 -0.120263
2013-01-04 -0.174549 -0.809589


In [95]:
# Rows at positions 2-3, keeping every column.
df_2 = df.iloc[2:4]
print(df_2)

# NOTE(review): the original remark here said copies and views work the
# same as in NumPy. pandas does not guarantee NumPy's view semantics for
# a slice like this one; whether df_2 shares memory with df depends on the
# pandas version and on Copy-on-Write. Use df.iloc[2:4].copy() when an
# independent frame is required.

                   A         B         C         D
2013-01-03  1.075171 -0.969841 -0.120263  1.199907
2013-01-04  0.393199 -0.174549 -0.809589  1.432332


In [96]:
# Filtering by condition.

# On one column: keep the rows where A is strictly positive.
a_positivo = df["A"] > 0
print(df[a_positivo])

                   A         B         C         D
2013-01-01  0.405006  0.748404  0.244871 -1.181110
2013-01-03  1.075171 -0.969841 -0.120263  1.199907
2013-01-04  0.393199 -0.174549 -0.809589  1.432332
2013-01-05  0.210182  0.855749 -0.382265  0.634273


In [97]:
# On the whole DataFrame: values that fail the condition become NaN,
# the shape is preserved.
solo_positivos = df.where(df > 0)
print(solo_positivos)

                   A         B         C         D
2013-01-01  0.405006  0.748404  0.244871       NaN
2013-01-02       NaN       NaN       NaN  0.170888
2013-01-03  1.075171       NaN       NaN  1.199907
2013-01-04  0.393199       NaN       NaN  1.432332
2013-01-05  0.210182  0.855749       NaN  0.634273
2013-01-06       NaN  0.510364  1.159254  0.563412


In [98]:
# Adding columns.
# All elements within one column share the same type;
# different columns may hold different types.
etiquetas_texto = ["uno", "uno", "dos", "tres", "cuatro", "tres"]
df["E"] = etiquetas_texto

print(df)

                   A         B         C         D       E
2013-01-01  0.405006  0.748404  0.244871 -1.181110     uno
2013-01-02 -1.817821 -0.288873 -0.719807  0.170888     uno
2013-01-03  1.075171 -0.969841 -0.120263  1.199907     dos
2013-01-04  0.393199 -0.174549 -0.809589  1.432332    tres
2013-01-05  0.210182  0.855749 -0.382265  0.634273  cuatro
2013-01-06 -1.629181  0.510364  1.159254  0.563412    tres


In [99]:
# Look up a specific value inside a column.
es_uno = df["E"] == "uno"
df[es_uno]

Unnamed: 0,A,B,C,D,E
2013-01-01,0.405006,0.748404,0.244871,-1.18111,uno
2013-01-02,-1.817821,-0.288873,-0.719807,0.170888,uno


In [100]:
# New column built from a NumPy array: six evenly spaced values from 1 to 5.
df["F"] = np.linspace(start=1, stop=5, num=6)
print(df.head())

                   A         B         C         D       E    F
2013-01-01  0.405006  0.748404  0.244871 -1.181110     uno  1.0
2013-01-02 -1.817821 -0.288873 -0.719807  0.170888     uno  1.8
2013-01-03  1.075171 -0.969841 -0.120263  1.199907     dos  2.6
2013-01-04  0.393199 -0.174549 -0.809589  1.432332    tres  3.4
2013-01-05  0.210182  0.855749 -0.382265  0.634273  cuatro  4.2


In [101]:
# Look up several specific values at once with isin().
buscados = ["dos", "cuatro"]
coincide = df["E"].isin(buscados)
print(df[coincide])

                   A         B         C         D       E    F
2013-01-03  1.075171 -0.969841 -0.120263  1.199907     dos  2.6
2013-01-05  0.210182  0.855749 -0.382265  0.634273  cuatro  4.2


In [102]:
# isin() also works on numeric columns: rows where F is 1 or 5.
extremos = [1, 5]
df[df["F"].isin(extremos)]

Unnamed: 0,A,B,C,D,E,F
2013-01-01,0.405006,0.748404,0.244871,-1.18111,uno,1.0
2013-01-06,-1.629181,0.510364,1.159254,0.563412,tres,5.0


In [103]:
# A single value can be overwritten in place.
# Position (0, 1) is the first row of column B.
fila, columna = 0, 1
df.iloc[fila, columna] = 10
print(df)

                   A          B         C         D       E    F
2013-01-01  0.405006  10.000000  0.244871 -1.181110     uno  1.0
2013-01-02 -1.817821  -0.288873 -0.719807  0.170888     uno  1.8
2013-01-03  1.075171  -0.969841 -0.120263  1.199907     dos  2.6
2013-01-04  0.393199  -0.174549 -0.809589  1.432332    tres  3.4
2013-01-05  0.210182   0.855749 -0.382265  0.634273  cuatro  4.2
2013-01-06 -1.629181   0.510364  1.159254  0.563412    tres  5.0


In [104]:
# Remove a column. drop() returns a new frame, which is bound back to df,
# so running this cell a second time fails because E is already gone.
df = df.drop(columns="E")
print(df)

                   A          B         C         D    F
2013-01-01  0.405006  10.000000  0.244871 -1.181110  1.0
2013-01-02 -1.817821  -0.288873 -0.719807  0.170888  1.8
2013-01-03  1.075171  -0.969841 -0.120263  1.199907  2.6
2013-01-04  0.393199  -0.174549 -0.809589  1.432332  3.4
2013-01-05  0.210182   0.855749 -0.382265  0.634273  4.2
2013-01-06 -1.629181   0.510364  1.159254  0.563412  5.0


In [105]:
# Remove a row: the one at position 3, addressed through its label.
# Re-running this cell removes yet another row, because df is rebound.
etiqueta_a_quitar = df.index[3]
df = df.drop(index=etiqueta_a_quitar)

df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.405006,10.0,0.244871,-1.18111,1.0
2013-01-02,-1.817821,-0.288873,-0.719807,0.170888,1.8
2013-01-03,1.075171,-0.969841,-0.120263,1.199907,2.6
2013-01-05,0.210182,0.855749,-0.382265,0.634273,4.2
2013-01-06,-1.629181,0.510364,1.159254,0.563412,5.0


In [106]:
# Assign values through a condition: every negative entry becomes 0.
es_negativo = df < 0
df[es_negativo] = 0
print(df)

                   A          B         C         D    F
2013-01-01  0.405006  10.000000  0.244871  0.000000  1.0
2013-01-02  0.000000   0.000000  0.000000  0.170888  1.8
2013-01-03  1.075171   0.000000  0.000000  1.199907  2.6
2013-01-05  0.210182   0.855749  0.000000  0.634273  4.2
2013-01-06  0.000000   0.510364  1.159254  0.563412  5.0


In [107]:
# Introduce some missing values: every entry greater than 2 becomes NaN.
mayor_que_dos = df > 2
df[mayor_que_dos] = np.nan
print(df)

                   A         B         C         D    F
2013-01-01  0.405006       NaN  0.244871  0.000000  1.0
2013-01-02  0.000000  0.000000  0.000000  0.170888  1.8
2013-01-03  1.075171  0.000000  0.000000  1.199907  NaN
2013-01-05  0.210182  0.855749  0.000000  0.634273  NaN
2013-01-06  0.000000  0.510364  1.159254  0.563412  NaN


In [108]:
# None of these calls is assigned back, so df itself is left untouched.

# Drop every row that contains a missing value.
print(df.dropna(), end="\n\n")

# Fill missing values with a placeholder.
print(df.fillna(-100), end="\n\n")

# Boolean mask marking where values are missing.
print(df.isna())

              A    B    C         D    F
2013-01-02  0.0  0.0  0.0  0.170888  1.8

                   A           B         C         D      F
2013-01-01  0.405006 -100.000000  0.244871  0.000000    1.0
2013-01-02  0.000000    0.000000  0.000000  0.170888    1.8
2013-01-03  1.075171    0.000000  0.000000  1.199907 -100.0
2013-01-05  0.210182    0.855749  0.000000  0.634273 -100.0
2013-01-06  0.000000    0.510364  1.159254  0.563412 -100.0

                A      B      C      D      F
2013-01-01  False   True  False  False  False
2013-01-02  False  False  False  False  False
2013-01-03  False  False  False  False   True
2013-01-05  False  False  False  False   True
2013-01-06  False  False  False  False   True


In [109]:
# Operations.

# Between columns: G is the element-wise sum of A and C.
suma_a_c = df["A"] + df["C"]
df["G"] = suma_a_c

print(df, end="\n\n")

# Mean of each column (axis=0), then mean of each row (axis=1).
print(df.mean(axis=0), end="\n\n")
print(df.mean(axis=1))

                   A         B         C         D    F         G
2013-01-01  0.405006       NaN  0.244871  0.000000  1.0  0.649877
2013-01-02  0.000000  0.000000  0.000000  0.170888  1.8  0.000000
2013-01-03  1.075171  0.000000  0.000000  1.199907  NaN  1.075171
2013-01-05  0.210182  0.855749  0.000000  0.634273  NaN  0.210182
2013-01-06  0.000000  0.510364  1.159254  0.563412  NaN  1.159254

A    0.338072
B    0.341528
C    0.280825
D    0.513696
F    1.400000
G    0.618897
dtype: float64

2013-01-01    0.459951
2013-01-02    0.328481
2013-01-03    0.670050
2013-01-05    0.382077
2013-01-06    0.678457
dtype: float64


In [110]:
# Apply a function to every column (axis=0).
# np.cumsum() computes a running (cumulative) sum.
df.apply(np.cumsum, axis=0)

Unnamed: 0,A,B,C,D,F,G
2013-01-01,0.405006,,0.244871,0.0,1.0,0.649877
2013-01-02,0.405006,0.0,0.244871,0.170888,2.8,0.649877
2013-01-03,1.480177,0.0,0.244871,1.370795,,1.725048
2013-01-05,1.690359,0.855749,0.244871,2.005068,,1.93523
2013-01-06,1.690359,1.366113,1.404125,2.568481,,3.094484


In [111]:
# Natural logarithm of every element of column C.
# Negative entries were replaced by 0 in an earlier cell, so wherever C is 0
# the result is -inf and NumPy emits a RuntimeWarning.
logaritmo_c = np.log(df["C"])
df["H"] = logaritmo_c

df

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,A,B,C,D,F,G,H
2013-01-01,0.405006,,0.244871,0.0,1.0,0.649877,-1.407023
2013-01-02,0.0,0.0,0.0,0.170888,1.8,0.0,-inf
2013-01-03,1.075171,0.0,0.0,1.199907,,1.075171,-inf
2013-01-05,0.210182,0.855749,0.0,0.634273,,0.210182,-inf
2013-01-06,0.0,0.510364,1.159254,0.563412,,1.159254,0.147777


In [112]:
# Shift a column down by a given number of rows (2 here);
# the vacated leading rows are filled with NaN.
df["I"] = df["C"].shift(periods=2)

df

Unnamed: 0,A,B,C,D,F,G,H,I
2013-01-01,0.405006,,0.244871,0.0,1.0,0.649877,-1.407023,
2013-01-02,0.0,0.0,0.0,0.170888,1.8,0.0,-inf,
2013-01-03,1.075171,0.0,0.0,1.199907,,1.075171,-inf,0.244871
2013-01-05,0.210182,0.855749,0.0,0.634273,,0.210182,-inf,0.0
2013-01-06,0.0,0.510364,1.159254,0.563412,,1.159254,0.147777,0.0


In [113]:
# Reading and writing files is straightforward with the CSV format,
# which Excel can open.

# Write: the index is saved as the first column of the file.
ruta_salida = "Archivo1.csv"
df.to_csv(ruta_salida)

In [114]:
# Read the file back, using its first column (position 0) as the index.
ruta_entrada = "Archivo1.csv"
df_2 = pd.read_csv(ruta_entrada, index_col=0)
df_2

Unnamed: 0,A,B,C,D,F,G,H,I
2013-01-01,0.405006,,0.244871,0.0,1.0,0.649877,-1.407023,
2013-01-02,0.0,0.0,0.0,0.170888,1.8,0.0,-inf,
2013-01-03,1.075171,0.0,0.0,1.199907,,1.075171,-inf,0.244871
2013-01-05,0.210182,0.855749,0.0,0.634273,,0.210182,-inf,0.0
2013-01-06,0.0,0.510364,1.159254,0.563412,,1.159254,0.147777,0.0


In [115]:
# The date index does not come back with the right type:
# df has a DatetimeIndex, while the reloaded df_2 holds plain strings.
for indice in (df.index, df_2.index):
    print(indice)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-05',
               '2013-01-06'],
              dtype='datetime64[ns]', freq=None)
Index(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-05', '2013-01-06'], dtype='object')


In [116]:
# Convert the string labels back into a proper DatetimeIndex.
indice_fechas = pd.to_datetime(df_2.index)
df_2.index = indice_fechas
df_2.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-05',
               '2013-01-06'],
              dtype='datetime64[ns]', freq=None)

In [117]:
# Correlation.

# Build a random DataFrame: 100000 rows of uniform draws in two columns.
# This rebinds df, replacing the frame used in the previous cells.
# No seed is set in this cell, so the numbers differ on every run.
muestras = np.random.rand(100000, 2)
df = pd.DataFrame(muestras, columns=["Col_1", "Col_2"])

print(df.head(), end="\n\n")

# Correlation matrix.
print(df.corr(), end="\n\n")

# Covariance matrix.
print(df.cov(), end="\n\n")

# Autocorrelation of Col_1 at lag 1.
print(df["Col_1"].autocorr(lag=1))

      Col_1     Col_2
0  0.983965  0.050127
1  0.906384  0.335550
2  0.214511  0.155169
3  0.813020  0.863430
4  0.578360  0.781036

        Col_1   Col_2
Col_1  1.0000  0.0015
Col_2  0.0015  1.0000

          Col_1     Col_2
Col_1  0.083342  0.000125
Col_2  0.000125  0.083669

0.00048464378817299474


In [118]:
# Homework, class 3.
# 1. Load the file Electrico_residencial_2010_2017.
# 2. Select the entries that correspond to Baja California.
# 3. Sort the data from lowest to highest consumption in 2010.
# 4. Select the municipalities whose number of
# users in 2012 is below 100.
# 5. Select only the DAC tariff for all municipalities.
# Find the municipality with the highest consumption under the DAC tariff.
# 6. Find the correlation between the 2012 and 2013 consumption.