# Pandas

In [1]:
import pandas as pd

## Series

In [None]:
help(pd.Series)

In [None]:
s = pd.Series([1, 2, 3, 4, 5, 6])
s

In [None]:
s = pd.Series({'math':10, 'leng':7, 'art': 2})
s

In [None]:
print(s.size)
print(s.index)
print(s.dtype)

In [None]:
print(s['math'])
print(s[1])
print(s[:2])
print(s[['leng','art']])


#### Resumen descriptivo

In [None]:
s = pd.Series([1, 2, 3, 4, 2, 5, 6])

print(s.count())
print(s.value_counts())
print(s.value_counts(normalize=True))
print(s.min())
print(s.max())
print(s.var())
print(s.std())

In [None]:
print(s.describe())

In [None]:
print(s.sum())
print(s.cumsum())

#### Operariones

In [None]:
s * 2
s % 2

In [None]:
st = pd.Series(['a', 'b', 'c'])
st*4

In [None]:
help(s.apply)

In [None]:
import math
import numpy as np
s.apply(np.sqrt)

In [None]:
st.apply(str.upper)

In [None]:
def mifunc(val):
    return str(val).upper()

s.apply(lambda x: x*2+3)

st.apply(mifunc)

#### Filtrar y ordenar

In [None]:
mask  = s > 2
print(mask)

s2 = s[mask]
print(s2)
print(s)

In [None]:
s.sort_values()
s.sort_index(ascending=False)

In [None]:
sx = pd.Series(['a', 'b', None, 'c', np.nan, 'd'])
print(sx)

sy = sx.dropna()
print(sy)


In [None]:
help(sx.drop)

## DataFrames

### Creación

In [None]:
help(pd.DataFrame)

In [54]:

diccionario = {
    'nombre': ['María', 'Luis', 'Carmen', 'Antonio'],
    'edad': [18, 22, 20, 21],
    'grado': ['Economía', 'Medicina', 'Arquitectura', 'Economía'],
    'correo': ['maria@gmail.com', 'luis@yahoo.es', 'carmen@gmail.com', 'antonio@gmail.com']
}

lista_de_listas = [['María', 18],
                   ['Luis', 22],
                   ['Carmen', 20]]

lista_de_dicts = [{'Nombre': 'María', 'Edad': 18},
                  {'Nombre': 'Luis', 'Edad': 22},
                  {'Nombre': 'Carmen'}]

In [None]:
df = pd.DataFrame(diccionario)
df

In [None]:
df = pd.DataFrame(lista_de_dicts)
df

In [None]:
df = pd.DataFrame(lista_de_listas, columns=['Nombre','Edad'])
df

#### Desde fuentes

In [None]:
dfcol = pd.read_csv("../data/colesterol.csv")
print(dfcol)

In [None]:
dfcol = pd.read_csv("https://github.com/ricardoahumada/DataScienceBasics/raw/refs/heads/main/data/colesterol.csv")
print(dfcol)

In [None]:
dffr = pd.read_json('../data/frutas.json')
dffr

In [None]:
# !pip install sqlite3
import sqlite3

con_sqlt = sqlite3.connect("../data/database.db")
dfdb = pd.read_sql_query("SELECT * FROM movies", con_sqlt)
con_sqlt.close()

dfdb

In [None]:
# !pip install openpyxl

In [None]:
dfxl = pd.read_excel("../data/FoodMarket.xlsx", sheet_name='Purchases')
dfxl

### Exportar

In [64]:
dfxl.to_json('../data/Purchases.json')

In [None]:
# !pip install pysqlite3

In [None]:
import sqlite3

con_sqlt = sqlite3.connect("../data/database.db")

dffr.to_sql(name="frutas", con=con_sqlt)

# con_sqlt.close()

### Atributos

In [None]:
dfxl.info()
dfcol.info()

In [None]:
dfcol.size
dfcol.shape
dfxl.shape

In [None]:
print(dfxl.columns)
print(dfxl.index)
print(dfxl.axes)
print(dfxl.values)

In [None]:
print(dfcol.dtypes)

In [None]:
dfxl.head(10)

In [None]:
dfxl.tail(10)

In [None]:
dfxl.describe()

### Renombrar y modificar: índices y columnas

In [112]:
dfcol.rename(columns={'nombre': 'nombre y apellido', 'peso': 'pesos'}, index={0: 1000, 10: 101}, inplace=True)

In [None]:
dfcol

In [None]:
dfcol.set_index("nombre y apellido")

In [None]:
dfcol.reindex(index=[4, 3, 1], columns=['edad', 'altura'])

### Acceso y filtrar

In [None]:
help(dfcol.loc)

In [None]:
help(dfcol.iloc)

In [None]:
dfcol

In [None]:
dfcol[['altura','peso']]

In [None]:
dfcol.iloc[1,:3]

In [None]:
dfcol.iloc[1:5, 2:4]

In [None]:
dfcol.loc[2:, ['colesterol','peso']]

In [None]:
mask = dfcol.loc[:, 'peso'] > 70
print(mask)
dfcol[mask]

In [None]:
mask = dfcol.loc[:, ['pesos', 'altura']] > [70, 1.5]
mask
dfcol[mask]

In [None]:
mask = dfcol.isin([20, 1.7,'Javier García Sánchez'])
print(mask)
dfcol[mask]

In [None]:
mask = ~dfcol.isin({'peso':[75,85],'edad':[35]})
print(mask)
dfcol[mask]

In [None]:
dfcol[mask].dropna()

### Operaciones columnas

In [None]:
import pandas as pd
dfcol = pd.read_csv("../data/colesterol.csv")
dfcol.info()

In [None]:
dfcol['edad'].describe()

In [None]:
print(dfcol['edad'].count())
print(dfcol['edad'].mean())
print(dfcol['edad'].std())

In [None]:
dfcol.describe()

In [None]:
dfcol.describe(include='object')

In [None]:
# help(dfcol.select_dtypes)

dfcol.select_dtypes(include=['int64','float64'])

In [None]:
categorias = dfcol.select_dtypes(include=['object'])
categorias.head()

In [None]:
dfcol['imc'] = dfcol['peso']/(dfcol['altura']**2)
dfcol

In [None]:
dfcol.info()
dfcol['edad']=dfcol['edad'].astype('int32')
dfcol.info()
dfcol

In [None]:
dfcol['date']='2024-10-19'
dfcol

# dfcol.info()

In [None]:
dfcol['date']=pd.to_datetime(dfcol['date'], format='%Y-%m-%d')
dfcol.info()

In [None]:
# print(dfcol['date'].dt)
# print(dfcol['date'].dt.year)
# print(dfcol['date'].dt.month)
# print(dfcol['date'].dt.day)
print(dfcol['date'].dt.to_period('M'))

In [None]:
import datetime

ahora = datetime.datetime.now()
print(ahora)

diff_in_days = ahora - dfcol['date']
print(diff_in_days)
print(diff_in_days.dt.days)

In [None]:
from math import log

dfcol['altura'].apply(log)

In [None]:
def calc_es_sobrepeso(val):
    if val < 18.5:
        return 'bajo'
    elif val >= 18.5 and val <= 24.9:
        return 'normal'
    elif val >= 25 and val <= 29.9:
        return 'sobrepeso'
    elif val >= 30:
        return 'obeso'


dfcol['esatdo_peso'] = dfcol['imc'].apply(calc_es_sobrepeso)
dfcol

In [None]:
edad = dfcol.pop('edad')
print(edad)

print(dfcol)

### Operaciones filas

In [None]:
# help(pd.concat)
dfcol

In [None]:
nfila = dfcol.iloc[-1]

dfd = pd.DataFrame([nfila])

pd.concat((dfcol, dfd), ignore_index=True)

In [None]:
dfcol.drop(index=[1,3])

In [None]:
dfcol.dropna()

In [None]:
dfcol.replace({'sexo':{'H':'Hombre','M':'Mujer'}})

## Ordenar

In [None]:
dfcol

In [None]:
dfcol.sort_values(['sexo', 'nombre',  'altura'], ascending=[True, False, True])

### Agrupamiento

In [16]:
import pandas as pd
dfcol = pd.read_csv("../data/colesterol.csv")
dfcol.info()
print(dfcol)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   nombre      14 non-null     object 
 1   edad        14 non-null     int64  
 2   sexo        14 non-null     object 
 3   peso        13 non-null     float64
 4   altura      14 non-null     float64
 5   colesterol  13 non-null     float64
dtypes: float64(3), int64(1), object(2)
memory usage: 800.0+ bytes
                             nombre  edad sexo   peso  altura  colesterol
0      José Luis Martínez Izquierdo    18    H   85.0    1.79       182.0
1                    Rosa Díaz Díaz    32    M   65.0    1.73       232.0
2             Javier García Sánchez    24    H    NaN    1.81       191.0
3               Carmen López Pinzón    35    M   65.0    1.70       200.0
4              Marisa López Collado    46    M   51.0    1.58       148.0
5                 Antonio Ruiz Cruz    68    H   66.0    1.

In [112]:
# help(dfcol.groupby)

In [17]:
print(dfcol.groupby('sexo').groups)
# print(dfcol.groupby('sexo').get_group('H'))
print(dfcol.groupby('sexo').describe())

{'H': [0, 2, 5, 6, 8, 9, 11, 12], 'M': [1, 3, 4, 7, 10, 13]}
      edad                                                        peso  \
     count       mean        std   min    25%   50%    75%   max count   
sexo                                                                     
H      8.0  40.875000  17.699374  18.0  26.25  40.5  52.75  68.0   7.0   
M      6.0  34.666667  13.017936  20.0  24.50  33.5  43.25  53.0   6.0   

                 ...  altura       colesterol                             \
           mean  ...     75%   max      count     mean        std    min   
sexo             ...                                                       
H     80.714286  ...  1.8875  1.98        8.0  228.375  38.407356  182.0   
M     59.500000  ...  1.7225  1.77        5.0  207.200  42.862571  148.0   

                                    
         25%    50%     75%    max  
sexo                                
H     196.25  225.5  255.75  280.0  
M     194.00  200.0  232.00  262.0  

[

In [18]:
print(dfcol.groupby(['sexo','peso']).groups)
print(dfcol.groupby(['sexo','peso']).first())

{('H', 85.0): [0], ('H', nan): [2], ('H', 62.0): [6], ('H', 66.0): [5], ('H', 75.0): [9], ('H', 78.0): [11], ('H', 90.0): [8], ('H', 109.0): [12], ('M', 51.0): [4], ('M', 55.0): [10], ('M', 60.0): [7], ('M', 61.0): [13], ('M', 65.0): [1, 3]}
                                     nombre  edad  altura  colesterol
sexo peso                                                            
H    62.0           Antonio Fernández Ocaña    51    1.72       276.0
     66.0                 Antonio Ruiz Cruz    68    1.74       249.0
     75.0           Santiago Reillo Manzano    46    1.85       280.0
     78.0        José María de la Guía Sanz    58    1.87       198.0
     85.0      José Luis Martínez Izquierdo    18    1.79       182.0
     90.0              Pedro Gálvez Tenorio    35    1.94       241.0
     109.0  Miguel Angel Cuadrado Gutiérrez    27    1.98       210.0
M    51.0              Marisa López Collado    46    1.58       148.0
     55.0             Macarena Álvarez Luna    53    1.62 

In [24]:
import numpy as np
# help(dfcol.aggregate)

# dfcol.aggregate([np.mean])
dfcol.aggregate([np.sum,'min','max'])

  dfcol.aggregate([np.sum,'min','max'])


Unnamed: 0,nombre,edad,sexo,peso,altura,colesterol
sum,José Luis Martínez IzquierdoRosa Díaz DíazJavi...,535,HMHMMHHMHHMHHM,922.0,24.76,2863.0
min,Antonio Fernández Ocaña,18,H,51.0,1.58,148.0
max,Santiago Reillo Manzano,68,M,109.0,1.98,280.0


In [14]:
print(dfcol.groupby('sexo').first())

dfcol.groupby('sexo').agg(np.sum)

                            nombre  edad  peso  altura  colesterol
sexo                                                              
H     José Luis Martínez Izquierdo    18  85.0    1.79       182.0
M                   Rosa Díaz Díaz    32  65.0    1.73       232.0


  dfcol.groupby('sexo').agg(np.sum)


Unnamed: 0_level_0,nombre,edad,peso,altura,colesterol
sexo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
H,José Luis Martínez IzquierdoJavier García Sánc...,327,565.0,14.7,1827.0
M,Rosa Díaz DíazCarmen López PinzónMarisa López ...,208,357.0,10.06,1036.0


In [22]:
dfcol.groupby('sexo').agg({'edad': np.sum, 'colesterol': np.mean})

  dfcol.groupby('sexo').agg({'edad': np.sum, 'colesterol': np.mean})
  dfcol.groupby('sexo').agg({'edad': np.sum, 'colesterol': np.mean})


Unnamed: 0_level_0,edad,colesterol
sexo,Unnamed: 1_level_1,Unnamed: 2_level_1
H,327,228.375
M,208,207.2


In [23]:
print(dfcol.groupby(['sexo','peso']).first())

dfcol.groupby(['sexo','peso']).agg(np.sum)

                                     nombre  edad  altura  colesterol
sexo peso                                                            
H    62.0           Antonio Fernández Ocaña    51    1.72       276.0
     66.0                 Antonio Ruiz Cruz    68    1.74       249.0
     75.0           Santiago Reillo Manzano    46    1.85       280.0
     78.0        José María de la Guía Sanz    58    1.87       198.0
     85.0      José Luis Martínez Izquierdo    18    1.79       182.0
     90.0              Pedro Gálvez Tenorio    35    1.94       241.0
     109.0  Miguel Angel Cuadrado Gutiérrez    27    1.98       210.0
M    51.0              Marisa López Collado    46    1.58       148.0
     55.0             Macarena Álvarez Luna    53    1.62       262.0
     60.0             Pilar Martín González    22    1.66         NaN
     61.0             Carolina Rubio Moreno    20    1.77       194.0
     65.0                    Rosa Díaz Díaz    32    1.73       232.0


  dfcol.groupby(['sexo','peso']).agg(np.sum)


Unnamed: 0_level_0,Unnamed: 1_level_0,nombre,edad,altura,colesterol
sexo,peso,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
H,62.0,Antonio Fernández Ocaña,51,1.72,276.0
H,66.0,Antonio Ruiz Cruz,68,1.74,249.0
H,75.0,Santiago Reillo Manzano,46,1.85,280.0
H,78.0,José María de la Guía Sanz,58,1.87,198.0
H,85.0,José Luis Martínez Izquierdo,18,1.79,182.0
H,90.0,Pedro Gálvez Tenorio,35,1.94,241.0
H,109.0,Miguel Angel Cuadrado Gutiérrez,27,1.98,210.0
M,51.0,Marisa López Collado,46,1.58,148.0
M,55.0,Macarena Álvarez Luna,53,1.62,262.0
M,60.0,Pilar Martín González,22,1.66,0.0


In [32]:
dfcol['rangos_edad'] = pd.cut(x=dfcol['edad'], bins=[1, 20, 40, 60, 80, 100])
dfcol

Unnamed: 0,nombre,edad,sexo,peso,altura,colesterol,rangos_edad
0,José Luis Martínez Izquierdo,18,H,85.0,1.79,182.0,"(1, 20]"
1,Rosa Díaz Díaz,32,M,65.0,1.73,232.0,"(20, 40]"
2,Javier García Sánchez,24,H,,1.81,191.0,"(20, 40]"
3,Carmen López Pinzón,35,M,65.0,1.7,200.0,"(20, 40]"
4,Marisa López Collado,46,M,51.0,1.58,148.0,"(40, 60]"
5,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0,"(60, 80]"
6,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0,"(40, 60]"
7,Pilar Martín González,22,M,60.0,1.66,,"(20, 40]"
8,Pedro Gálvez Tenorio,35,H,90.0,1.94,241.0,"(20, 40]"
9,Santiago Reillo Manzano,46,H,75.0,1.85,280.0,"(40, 60]"


### Concatenación de Dataframes

In [37]:
# help(pd.concat)

In [42]:
df1 = pd.DataFrame({"Nombre": ["Carmen", "Luis"],
                    "Sexo": ["Mujer", "Hombre"], "Edad": [22, 18]}).set_index("Nombre")
df2 = pd.DataFrame({"Nombre": ["María", "Pedro"],
                    "Sexo": ["Mujer", "Hombre"], "Edad": [25, 30]}).set_index("Nombre")

print(df1)
print(df2)

df = pd.concat([df1, df2], axis=0)
df

          Sexo  Edad
Nombre              
Carmen   Mujer    22
Luis    Hombre    18
          Sexo  Edad
Nombre              
María    Mujer    25
Pedro   Hombre    30


Unnamed: 0_level_0,Sexo,Edad
Nombre,Unnamed: 1_level_1,Unnamed: 2_level_1
Carmen,Mujer,22
Luis,Hombre,18
María,Mujer,25
Pedro,Hombre,30


In [46]:
df1 = pd.DataFrame({"Nombre": ["Carmen", "Luis", "María","José"],
                    "Sexo": ["Mujer", "Hombre", "Mujer","Hombre"]}).set_index("Nombre")

df2 = pd.DataFrame({"Nombre": ["Carmen", "Luis", "María"],
                    "Edad": [22, 18, 25]}).set_index("Nombre")

print(df1)
print(df2)

df = pd.concat([df1, df2], axis=1)
df

          Sexo
Nombre        
Carmen   Mujer
Luis    Hombre
María    Mujer
José    Hombre
        Edad
Nombre      
Carmen    22
Luis      18
María     25


Unnamed: 0_level_0,Sexo,Edad
Nombre,Unnamed: 1_level_1,Unnamed: 2_level_1
Carmen,Mujer,22.0
Luis,Hombre,18.0
María,Mujer,25.0
José,Hombre,


In [48]:
help(df.merge)

Help on method merge in module pandas.core.frame:

merge(right: 'DataFrame | Series', how: 'MergeHow' = 'inner', on: 'IndexLabel | AnyArrayLike | None' = None, left_on: 'IndexLabel | AnyArrayLike | None' = None, right_on: 'IndexLabel | AnyArrayLike | None' = None, left_index: 'bool' = False, right_index: 'bool' = False, sort: 'bool' = False, suffixes: 'Suffixes' = ('_x', '_y'), copy: 'bool | None' = None, indicator: 'str | bool' = False, validate: 'MergeValidate | None' = None) -> 'DataFrame' method of pandas.core.frame.DataFrame instance
    Merge DataFrame or named Series objects with a database-style join.
    
    A named Series object is treated as a DataFrame with a single named column.
    
    The join is done on columns or indexes. If joining columns on
    columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
    on indexes or indexes on a column or columns, the index will be passed on.
    When performing a cross merge, no column specifications to me

In [52]:
df1 = pd.DataFrame({"Nombre": ["Carmen", "Luis", "María", "José"],
                    "Sexo": ["Mujer", "Hombre", "Mujer","Hombre"]}).set_index("Nombre")

df2 = pd.DataFrame({"Nombre": ["Carmen", "Luis", "María"],
                    "Edad": [22, 18, 25]}).set_index("Nombre")

print(df1)
print(df2)

df = pd.merge(df1, df2, how="right", on="Nombre")
df

          Sexo
Nombre        
Carmen   Mujer
Luis    Hombre
María    Mujer
José    Hombre
        Edad
Nombre      
Carmen    22
Luis      18
María     25


Unnamed: 0_level_0,Sexo,Edad
Nombre,Unnamed: 1_level_1,Unnamed: 2_level_1
Carmen,Mujer,22
Luis,Hombre,18
María,Mujer,25
