In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/drinks.csv')
movies = pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/imdb_1000.csv')
orders = pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/chipotle.tsv', sep = '\t')
orders['item_price'] = orders['item_price'] = orders.item_price.str.replace('$', '').astype('float')
stocks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/stocks.csv', parse_dates=['Date'])
titanic = pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/titanic_test.csv')
ufo = pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/ufo.csv', parse_dates=['Time'])

  orders['item_price'] = orders['item_price'] = orders.item_price.str.replace('$', '').astype('float')


## 1. Ver versiones instaladas

In [3]:
# Para ver la version que tenemos instalada de pandas, debemos escribir lo siguiente:
pd.__version__

'1.4.2'

In [4]:
# Pero si queremos ver las versiones de todas las herramientas de pandas, debemos escribir 
pd.show_versions()




INSTALLED VERSIONS
------------------
commit           : 4bfe3d07b4858144c219b9346329027024102ab6
python           : 3.9.12.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19045
machine          : AMD64
processor        : Intel64 Family 6 Model 37 Stepping 5, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : es_ES.cp1252

pandas           : 1.4.2
numpy            : 1.21.5
pytz             : 2021.3
dateutil         : 2.8.2
pip              : 21.2.4
setuptools       : 61.2.0
Cython           : 0.29.28
pytest           : 7.1.1
hypothesis       : None
sphinx           : 4.4.0
blosc            : None
feather          : None
xlsxwriter       : 3.0.3
lxml.etree       : 4.8.0
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 2.11.3
IPython          : 8.2.0
pandas_datareader: None
bs4              : 4.11.1
bottleneck       : 1.3.4
brotli       

## 2. Crear un Data Frame de prueba

In [5]:
# Podemos crear un data frame con un diccionario. Las columnas seran los keys y los datos los values
df = pd.DataFrame({'columna_1': [100,200,300], 'columna_2':[300,200,100]})
df

Unnamed: 0,columna_1,columna_2
0,100,300
1,200,200
2,300,100


In [6]:
# Tambien podemos crear un data frame aleatorio uilizando funciones de numpy: np.random.rand(filas, columnas)
pd.DataFrame(np.random.rand(4, 8))

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.950596,0.673328,0.422052,0.775552,0.67407,0.222658,0.429704,0.13624
1,0.946308,0.352042,0.605602,0.722373,0.062533,0.115869,0.893558,0.290281
2,0.329783,0.604028,0.154437,0.361915,0.496707,0.40166,0.274795,0.408778
3,0.903258,0.362211,0.488569,0.423574,0.709315,0.138233,0.485325,0.933044


In [7]:
# Podemos hacer algo similar, pero agregando los nombres de las columnas
pd.DataFrame(np.random.rand(4,8), columns = list('abcdefgh'))

Unnamed: 0,a,b,c,d,e,f,g,h
0,0.641313,0.617222,0.849371,0.917547,0.014057,0.545698,0.240739,0.031269
1,0.582323,0.394379,0.149707,0.60474,0.951272,0.828347,0.380867,0.723077
2,0.870895,0.943104,0.293363,0.425199,0.007399,0.666759,0.866216,0.702394
3,0.850034,0.491332,0.908474,0.42252,0.137314,0.93675,0.207066,0.527029


## 3. Renombrar columnas

In [8]:
df

Unnamed: 0,columna_1,columna_2
0,100,300
1,200,200
2,300,100


In [9]:
# Para renombrar columnas, el metodo mas rapido si son pocas columnas es uilizar rename()
df = df.rename({'columna_1':'col1', 'columna_2':'col2'}, axis='columns')

In [10]:
df

Unnamed: 0,col1,col2
0,100,300
1,200,200
2,300,100


In [11]:
# Otra opcion bastante util es utilizar 'df.columns' y colocar los nombres nuevos en una lista
df.columns = ['columna_1', 'columna_2']

In [12]:
df

Unnamed: 0,columna_1,columna_2
0,100,300
1,200,200
2,300,100


In [13]:
# La utilidad de df.columns es que permite aplicar metodos de 'str' de forma rapida y hacer cambios en los 
# nombres de las columnas. Por ejmploe, reemplazar un guion bajo, por un guion comun
df.columns = df.columns.str.replace('_', '-')

In [14]:
df

Unnamed: 0,columna-1,columna-2
0,100,300
1,200,200
2,300,100


In [15]:
# Una ultima opcion para modificar los nombres de las columnas, es agregar prefijos o sufijos
df.add_prefix('X_')

Unnamed: 0,X_columna-1,X_columna-2
0,100,300
1,200,200
2,300,100


In [16]:
df.add_suffix('_Y')

Unnamed: 0,columna-1_Y,columna-2_Y
0,100,300
1,200,200
2,300,100


## 4. Colocar las filas en orden inverso (lo que esta al final de primero)

In [17]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [18]:
# En algunos casos, puede ser necesario colocar las filas en orden inverso (lo que esta al final de primero)
# Para ello debemos utilizar .loc y pasarle este argumento: ::-1
drinks.loc[::-1]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
192,Zimbabwe,64,18,4,4.7,Africa
191,Zambia,32,19,4,2.5,Africa
190,Yemen,6,0,0,0.1,Asia
189,Vietnam,111,2,1,2.0,Asia
188,Venezuela,333,100,3,7.7,South America
...,...,...,...,...,...,...
4,Angola,217,57,45,5.9,Africa
3,Andorra,245,138,312,12.4,Europe
2,Algeria,25,0,14,0.7,Africa
1,Albania,89,132,54,4.9,Europe


In [19]:
# Si queremos cambiar el indice, debemos resetear el que tenemos primero
drinks.loc[::-1].reset_index(drop = True).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Zimbabwe,64,18,4,4.7,Africa
1,Zambia,32,19,4,2.5,Africa
2,Yemen,6,0,0,0.1,Asia
3,Vietnam,111,2,1,2.0,Asia
4,Venezuela,333,100,3,7.7,South America


## 5. Colocar las columnas en orden inverso

In [20]:
# Podemos aplicar un truco similar al anterior, para ver los columnas en orden inverso, solo tendriamos que
# agregar a la funcion .loc :,::-1
drinks.loc[:, ::-1]

Unnamed: 0,continent,total_litres_of_pure_alcohol,wine_servings,spirit_servings,beer_servings,country
0,Asia,0.0,0,0,0,Afghanistan
1,Europe,4.9,54,132,89,Albania
2,Africa,0.7,14,0,25,Algeria
3,Europe,12.4,312,138,245,Andorra
4,Africa,5.9,45,57,217,Angola
...,...,...,...,...,...,...
188,South America,7.7,3,100,333,Venezuela
189,Asia,2.0,1,2,111,Vietnam
190,Asia,0.1,0,0,6,Yemen
191,Africa,2.5,4,19,32,Zambia


## 6. Seleccionar columnas por tipo de dato

In [21]:
# Veamos los tipos de datos de la tabla drinks
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [22]:
# Podemos utilizar la funcion 'select_dtypes()', para seleccionar tipos de datos especificos
drinks.select_dtypes('object')

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa
...,...,...
188,Venezuela,South America
189,Vietnam,Asia
190,Yemen,Asia
191,Zambia,Africa


In [23]:
drinks.select_dtypes(include = 'number')

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9
...,...,...,...,...
188,333,100,3,7.7
189,111,2,1,2.0
190,6,0,0,0.1
191,32,19,4,2.5


In [24]:
# Si queremos incluir mas de un tipo de dato, debemos utilizar una lista
drinks.select_dtypes(include = ['float', 'object'])

Unnamed: 0,country,total_litres_of_pure_alcohol,continent
0,Afghanistan,0.0,Asia
1,Albania,4.9,Europe
2,Algeria,0.7,Africa
3,Andorra,12.4,Europe
4,Angola,5.9,Africa
...,...,...,...
188,Venezuela,7.7,South America
189,Vietnam,2.0,Asia
190,Yemen,0.1,Asia
191,Zambia,2.5,Africa


In [25]:
# Tambien podemos hacer lo contrario, y excluir un tipo de dato especifico
drinks.select_dtypes(exclude = 'object')

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9
...,...,...,...,...
188,333,100,3,7.7
189,111,2,1,2.0
190,6,0,0,0.1
191,32,19,4,2.5


## 7. Convertir strings a numeros

In [26]:
# Vamos a crear un data frame de prueba
df = pd.DataFrame({'col_one':['1.1', '2.2', '3.3'],
                   'col_two':['4.4', '5.5', '6.6'],
                   'col_three':['7.7', '8.8', '-']})
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,-


In [27]:
# En este data frame, los numeros estan almacenados como string
df.dtypes

col_one      object
col_two      object
col_three    object
dtype: object

In [28]:
# Para convertir las columnas a numero, podemos utilizar la funcion 'astype()'
df.astype({'col_one': 'float', 'col_two':'float'}).dtypes

col_one      float64
col_two      float64
col_three     object
dtype: object

In [29]:
# Sin embargo, astype() tiene un problema, y es que no pueden existir valores nulos
# En este sentido, es mejor utilizar pd.to_numeric() que convierte los valores nulos en NaN
pd.to_numeric(df['col_three'], errors = 'coerce')

0    7.7
1    8.8
2    NaN
Name: col_three, dtype: float64

In [30]:
# Si sabemos de antemano que queremos colocar en los NaN, podemos utilizar fillna() para rellenarlo
pd.to_numeric(df['col_three'], errors = 'coerce').fillna(0)

0    7.7
1    8.8
2    0.0
Name: col_three, dtype: float64

In [31]:
# Por ultimo, podemos aplicar esta funcion a todo el data frame utilizando "apply()"
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,0.0


In [32]:
df.dtypes

col_one      float64
col_two      float64
col_three    float64
dtype: object

## 8. Reducir el tamano del data frame

In [33]:
# Utilizando info() y el argumento memory_usage='deep', podemos saber cuanta memoria estamos consumiendo
drinks.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [34]:
# Una forma sencilla de solucionar este problema, es solo cargar las columnas necesarias y no todo el data frame
cols = ['beer_servings', 'continent']
small_drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/drinks.csv', usecols=cols)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   beer_servings  193 non-null    int64 
 1   continent      193 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.7 KB


In [35]:
# Otra forma comun de reducir espacio, es convetir variables de object en categorias (solo si son pocas categorias)
dtypes = {'continent':'category'}
smaller_drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/drinks.csv', 
                             usecols=cols, dtype=dtypes)
smaller_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   beer_servings  193 non-null    int64   
 1   continent      193 non-null    category
dtypes: category(1), int64(1)
memory usage: 2.4 KB


## 9. Crear un DataFrame a partir de varios archivos (en filas)

In [36]:
# Primero vamos a cargar los data frame
pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/stocks1.csv')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT


In [37]:
pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/stocks2.csv')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-04,113.0,29736800,AAPL
1,2016-10-04,57.24,20085900,MSFT
2,2016-10-04,31.35,18460400,CSCO


In [38]:
pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/stocks3.csv')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-05,57.64,16726400,MSFT
1,2016-10-05,31.59,11808600,CSCO
2,2016-10-05,113.05,21453100,AAPL


In [39]:
# Para unir los tres data frame de forma rapida utilizaremos: from glob import glob
from glob import glob

In [40]:
stock_files = sorted(glob('stocks*.csv'))
stock_files

[]

In [41]:
#pd.concat((pd.read_csv(file) for file in stock_files))

In [42]:
#pd.concat((pd.read_csv(file) for file in stock_files), ignore_index=True)

## 10. Dividir un DataFrame en dos subconjuntos aleatorios

In [43]:
# Supongamos que queremos dividir un data frame en dos partes, una con el 25% y otra con el 75% de los datos
# Para ello podemos utilizar la funcion 'sample()' y el argumento frac=. Agreamos random_state=1234, para decirle
# que cree secuencias de numeros aleatorios similares
movies.shape

(979, 6)

In [44]:
# Ahora vamos a sacar el 75%
movies1 = movies.sample(frac=0.75, random_state=1234)
movies1

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
387,8.0,Midnight Cowboy,X,Drama,113,"[u'Dustin Hoffman', u'Jon Voight', u'Sylvia Mi..."
653,7.7,Fearless,PG-13,Action,104,"[u'Jet Li', u'Li Sun', u'Yong Dong']"
40,8.5,The Green Mile,R,Crime,189,"[u'Tom Hanks', u'Michael Clarke Duncan', u'Dav..."
913,7.5,Suspiria,X,Horror,92,"[u'Jessica Harper', u'Stefania Casini', u'Flav..."
766,7.6,The Little Mermaid,G,Animation,83,"[u'Jodi Benson', u'Samuel E. Wright', u'Rene A..."
...,...,...,...,...,...,...
368,8.0,Planet of the Apes,G,Adventure,112,"[u'Charlton Heston', u'Roddy McDowall', u'Kim ..."
505,7.8,About Time,R,Drama,123,"[u'Domhnall Gleeson', u'Rachel McAdams', u'Bil..."
595,7.7,The Purple Rose of Cairo,PG,Comedy,82,"[u'Mia Farrow', u'Jeff Daniels', u'Danny Aiello']"
940,7.4,Much Ado About Nothing,PG-13,Comedy,111,"[u'Kenneth Branagh', u'Emma Thompson', u'Keanu..."


In [45]:
# Pero si queremos solo el 25% restante, lo que hacemos es sacarle al data frame original ese 75%
movies2 = movies.drop(movies1.index)
movies2

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
10,8.8,The Lord of the Rings: The Fellowship of the Ring,PG-13,Adventure,178,"[u'Elijah Wood', u'Ian McKellen', u'Orlando Bl..."
12,8.8,Star Wars: Episode V - The Empire Strikes Back,PG,Action,124,"[u'Mark Hamill', u'Harrison Ford', u'Carrie Fi..."
...,...,...,...,...,...,...
954,7.4,X-Men,PG-13,Action,104,"[u'Patrick Stewart', u'Hugh Jackman', u'Ian Mc..."
960,7.4,The Way Way Back,PG-13,Comedy,103,"[u'Steve Carell', u'Toni Collette', u'Allison ..."
968,7.4,The English Patient,R,Drama,162,"[u'Ralph Fiennes', u'Juliette Binoche', u'Will..."
970,7.4,Wonder Boys,R,Drama,107,"[u'Michael Douglas', u'Tobey Maguire', u'Franc..."


## 11. Filtrar un data frame por multiples categorias

In [46]:
# Supongamos que tenemos una tabla, que tiene valores categoricos, de los cuales nos interesa seleccionar solo alguno
movies['genre'].unique()

array(['Crime', 'Action', 'Drama', 'Western', 'Adventure', 'Biography',
       'Comedy', 'Animation', 'Mystery', 'Horror', 'Film-Noir', 'Sci-Fi',
       'History', 'Thriller', 'Family', 'Fantasy'], dtype=object)

In [47]:
# Para poder hacer esto tenemos dos opciones
# La primera opcion es la mas larga y complicada, y es, realizar multiples consultas con el operador OR - |
movies[(movies['genre']=='Action') | (movies['genre']=='Horror') | (movies['genre']=='Drama')]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."
12,8.8,Star Wars: Episode V - The Empire Strikes Back,PG,Action,124,"[u'Mark Hamill', u'Harrison Ford', u'Carrie Fi..."
...,...,...,...,...,...,...
970,7.4,Wonder Boys,R,Drama,107,"[u'Michael Douglas', u'Tobey Maguire', u'Franc..."
972,7.4,Blue Valentine,NC-17,Drama,112,"[u'Ryan Gosling', u'Michelle Williams', u'John..."
973,7.4,The Cider House Rules,PG-13,Drama,126,"[u'Tobey Maguire', u'Charlize Theron', u'Micha..."
976,7.4,Master and Commander: The Far Side of the World,PG-13,Action,138,"[u'Russell Crowe', u'Paul Bettany', u'Billy Bo..."


In [48]:
# La segunda dos opcion, es mucho mas simple y directa. Se trata de utilizar una lista con las categorias
# y la funcion isin()
movies[movies['genre'].isin(['Action', 'Drama', 'Horror'])]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."
12,8.8,Star Wars: Episode V - The Empire Strikes Back,PG,Action,124,"[u'Mark Hamill', u'Harrison Ford', u'Carrie Fi..."
...,...,...,...,...,...,...
970,7.4,Wonder Boys,R,Drama,107,"[u'Michael Douglas', u'Tobey Maguire', u'Franc..."
972,7.4,Blue Valentine,NC-17,Drama,112,"[u'Ryan Gosling', u'Michelle Williams', u'John..."
973,7.4,The Cider House Rules,PG-13,Drama,126,"[u'Tobey Maguire', u'Charlize Theron', u'Micha..."
976,7.4,Master and Commander: The Far Side of the World,PG-13,Action,138,"[u'Russell Crowe', u'Paul Bettany', u'Billy Bo..."


In [49]:
# Si queremos, podemos negar este filtro utilizando una virgulilla "~" al principio
movies[~movies['genre'].isin(['Action', 'Drama', 'Horror'])]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
...,...,...,...,...,...,...
969,7.4,Law Abiding Citizen,R,Crime,109,"[u'Gerard Butler', u'Jamie Foxx', u'Leslie Bibb']"
971,7.4,Death at a Funeral,R,Comedy,90,"[u'Matthew Macfadyen', u'Peter Dinklage', u'Ew..."
974,7.4,Tootsie,PG,Comedy,116,"[u'Dustin Hoffman', u'Jessica Lange', u'Teri G..."
975,7.4,Back to the Future Part III,PG,Adventure,118,"[u'Michael J. Fox', u'Christopher Lloyd', u'Ma..."


## 12. Filtrar un data frame por la categorias de mayor valor - nlargest()

In [51]:
# Veamos los valores unicos de la columna 'genre' en la tabla movies
movies['genre'].value_counts()

Drama        278
Comedy       156
Action       136
Crime        124
Biography     77
Adventure     75
Animation     62
Horror        29
Mystery       16
Western        9
Sci-Fi         5
Thriller       5
Film-Noir      3
Family         2
History        1
Fantasy        1
Name: genre, dtype: int64

In [55]:
# Supongamos que queremos filtrar una serie con los cinco valores mas altos
# podriamos utilizar head, pero tambien nlargest()
movies['genre'].value_counts().nlargest(5)

Drama        278
Comedy       156
Action       136
Crime        124
Biography     77
Name: genre, dtype: int64

## 13. Manejar valores nulos

In [57]:
# Veamos la tabla ufo
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,1930-06-01 22:00:00
1,Willingboro,,OTHER,NJ,1930-06-30 20:00:00
2,Holyoke,,OVAL,CO,1931-02-15 14:00:00
3,Abilene,,DISK,KS,1931-06-01 13:00:00
4,New York Worlds Fair,,LIGHT,NY,1933-04-18 19:00:00


In [58]:
# Vamos a ver cuantos valores nulos hay en cada columna
ufo.isna().sum()

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [59]:
# Si queremos ver el porcentaje de valores de cada columna, en relacion al total de datos, 
# tenemos que utilizar el metodo mean()
ufo.isna().mean()

City               0.001371
Colors Reported    0.842004
Shape Reported     0.144948
State              0.000000
Time               0.000000
dtype: float64

In [60]:
# Si desea descartar columnas en las que falta más del 10% de los valores, puede establecer un umbral para dropna()
ufo.dropna(thresh=len(ufo)*0.9, axis='columns').head()

Unnamed: 0,City,State,Time
0,Ithaca,NY,1930-06-01 22:00:00
1,Willingboro,NJ,1930-06-30 20:00:00
2,Holyoke,CO,1931-02-15 14:00:00
3,Abilene,KS,1931-06-01 13:00:00
4,New York Worlds Fair,NY,1933-04-18 19:00:00


## 14. Dividir un string en multiples columnas

In [61]:
# Crearemos un data frame de prueba
df = pd.DataFrame({'name':['John Arthur Doe', 'Jane Ann Smith'],
                   'location':['Los Angeles, CA', 'Washington, DC']})
df

Unnamed: 0,name,location
0,John Arthur Doe,"Los Angeles, CA"
1,Jane Ann Smith,"Washington, DC"


In [62]:
# Para dividir un string en multiples columnas, debemos utilizar 'str.split()' y el argumento expand=True
df['name'].str.split(' ', expand = True)

Unnamed: 0,0,1,2
0,John,Arthur,Doe
1,Jane,Ann,Smith


In [63]:
# Si queremos ser mas especificos, podemos crear las columnas que contendran esos valores que separaremos
df[['first', 'middle', 'surname']] = df['name'].str.split(' ', expand = True)

In [64]:
df

Unnamed: 0,name,location,first,middle,surname
0,John Arthur Doe,"Los Angeles, CA",John,Arthur,Doe
1,Jane Ann Smith,"Washington, DC",Jane,Ann,Smith


In [67]:
# Una ultima opcion es aplicar str.split pero solo quedarnos con unos de los valores cuando hacemos la separacion
# para ello definimos el indice numerico del elemento que queremos conservar
df['city'] = df['location'].str.split(',', expand = True)[0]

In [68]:
df

Unnamed: 0,name,location,first,middle,surname,city
0,John Arthur Doe,"Los Angeles, CA",John,Arthur,Doe,Los Angeles
1,Jane Ann Smith,"Washington, DC",Jane,Ann,Smith,Washington


## 15. Pandas profiling

In [1]:
# import pandas_profiling