# Introduction to Data Analysis with Python

# Objective:

* Handle tabular data with `pandas`

## `pandas`

### Getting started with pandas

In [1]:
import pandas as pd
import numpy as np

### `pandas` data structures

### Series

The base pandas abstraction. You can thing of it as the love child of a numpy array and a dictionary.

In [2]:
s = pd.Series([4, 7, -5, 3])
s

0    4
1    7
2   -5
3    3
dtype: int64

If we provide an index, pandas will use it. If not, it will automatically create one.

In [3]:
print(s.index)
print(s.values)

RangeIndex(start=0, stop=4, step=1)
[ 4  7 -5  3]


In [4]:
s2 = pd.Series([1, 2, 4.5, 7, 2, 23, 15], index=['i', 'f', 'n', 'e', 'u', 'r', 'h'])
s2

i     1.0
f     2.0
n     4.5
e     7.0
u     2.0
r    23.0
h    15.0
dtype: float64

In [5]:
#nueva linea

In [6]:
s2['r']

23.0

In [7]:
s2 > 3

i    False
f    False
n     True
e     True
u    False
r     True
h     True
dtype: bool

In [8]:
s2[s2>3]#genero a aprtir de una concidicion

n     4.5
e     7.0
r    23.0
h    15.0
dtype: float64

In [94]:
evens = s2 % 2 == 0# puedo tambn guardarlo en una variable

In [10]:
s2[evens]

f    2.0
u    2.0
dtype: float64

In [11]:
s2

i     1.0
f     2.0
n     4.5
e     7.0
u     2.0
r    23.0
h    15.0
dtype: float64

In [12]:
s2 * 2

i     2.0
f     4.0
n     9.0
e    14.0
u     4.0
r    46.0
h    30.0
dtype: float64

In [13]:
np.exp(s2)

i    2.718282e+00
f    7.389056e+00
n    9.001713e+01
e    1.096633e+03
u    7.389056e+00
r    9.744803e+09
h    3.269017e+06
dtype: float64

In [14]:
'f' in s2

True

In [15]:
clase = pd.Series([35, 22, 45, 72], index=['Toni', 'Fulanito', 'Menganito', 'Victor'])

In [16]:
clase

Toni         35
Fulanito     22
Menganito    45
Victor       72
dtype: int64

In [17]:
clase[clase==22].index

Index(['Fulanito'], dtype='object')

We can create Series from dictionaries:

In [18]:
sdata = {'B' : 3e6, 'M': 6e6, 'P': 1.2e5, 'V': 7e5}#creamos un diccionario

s3 = pd.Series(sdata)#transformamos el diccoinario en una serie
s3

B    3000000.0
M    6000000.0
P     120000.0
V     700000.0
dtype: float64

In [19]:
increase = {'M': 4e5, 'B' : 2e5, 'Z': -2e4}

s4 = pd.Series(increase)

And here is where the magic happens: numpy arrays only identify their contents by position. In contrast, pandas knows their "name" and will align them based on their indexes:

In [20]:
s3.values

array([3000000., 6000000.,  120000.,  700000.])

In [21]:
s4.values

array([400000., 200000., -20000.])

In [22]:
s3.values + s4.values#falla porque no tengo el mismo numero de elementos en cada array

ValueError: operands could not be broadcast together with shapes (4,) (3,) 

In [23]:
s3 + s4# esto si me deja, porque estoy sumando series, sumara cada uno con el correspondiente y si no hay concincidencia -> NaN

B    3200000.0
M    6400000.0
P          NaN
V          NaN
Z          NaN
dtype: float64

In [24]:
#Podemos cmabiar el nombre de las series
#o tambn el nombre del indice
s3.name = 'population_2000'
s3.index.name = 'province'

In [25]:
s3

province
B    3000000.0
M    6000000.0
P     120000.0
V     700000.0
Name: population_2000, dtype: float64

In [26]:
s3.index

Index(['B', 'M', 'P', 'V'], dtype='object', name='province')

### DataFrame

This is the object you'll work most of the time with. It represents a table of _m_ observations x _n_ variables. Each variable, or column, is a Series.

In [27]:
#Apartir de las series podemos construir dataframes(casi simepre los leeremos de un csv, pero ahora lo vamos a ver si lo creamos nosotros de cero)
dfdata = {
    'province' : ['M', 'M', 'M', 'B', 'B'],
    'population': [1.5e6, 2e6, 3e6, 5e5, 1.5e6],
    'year' : [1900, 1950, 2000, 1900, 2000]   
}

df = pd.DataFrame(dfdata)
df

Unnamed: 0,province,population,year
0,M,1500000.0,1900
1,M,2000000.0,1950
2,M,3000000.0,2000
3,B,500000.0,1900
4,B,1500000.0,2000


In [28]:
#puedo renombrar columnas, y si existre la deja, y si no la crea con NaN
df2 = pd.DataFrame(dfdata, columns=['province','population', 'year', 'debt'])
df2

Unnamed: 0,province,population,year,debt
0,M,1500000.0,1900,
1,M,2000000.0,1950,
2,M,3000000.0,2000,
3,B,500000.0,1900,
4,B,1500000.0,2000,


In [29]:
#otra forma, es crearla dando por hecho que existe, en este caso creamos una nueva columna a partir de otra en este caso population

df2['kk'] = dfdata['population']

In [30]:
df2

Unnamed: 0,province,population,year,debt,kk
0,M,1500000.0,1900,,1500000.0
1,M,2000000.0,1950,,2000000.0
2,M,3000000.0,2000,,3000000.0
3,B,500000.0,1900,,500000.0
4,B,1500000.0,2000,,1500000.0


In [31]:
df2.index

RangeIndex(start=0, stop=5, step=1)

In [32]:
df2.columns

Index(['province', 'population', 'year', 'debt', 'kk'], dtype='object')

In [33]:
df2[['population','province']]
# asi elegimos mas de una columna a la vez, HAY QUE PONER 2 CORCHETES, 
#lo que estoy haciendo es seleccionar una lista dentro del dataframe, ya que ['population','province'] es una lista

Unnamed: 0,population,province
0,1500000.0,M
1,2000000.0,M
2,3000000.0,M
3,500000.0,B
4,1500000.0,B


In [34]:
df2.population
#otra forma de acceder a una columna igual que df2['population']t. 
#Si el nombre de la columna empieza con numero no se puede acceder de esta manera, habria que usar la otra forma

0    1500000.0
1    2000000.0
2    3000000.0
3     500000.0
4    1500000.0
Name: population, dtype: float64

In [35]:
df2['population']

0    1500000.0
1    2000000.0
2    3000000.0
3     500000.0
4    1500000.0
Name: population, dtype: float64

In [36]:
df2['2nd_language']=list('EEFFG')

In [37]:
df2

Unnamed: 0,province,population,year,debt,kk,2nd_language
0,M,1500000.0,1900,,1500000.0,E
1,M,2000000.0,1950,,2000000.0,E
2,M,3000000.0,2000,,3000000.0,F
3,B,500000.0,1900,,500000.0,F
4,B,1500000.0,2000,,1500000.0,G


In [38]:
df2['2nd_language']

0    E
1    E
2    F
3    F
4    G
Name: 2nd_language, dtype: object

In [39]:
df2.2nd_language# como habiamos comentado antes -> FALLA

SyntaxError: invalid syntax (4183479724.py, line 1)

In [40]:
# df2['abs']

In [41]:
df2.index = list('abcde')

In [42]:
df2

Unnamed: 0,province,population,year,debt,kk,2nd_language
a,M,1500000.0,1900,,1500000.0,E
b,M,2000000.0,1950,,2000000.0,E
c,M,3000000.0,2000,,3000000.0,F
d,B,500000.0,1900,,500000.0,F
e,B,1500000.0,2000,,1500000.0,G


In [43]:
df2.loc['a']
# para poder extraer filas usamos loc, en este caso 'a' es la posicion del indice que quiero extraer

province                M
population      1500000.0
year                 1900
debt                  NaN
kk              1500000.0
2nd_language            E
Name: a, dtype: object

In [44]:
#puedo realizar asignaciones
df2['debt'] = 10
df2

Unnamed: 0,province,population,year,debt,kk,2nd_language
a,M,1500000.0,1900,10,1500000.0,E
b,M,2000000.0,1950,10,2000000.0,E
c,M,3000000.0,2000,10,3000000.0,F
d,B,500000.0,1900,10,500000.0,F
e,B,1500000.0,2000,10,1500000.0,G


In [45]:
df2['capital'] = df2['province'] == 'M'
df2

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
a,M,1500000.0,1900,10,1500000.0,E,True
b,M,2000000.0,1950,10,2000000.0,E,True
c,M,3000000.0,2000,10,3000000.0,F,True
d,B,500000.0,1900,10,500000.0,F,False
e,B,1500000.0,2000,10,1500000.0,G,False


In [46]:
df2['debt'] = [1,0,2,.5,.7]# podemos asignar con una array o una lista
df2

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
a,M,1500000.0,1900,1.0,1500000.0,E,True
b,M,2000000.0,1950,0.0,2000000.0,E,True
c,M,3000000.0,2000,2.0,3000000.0,F,True
d,B,500000.0,1900,0.5,500000.0,F,False
e,B,1500000.0,2000,0.7,1500000.0,G,False


# df2.T

In [47]:
#podemos crear la traspuesta del dataframe, en el que se cambiaran filas por columnas
df2T= df2.T

In [48]:
df2T.index

Index(['province', 'population', 'year', 'debt', 'kk', '2nd_language',
       'capital'],
      dtype='object')

In [49]:
df2

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
a,M,1500000.0,1900,1.0,1500000.0,E,True
b,M,2000000.0,1950,0.0,2000000.0,E,True
c,M,3000000.0,2000,2.0,3000000.0,F,True
d,B,500000.0,1900,0.5,500000.0,F,False
e,B,1500000.0,2000,0.7,1500000.0,G,False


In [50]:
df2.describe()
#describe va a coger todas las columnas que sean numericas, y te da una informacion acerca de ellas como count, mean,...


Unnamed: 0,population,year,debt,kk
count,5.0,5.0,5.0,5.0
mean,1700000.0,1950.0,0.84,1700000.0
std,908295.1,50.0,0.74364,908295.1
min,500000.0,1900.0,0.0,500000.0
25%,1500000.0,1900.0,0.5,1500000.0
50%,1500000.0,1950.0,0.7,1500000.0
75%,2000000.0,2000.0,1.0,2000000.0
max,3000000.0,2000.0,2.0,3000000.0


In [51]:
df2.describe().T
# como el describe me devuelve un dataframe, puedo hacer la traspuesta por si prefierlo leerlo de otra manera

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
population,5.0,1700000.0,908295.106229,500000.0,1500000.0,1500000.0,2000000.0,3000000.0
year,5.0,1950.0,50.0,1900.0,1900.0,1950.0,2000.0,2000.0
debt,5.0,0.84,0.74364,0.0,0.5,0.7,1.0,2.0
kk,5.0,1700000.0,908295.106229,500000.0,1500000.0,1500000.0,2000000.0,3000000.0


### Index objects

Indexes are immutable.

In [52]:
df2.index[1] = 'x'# como son elementos inmutables no puedo modificarlo de esta forma

TypeError: Index does not support mutable operations

In [53]:
df2.index[1]

'b'

In [54]:
df2.iloc[2:]# asi puedo seleccionar filas

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
c,M,3000000.0,2000,2.0,3000000.0,F,True
d,B,500000.0,1900,0.5,500000.0,F,False
e,B,1500000.0,2000,0.7,1500000.0,G,False


### Dropping entries from an axis

In [55]:
s5 = pd.Series(np.arange(5), list('jduvk'))
s5

j    0
d    1
u    2
v    3
k    4
dtype: int64

In [56]:
s6 = s5.drop(['d','k'])
s6

j    0
u    2
v    3
dtype: int64

In [57]:
s5

j    0
d    1
u    2
v    3
k    4
dtype: int64

In [58]:
#existen algunos metodos que modifican tambn el original, es lo que se llama inplace, y puedo modificarlo a True, 
#pero por defecto es False

In [59]:
s5.drop(['d','k'],inplace=False)

j    0
u    2
v    3
dtype: int64

In [60]:
s5

j    0
d    1
u    2
v    3
k    4
dtype: int64

By default, `drop()` doesn't modify the original Series- it creates a copy. We can change that with the argument `inplace`.

In [61]:
s5

j    0
d    1
u    2
v    3
k    4
dtype: int64

In [62]:
s6['u'] = 7
s6

j    0
u    7
v    3
dtype: int64

In [63]:
df2

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
a,M,1500000.0,1900,1.0,1500000.0,E,True
b,M,2000000.0,1950,0.0,2000000.0,E,True
c,M,3000000.0,2000,2.0,3000000.0,F,True
d,B,500000.0,1900,0.5,500000.0,F,False
e,B,1500000.0,2000,0.7,1500000.0,G,False


In [64]:
df2.drop('c')
# en data frames tenemos que tener en cuenta si queremos borrar una fila o una columna, debo tener cuidado al especificarlo

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
a,M,1500000.0,1900,1.0,1500000.0,E,True
b,M,2000000.0,1950,0.0,2000000.0,E,True
d,B,500000.0,1900,0.5,500000.0,F,False
e,B,1500000.0,2000,0.7,1500000.0,G,False


In [65]:
df2

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
a,M,1500000.0,1900,1.0,1500000.0,E,True
b,M,2000000.0,1950,0.0,2000000.0,E,True
c,M,3000000.0,2000,2.0,3000000.0,F,True
d,B,500000.0,1900,0.5,500000.0,F,False
e,B,1500000.0,2000,0.7,1500000.0,G,False


In [66]:
df2.drop('c', axis=0)# si lo indico asi me refiero a filas

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
a,M,1500000.0,1900,1.0,1500000.0,E,True
b,M,2000000.0,1950,0.0,2000000.0,E,True
d,B,500000.0,1900,0.5,500000.0,F,False
e,B,1500000.0,2000,0.7,1500000.0,G,False


In [67]:
df3 = df2.drop('2nd_language', axis=1)# si lo indico asi me refiero a columnas

In [68]:
# por defecto es axis = 0, osea afectas a la fila

In [69]:
df3

Unnamed: 0,province,population,year,debt,kk,capital
a,M,1500000.0,1900,1.0,1500000.0,True
b,M,2000000.0,1950,0.0,2000000.0,True
c,M,3000000.0,2000,2.0,3000000.0,True
d,B,500000.0,1900,0.5,500000.0,False
e,B,1500000.0,2000,0.7,1500000.0,False


# Remember the df.copy() *issue*

In [70]:
df4 = df3

In [71]:
df4.drop(['a','b'],inplace=True)

In [72]:
df4

Unnamed: 0,province,population,year,debt,kk,capital
c,M,3000000.0,2000,2.0,3000000.0,True
d,B,500000.0,1900,0.5,500000.0,False
e,B,1500000.0,2000,0.7,1500000.0,False


In [73]:
df3

Unnamed: 0,province,population,year,debt,kk,capital
c,M,3000000.0,2000,2.0,3000000.0,True
d,B,500000.0,1900,0.5,500000.0,False
e,B,1500000.0,2000,0.7,1500000.0,False


In [74]:
df3 = df2.copy()
df3

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
a,M,1500000.0,1900,1.0,1500000.0,E,True
b,M,2000000.0,1950,0.0,2000000.0,E,True
c,M,3000000.0,2000,2.0,3000000.0,F,True
d,B,500000.0,1900,0.5,500000.0,F,False
e,B,1500000.0,2000,0.7,1500000.0,G,False


In [75]:
df3.drop('capital', axis=1, inplace=True)
df3

Unnamed: 0,province,population,year,debt,kk,2nd_language
a,M,1500000.0,1900,1.0,1500000.0,E
b,M,2000000.0,1950,0.0,2000000.0,E
c,M,3000000.0,2000,2.0,3000000.0,F
d,B,500000.0,1900,0.5,500000.0,F
e,B,1500000.0,2000,0.7,1500000.0,G


In [76]:
df2

Unnamed: 0,province,population,year,debt,kk,2nd_language,capital
a,M,1500000.0,1900,1.0,1500000.0,E,True
b,M,2000000.0,1950,0.0,2000000.0,E,True
c,M,3000000.0,2000,2.0,3000000.0,F,True
d,B,500000.0,1900,0.5,500000.0,F,False
e,B,1500000.0,2000,0.7,1500000.0,G,False


#========================================================================

### Indexing, selection, and filtering

The key here is that we can build boolean Series that we can use to index the original Series or DataFrame. Those booleans can be combined with bitwise boolean operators (&, |, ~) to get filters that are as complex as we need. 

In [77]:
s3

province
B    3000000.0
M    6000000.0
P     120000.0
V     700000.0
Name: population_2000, dtype: float64

In [78]:
s3[['V', 'M']]

province
V     700000.0
M    6000000.0
Name: population_2000, dtype: float64

In [79]:
s3[2:]# tambn puedo usar iloc -> COMPROBAR

province
P    120000.0
V    700000.0
Name: population_2000, dtype: float64

In [80]:
s3['P':'V']

province
P    120000.0
V    700000.0
Name: population_2000, dtype: float64

In [81]:
s3 > 1e06

province
B     True
M     True
P    False
V    False
Name: population_2000, dtype: bool

In [82]:
s3[s3>1e06]# esto seria un filtro

province
B    3000000.0
M    6000000.0
Name: population_2000, dtype: float64

In [83]:
df3

Unnamed: 0,province,population,year,debt,kk,2nd_language
a,M,1500000.0,1900,1.0,1500000.0,E
b,M,2000000.0,1950,0.0,2000000.0,E
c,M,3000000.0,2000,2.0,3000000.0,F
d,B,500000.0,1900,0.5,500000.0,F
e,B,1500000.0,2000,0.7,1500000.0,G


In [84]:
df3[df3['year'] > 1950]

Unnamed: 0,province,population,year,debt,kk,2nd_language
c,M,3000000.0,2000,2.0,3000000.0,F
e,B,1500000.0,2000,0.7,1500000.0,G


In [85]:
df3[(df3['year'] > 1900) | (df3['debt'] > 1)]

Unnamed: 0,province,population,year,debt,kk,2nd_language
b,M,2000000.0,1950,0.0,2000000.0,E
c,M,3000000.0,2000,2.0,3000000.0,F
e,B,1500000.0,2000,0.7,1500000.0,G


In [86]:
recent = df3['year'] > 1900
indebted = df3['debt'] > 1

df3[recent & indebted]

Unnamed: 0,province,population,year,debt,kk,2nd_language
c,M,3000000.0,2000,2.0,3000000.0,F


In [87]:
df3[df3['year'] > 1900][df3['debt'] > 1]
# asi podemos concatenar filtros o selecciones, primero se hace un filtro y despues se filtra sobre ello

  df3[df3['year'] > 1900][df3['debt'] > 1]


Unnamed: 0,province,population,year,debt,kk,2nd_language
c,M,3000000.0,2000,2.0,3000000.0,F


### Function application and mapping

Function application and mapping allows us to modify the elements of a DataFrame (columns with apply or elements with applymap) without for loops. This way we are not constrained to the functions already implemented by pandas or numpy.

In [88]:
df3.drop('kk',inplace=True,axis=1)

In [89]:
df3

Unnamed: 0,province,population,year,debt,2nd_language
a,M,1500000.0,1900,1.0,E
b,M,2000000.0,1950,0.0,E
c,M,3000000.0,2000,2.0,F
d,B,500000.0,1900,0.5,F
e,B,1500000.0,2000,0.7,G


In [90]:
np.sqrt(df3['population'])

a    1224.744871
b    1414.213562
c    1732.050808
d     707.106781
e    1224.744871
Name: population, dtype: float64

In [4]:
import pandas as pd
import numpy as np
df4 = pd.DataFrame(np.random.randn(4,3) * 17 + 15, columns=list('bde'), index=list('BMPZ'))
df4

Unnamed: 0,b,d,e
B,5.430611,10.364543,-13.407707
M,38.433092,28.009542,15.98934
P,4.307692,23.695951,9.777104
Z,5.113174,30.430614,7.172912


In [5]:
np.abs(df4)

Unnamed: 0,b,d,e
B,5.430611,10.364543,13.407707
M,38.433092,28.009542,15.98934
P,4.307692,23.695951,9.777104
Z,5.113174,30.430614,7.172912


This is a typical use case for lambdas (anonymous functions)

In [6]:
def my_range(series):
    return series.max()-series.min()

In [7]:
def min_max(series):
    return [series.min(),series.max()]

The function for "apply" is expecting a series as input and an object as output.

In [8]:
df4.apply(my_range)# apply aplica la funcion a cada una de las columnas

b    34.125401
d    20.066070
e    29.397048
dtype: float64

In [9]:
df4.apply(lambda col: col.max() - col.min())

b    34.125401
d    20.066070
e    29.397048
dtype: float64

In [12]:
df4.applymap(lambda element: element % 10 )# igual q apply pero esto lo aplica sobre todos los elementos

Unnamed: 0,b,d,e
B,5.430611,0.364543,6.592293
M,8.433092,8.009542,5.98934
P,4.307692,3.695951,9.777104
Z,5.113174,0.430614,7.172912


In [11]:
df4.apply(lambda series: series.max() - series.min(), axis=1)# tambn puedo usar apply sobre columnas si pongo el axis

B    23.772251
M    22.443752
P    19.388259
Z    25.317440
dtype: float64

In [18]:
def f(series):
    return pd.Series([series.max(), series.min()], index=['max', 'min'])

df4.apply(f)

Unnamed: 0,b,d,e
max,38.433092,30.430614,15.98934
min,4.307692,10.364543,-13.407707


In [None]:
df4.apply(min_max)

In [None]:
df4.apply(lambda series: pd.Series([series.max(), series.min()], index=['max', 'min']))
         #lambda  input: return -> input = series (valor de entrada de la funcion), 
         #                         return = pd.Series([series.max(), series.min()], index=['max', 'min']) (return de la funcion)

In [None]:
# como iterar en los elementos de un dataframe -> items() y iteritems()

In [15]:
for item in df4.items():
    print(item)

('b', B     5.430611
M    38.433092
P     4.307692
Z     5.113174
Name: b, dtype: float64)
('d', B    10.364543
M    28.009542
P    23.695951
Z    30.430614
Name: d, dtype: float64)
('e', B   -13.407707
M    15.989340
P     9.777104
Z     7.172912
Name: e, dtype: float64)


In [16]:
for item in df4.iteritems():
    print(item)

('b', B     5.430611
M    38.433092
P     4.307692
Z     5.113174
Name: b, dtype: float64)
('d', B    10.364543
M    28.009542
P    23.695951
Z    30.430614
Name: d, dtype: float64)
('e', B   -13.407707
M    15.989340
P     9.777104
Z     7.172912
Name: e, dtype: float64)


In [19]:
map(f, [1,2])#aplicar la funcion sobre lo que queramos, pero para dataframes se usa applymap

<map at 0x7fe2e08a0490>

In [20]:
def format_2digits(number):
    return '%.2f' % number

In [21]:
df4.applymap(format_2digits)

Unnamed: 0,b,d,e
B,5.43,10.36,-13.41
M,38.43,28.01,15.99
P,4.31,23.7,9.78
Z,5.11,30.43,7.17


In [22]:
df4

Unnamed: 0,b,d,e
B,5.430611,10.364543,-13.407707
M,38.433092,28.009542,15.98934
P,4.307692,23.695951,9.777104
Z,5.113174,30.430614,7.172912


### Sorting and ranking

In [23]:
df4.sort_index(ascending=False)

Unnamed: 0,b,d,e
Z,5.113174,30.430614,7.172912
P,4.307692,23.695951,9.777104
M,38.433092,28.009542,15.98934
B,5.430611,10.364543,-13.407707


In [24]:
df4.sort_index(ascending=False, axis=1)

Unnamed: 0,e,d,b
B,-13.407707,10.364543,5.430611
M,15.98934,28.009542,38.433092
P,9.777104,23.695951,4.307692
Z,7.172912,30.430614,5.113174


In [25]:
df4.sort_values(by='e')#ordenamos para una columna

Unnamed: 0,b,d,e
B,5.430611,10.364543,-13.407707
Z,5.113174,30.430614,7.172912
P,4.307692,23.695951,9.777104
M,38.433092,28.009542,15.98934


In [26]:
df4.sort_values(by=['e','b'])#ordenamos por dos columnas, primero una y luego otra

Unnamed: 0,b,d,e
B,5.430611,10.364543,-13.407707
Z,5.113174,30.430614,7.172912
P,4.307692,23.695951,9.777104
M,38.433092,28.009542,15.98934


In [27]:
s1 = pd.Series([2,3,8,4,3,2,1], index=list('abcdefg'))
s1

a    2
b    3
c    8
d    4
e    3
f    2
g    1
dtype: int64

In [28]:
s1.sort_values()

g    1
a    2
f    2
b    3
e    3
d    4
c    8
dtype: int64

rank() returns the positions of the elements of the Series in its sorted version. If there are ties, it will take averages.

In [29]:
s1.rank()

a    2.5
b    4.5
c    7.0
d    6.0
e    4.5
f    2.5
g    1.0
dtype: float64

method{‘average’, ‘min’, ‘max’, ‘first’, ‘dense’}, default ‘average’
How to rank the group of records that have the same value (i.e. ties):

average: average rank of the group

min: lowest rank in the group

max: highest rank in the group

first: ranks assigned in order they appear in the array

dense: like ‘min’, but rank always increases by 1 between groups.

In [None]:
help(pd.Series([1,1,1]).rank())

In [30]:
pd.Series([1,1,1]).rank(method='first')

0    1.0
1    2.0
2    3.0
dtype: float64

In [31]:
s1.rank(method='first')

a    2.0
b    4.0
c    7.0
d    6.0
e    5.0
f    3.0
g    1.0
dtype: float64

In [None]:
pd.Series([1,1,1]).rank()

In [None]:
s2 = pd.Series([30,10,20], index=list('abc'))
s2

s2.rank()

In [None]:
s1.rank(method='dense')

#### Exercise

Write a function that takes a Series and returns the top 10% registers. In this case, earners. Test it with this Series:

```python
salaries = pd.Series([150000, 90000, 120000,30000,10000,5000,40000, 50000, 80000, 35000, 27000,14000, 28000, 22000,25000])
```

In [None]:
salaries = pd.Series([150000, 90000, 120000,30000,10000,5000,40000, 50000, 80000, 35000, 27000,14000, 28000, 22000,25000])

In [None]:
salaries.describe()

In [None]:
0.1 * 15

In [None]:
salaries_order_by_values = salaries.sort_values(ascending=False)
salaries_order_by_values

In [None]:
salaries_order_by_values[:2]

In [None]:
pct = 0.1
num_elem = len(salaries)
n = np.round(num_elem * pct)
n

In [None]:
pct = 0.1
num_elem = len(salaries)
n = int(np.round(num_elem * pct))
    
def take_series(series):    
    salaries_order_by_values = series.sort_values(ascending=False)
    salaries_pct = salaries_order_by_values[:n]
    return salaries_pct

In [None]:
take_series(salaries)

In [None]:
salaries.rank(pct=True)>0.9

# otra forma mas rapida

## Summarizing and computing descriptive statistics

In [32]:
x = pd.Series([1.2, np.nan, 4, np.nan, 9], index=list('abcde'))
y = pd.Series([5, 3, 7, np.nan, 14], index=list('abcde'))

df = pd.DataFrame([x, y], index=['x','y']).T
df

Unnamed: 0,x,y
a,1.2,5.0
b,,3.0
c,4.0,7.0
d,,
e,9.0,14.0


In [33]:
df.sum()

x    14.2
y    29.0
dtype: float64

As with many methods, we can use them in the direction perpendicular to their default.

In [34]:
df.sum(axis=1)

a     6.2
b     3.0
c    11.0
d     0.0
e    23.0
dtype: float64

In [35]:
df.sum(axis=1, skipna=False)

a     6.2
b     NaN
c    11.0
d     NaN
e    23.0
dtype: float64

In [36]:
df.mean()

x    4.733333
y    7.250000
dtype: float64

In [37]:
df.mean(axis=1)

a     3.1
b     3.0
c     5.5
d     NaN
e    11.5
dtype: float64

In [38]:
df.cumsum()

Unnamed: 0,x,y
a,1.2,5.0
b,,8.0
c,5.2,15.0
d,,
e,14.2,29.0


In [39]:
df.std()

x    3.951371
y    4.787136
dtype: float64

In [40]:
df.describe()

Unnamed: 0,x,y
count,3.0,4.0
mean,4.733333,7.25
std,3.951371,4.787136
min,1.2,3.0
25%,2.6,4.5
50%,4.0,6.0
75%,6.5,8.75
max,9.0,14.0


In [41]:
df['x'].sum()

14.2

In [42]:
df['x'].describe()

count    3.000000
mean     4.733333
std      3.951371
min      1.200000
25%      2.600000
50%      4.000000
75%      6.500000
max      9.000000
Name: x, dtype: float64

### Unique values, value counts, and membership

In [43]:
s7 = pd.Series(list('gtcaaagcttcga'))
s7

0     g
1     t
2     c
3     a
4     a
5     a
6     g
7     c
8     t
9     t
10    c
11    g
12    a
dtype: object

In [44]:
s7.unique()

array(['g', 't', 'c', 'a'], dtype=object)

In [45]:
set(s7)# otra forma de sacar los valores unicos es con set -> SOLO VALE PARA ESTO

{'a', 'c', 'g', 't'}

In [46]:
s7.value_counts()

a    4
g    3
t    3
c    3
dtype: int64

In [None]:
#isin es muy util porque te devuelve una serie de booleanos

In [47]:
puric_bases = ['a','g']
s7.isin(puric_bases)

0      True
1     False
2     False
3      True
4      True
5      True
6      True
7     False
8     False
9     False
10    False
11     True
12     True
dtype: bool

In [None]:
s7[s7.isin(puric_bases)]

## Handling missing data

In [48]:
string_data = pd.Series(['Ma', 'Lu', 'Ca', 'Va', np.nan])
string_data

0     Ma
1     Lu
2     Ca
3     Va
4    NaN
dtype: object

In [49]:
string_data[string_data!=np.nan]

0     Ma
1     Lu
2     Ca
3     Va
4    NaN
dtype: object

In [50]:
# como hacemos antes no se puede porq un nan por deficion es distinto a si mismo
# si quiero hacerlo tendria que hacerlo de esta forma:
string_data[string_data==string_data]


0    Ma
1    Lu
2    Ca
3    Va
dtype: object

In [51]:
# o si quiero el nan:
string_data[string_data!=string_data]

4    NaN
dtype: object

This is weird... but it has some really good reasons. You can find explanations [here](https://stackoverflow.com/questions/10034149/why-is-nan-not-equal-to-nan) and [here](https://stackoverflow.com/questions/1565164/what-is-the-rationale-for-all-comparisons-returning-false-for-ieee754-nan-values)

In [52]:
a = np.nan
a==a

False

In [53]:
np.nan == np.nan

False

In [None]:
# esto es lo que deciamos de la definicion de NaN, un elemento distinto a si mismo

In [None]:
string_data[~string_data.isnull()]

### Filtering out missing data

In [54]:
string_data[string_data.notnull()]

0    Ma
1    Lu
2    Ca
3    Va
dtype: object

In [55]:
df5 = pd.DataFrame([[1,2,3], 
                    [np.nan, 8, 7], 
                    [4, np.nan, 90], 
                    [67,42,53]], 
                   columns=list('abc'))
df5

Unnamed: 0,a,b,c
0,1.0,2.0,3
1,,8.0,7
2,4.0,,90
3,67.0,42.0,53


In [56]:
df5[df5['a'].notnull()]
# la condicion es: df5['a'].notnull(), que devueve True o False, y con eso nos devuelve los que han dado True

Unnamed: 0,a,b,c
0,1.0,2.0,3
2,4.0,,90
3,67.0,42.0,53


In [57]:
df5.notnull()

Unnamed: 0,a,b,c
0,True,True,True
1,False,True,True
2,True,False,True
3,True,True,True


any() and all() are functions of boolean Series. They reduce the Series to a single boolean value by applying repeatedly the operators "or" and "and", respectively.

In [58]:
df5.notnull().any()

a    True
b    True
c    True
dtype: bool

In [59]:
df5.notnull().all()

a    False
b    False
c     True
dtype: bool

In [60]:
df5.isnull().any()

a     True
b     True
c    False
dtype: bool

In [61]:
df5.dropna()
#eleminar los NaN de un dataframe, pero hay que tener cuidado porq si lo hacemos a lo bestia con cargamos la fila entera

Unnamed: 0,a,b,c
0,1.0,2.0,3
3,67.0,42.0,53


In [62]:
df5

Unnamed: 0,a,b,c
0,1.0,2.0,3
1,,8.0,7
2,4.0,,90
3,67.0,42.0,53


In [63]:
df5.dropna(axis=1)# y como siempre podemos hacerlo tambn por columnas en lugar de por filas

Unnamed: 0,c
0,3
1,7
2,90
3,53


In [64]:
array = np.random.randn(8,3) * 20 + 100

df6 = pd.DataFrame(array, columns=list('xyz'), index=list('abcdefgh'))
df6.iloc[2:5, 1] = np.nan
df6.iloc[1:3, 2] = np.nan
df6

Unnamed: 0,x,y,z
a,120.962697,96.800794,73.073566
b,112.164289,96.177305,
c,104.17987,,
d,112.109612,,129.087908
e,108.497457,,93.448621
f,90.680109,116.276976,104.482277
g,154.79229,75.587317,100.466571
h,61.572992,76.576794,98.011215


The thresh argument specifies the minimum number of non-null values required to keep a column (or row, with axis=1)

In [65]:
df6.dropna(thresh=2)

Unnamed: 0,x,y,z
a,120.962697,96.800794,73.073566
b,112.164289,96.177305,
d,112.109612,,129.087908
e,108.497457,,93.448621
f,90.680109,116.276976,104.482277
g,154.79229,75.587317,100.466571
h,61.572992,76.576794,98.011215


In [66]:
df6.dropna(thresh=2, axis=1)

Unnamed: 0,x,y,z
a,120.962697,96.800794,73.073566
b,112.164289,96.177305,
c,104.17987,,
d,112.109612,,129.087908
e,108.497457,,93.448621
f,90.680109,116.276976,104.482277
g,154.79229,75.587317,100.466571
h,61.572992,76.576794,98.011215


In [69]:
df6.dropna(thresh=6, axis=1)

Unnamed: 0,x,z
a,120.962697,73.073566
b,112.164289,
c,104.17987,
d,112.109612,129.087908
e,108.497457,93.448621
f,90.680109,104.482277
g,154.79229,100.466571
h,61.572992,98.011215


In [70]:
df6.dropna(subset=['x'])

Unnamed: 0,x,y,z
a,120.962697,96.800794,73.073566
b,112.164289,96.177305,
c,104.17987,,
d,112.109612,,129.087908
e,108.497457,,93.448621
f,90.680109,116.276976,104.482277
g,154.79229,75.587317,100.466571
h,61.572992,76.576794,98.011215


### Filling in missing data

In [None]:
df6.fillna(0)#añade el valor que queramos donde encuentre NaN

In [None]:
df6.fillna({'x' : 100, 'y' : 50, 'z' : 20})#igual pero rellena en funcion de los que le digamos a cada columna

In [None]:
df6

In [71]:
df6.fillna(method='ffill')# lo rellena con el valor que se ha encontrado justo antes

Unnamed: 0,x,y,z
a,120.962697,96.800794,73.073566
b,112.164289,96.177305,73.073566
c,104.17987,96.177305,73.073566
d,112.109612,96.177305,129.087908
e,108.497457,96.177305,93.448621
f,90.680109,116.276976,104.482277
g,154.79229,75.587317,100.466571
h,61.572992,76.576794,98.011215


In [72]:
df6.fillna(method='ffill')

Unnamed: 0,x,y,z
a,120.962697,96.800794,73.073566
b,112.164289,96.177305,73.073566
c,104.17987,96.177305,73.073566
d,112.109612,96.177305,129.087908
e,108.497457,96.177305,93.448621
f,90.680109,116.276976,104.482277
g,154.79229,75.587317,100.466571
h,61.572992,76.576794,98.011215


In [73]:
df6.fillna(df6.median())

Unnamed: 0,x,y,z
a,120.962697,96.800794,73.073566
b,112.164289,96.177305,99.238893
c,104.17987,96.177305,99.238893
d,112.109612,96.177305,129.087908
e,108.497457,96.177305,93.448621
f,90.680109,116.276976,104.482277
g,154.79229,75.587317,100.466571
h,61.572992,76.576794,98.011215


In [None]:
df6.median()

# Additional References

[Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do)

