In [1]:
import numpy as np
import pandas as pd

# Series

A Series can hold a variety of data types, even built-in functions.

In [6]:
# Three values and the three index labels used to build a Series from them
data_1 = [10, 20, 30]
labels = ['a', 'b', 'c']

In [10]:
# data and index are the first two parameters, in that order, so the keywords can be left out
pd.Series(data_1, labels)

a    10
b    20
c    30
dtype: int64

In [17]:
# Plain dictionary that is turned into a Series further down
d = dict(animal='panda', plant='maranta', mushroom='porcino')

In [23]:
# When a dictionary is passed to Series, its keys automatically become the index
ser1 = pd.Series(data=d)

ser1

animal        panda
plant       maranta
mushroom    porcino
dtype: object

In [26]:
ser1['animal'] # to grab an item from a Series, pass its index label; the data point that corresponds to that label is returned

'panda'

# DataFrame

A DataFrame is basically a bunch of series that share the same index

In [29]:
from numpy.random import randn

In [31]:
# Seed NumPy's global random generator so the randn() draws can be repeated
np.random.seed(101)

In [77]:
# Positional order of the constructor arguments is data, index, columns;
# the keywords below spell that out.
df_1 = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [78]:
df_1 # display the DataFrame that was just built

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


## Selecting columns

In [42]:
df_1['X'] # to grab a column: the result is a Series

A    0.196800
B   -0.156598
C   -0.610259
D   -0.479448
E    1.862864
Name: X, dtype: float64

In [47]:
df_1[['Y', 'Z']] # to grab multiple columns, pass a list of names; the result is another DataFrame

Unnamed: 0,Y,Z
A,-1.136645,0.000366
B,-0.031579,0.649826
C,-0.755325,-0.346419
D,0.558769,1.02481
E,-1.133817,0.610478


In [49]:
# New column computed element-wise from two existing columns
df_1['Y + Z'] = df_1['Y'].add(df_1['Z'])

In [51]:
df_1 # the 'Y + Z' column is now part of df_1

Unnamed: 0,W,X,Y,Z,Y + Z
A,-0.993263,0.1968,-1.136645,0.000366,-1.136278
B,1.025984,-0.156598,-0.031579,0.649826,0.618247
C,2.154846,-0.610259,-0.755325,-0.346419,-1.101744
D,0.147027,-0.479448,0.558769,1.02481,1.58358
E,-0.925874,1.862864,-1.133817,0.610478,-0.523339


In [80]:
# columns='X' is the same as drop('X', axis=1): axis defaults to 0, which refers to rows,
# while axis=1 refers to columns
df_1.drop(columns='X')

Unnamed: 0,W,Y,Z
A,0.38603,-0.376519,0.230336
B,0.681209,-0.03116,1.939932
C,-1.005187,0.187125,-0.732845
D,-1.38292,0.961458,-2.141212
E,0.992573,-1.04678,1.292765


### inplace = True

In pandas, for many methods to take effect on the actual DataFrame, you have to specify:
<br> **inplace = True** (inplace is set to False by default for many methods, to avoid losing information by accident).

In [56]:
df_1 # show df_1 before trying drop() on a row

Unnamed: 0,W,Y,Z,Y + Z
A,-0.993263,-1.136645,0.000366,-1.136278
B,1.025984,-0.031579,0.649826,0.618247
C,2.154846,-0.755325,-0.346419,-1.101744
D,0.147027,0.558769,1.02481,1.58358
E,-0.925874,-1.133817,0.610478,-0.523339


In [59]:
# drop() targets rows by default; index='E' makes that explicit
df_1.drop(index='E')

Unnamed: 0,W,Y,Z,Y + Z
A,-0.993263,-1.136645,0.000366,-1.136278
B,1.025984,-0.031579,0.649826,0.618247
C,2.154846,-0.755325,-0.346419,-1.101744
D,0.147027,0.558769,1.02481,1.58358


In [65]:
df_1.shape # returns a tuple with the number of rows and columns

(5, 4)

## Selecting rows

In [72]:
df_1.loc['A'] # label-based way to grab rows - a row is returned as a Series as well

W       -0.993263
Y       -1.136645
Z        0.000366
Y + Z   -1.136278
Name: A, dtype: float64

In [70]:
df_1.iloc[0] # integer-position-based way of grabbing rows

W       -0.993263
Y       -1.136645
Z        0.000366
Y + Z   -1.136278
Name: A, dtype: float64

### Selecting a subset of columns and rows

In [74]:
df_1.loc['A', 'Y'] # row label first, then column label: a single value

-1.1366445936091856

In [76]:
df_1.loc[['A', 'C'], ['Y', 'Z']] # lists of row and column labels: a smaller DataFrame

Unnamed: 0,Y,Z
A,-1.136645,0.000366
C,-0.755325,-0.346419


## Conditional selection

In [79]:
df_1 > 0 # element-wise comparison: a DataFrame of booleans

Unnamed: 0,W,X,Y,Z
A,True,True,False,True
B,True,True,False,True
C,False,False,True,False
D,False,True,True,False
E,True,True,False,True


In [82]:
df_1[df_1['W']>0] # returns a DataFrame with only the rows where W is > 0

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
E,0.992573,1.192241,-1.04678,1.292765


In [84]:
# Keep the filtered rows (W > 0) under their own name
df_1_w = df_1.loc[df_1['W'].gt(0)]

df_1_w

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
E,0.992573,1.192241,-1.04678,1.292765


In [91]:
# Values of column X on the rows where column W is > 0.
# A single .loc[row_mask, column] call selects rows and column in one step,
# instead of chaining two separate [] lookups.
df_1.loc[df_1['W'] > 0, 'X']

A    2.084019
B    1.035125
E    1.192241
Name: X, dtype: float64

In [93]:
# Same row filter (W > 0), this time asking for two columns by passing a list.
# .loc[row_mask, columns] does the selection in one step instead of chaining [] lookups.
df_1.loc[df_1['W'] > 0, ['Y', 'Z']]

Unnamed: 0,Y,Z
A,-0.376519,0.230336
B,-0.03116,1.939932
E,-1.04678,1.292765


This code can be broken into multiple lines.

<br> If you want to combine multiple conditions, remember not to use the basic Python *and*: it can only handle two single booleans at a time, not two Series of booleans (which is what you get when comparing two columns).
<br> INSTEAD: use *&*, wrapping each condition in parentheses.
<br> E.g.:

In [95]:
df_1[(df_1['W']>0) & (df_1['X']>2)] # returns the rows where W > 0 AND X > 2 at the same time

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336


In [98]:
# Rows where W > 0 AND X > 2, restricted to columns Y and Z.
# .loc[row_mask, columns] does the selection in one step instead of chaining [] lookups.
df_1.loc[(df_1['W'] > 0) & (df_1['X'] > 2), ['Y', 'Z']]

Unnamed: 0,Y,Z
A,-0.376519,0.230336


The same goes for the basic *or*, which here (for comparing Series) is replaced by *|*.
<br> E.g.:

In [101]:
df_1[(df_1['Y']>0) | (df_1['X']>2)] # rows where Y > 0 OR X > 2 (either condition is enough)

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212


# Index

In [106]:
df_1.reset_index() # returns a DataFrame with the old index as an 'index' column and a new integer index

Unnamed: 0,index,W,X,Y,Z
0,A,0.38603,2.084019,-0.376519,0.230336
1,B,0.681209,1.035125,-0.03116,1.939932
2,C,-1.005187,-0.74179,0.187125,-0.732845
3,D,-1.38292,1.482495,0.961458,-2.141212
4,E,0.992573,1.192241,-1.04678,1.292765


In [107]:
# One colour name per row of df_1
new_index = 'Blue Red Yellow Pink Black'.split()

In [111]:
df_1['Color index'] = new_index # add the list as a new column, one value per row

df_1

Unnamed: 0,W,X,Y,Z,Color index
A,0.38603,2.084019,-0.376519,0.230336,Blue
B,0.681209,1.035125,-0.03116,1.939932,Red
C,-1.005187,-0.74179,0.187125,-0.732845,Yellow
D,-1.38292,1.482495,0.961458,-2.141212,Pink
E,0.992573,1.192241,-1.04678,1.292765,Black


In [114]:
df_1.set_index('Color index') # to set an existing column as the index (the result is a new DataFrame)

Unnamed: 0_level_0,W,X,Y,Z
Color index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Blue,0.38603,2.084019,-0.376519,0.230336
Red,0.681209,1.035125,-0.03116,1.939932
Yellow,-1.005187,-0.74179,0.187125,-0.732845
Pink,-1.38292,1.482495,0.961458,-2.141212
Black,0.992573,1.192241,-1.04678,1.292765


In [116]:
df_1 # df_1 still has its A-E index: the set_index call did not modify it

Unnamed: 0,W,X,Y,Z,Color index
A,0.38603,2.084019,-0.376519,0.230336,Blue
B,0.681209,1.035125,-0.03116,1.939932,Red
C,-1.005187,-0.74179,0.187125,-0.732845,Yellow
D,-1.38292,1.482495,0.961458,-2.141212,Pink
E,0.992573,1.192241,-1.04678,1.292765,Black


## Multi-Index and Index Hierarchy


In [121]:
# Index Levels
# Index levels: outer group label and inner number, aligned position by position
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
# This is just to create the two-level index for the multi-index DataFrame
hier_index = pd.MultiIndex.from_arrays([outside, inside])

In [122]:
# 6x2 frame of random values whose rows are labelled by the two-level index
df_2 = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])

df_2

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.641806,-0.9051
G1,2,-0.391157,1.028293
G1,3,-1.972605,-0.866885
G2,1,0.720788,-1.223082
G2,2,1.60678,-1.11571
G2,3,-1.385379,-1.32966


In [129]:
df_2.loc['G1'].loc[2] # chained .loc calls: the first picks outer level 'G1', the second picks inner label 2

A   -0.391157
B    1.028293
Name: 2, dtype: float64

In [134]:
df_2.index.names = ['Groups', 'Num'] # name the outer and inner index levels

df_2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.641806,-0.9051
G1,2,-0.391157,1.028293
G1,3,-1.972605,-0.866885
G2,1,0.720788,-1.223082
G2,2,1.60678,-1.11571
G2,3,-1.385379,-1.32966


In [136]:
# Single value at outer level 'G2', inner level 2, column 'B'.
# A tuple key addresses both index levels in one .loc call,
# replacing three chained lookups.
df_2.loc[('G2', 2), 'B']

-1.1157099674628352

### Cross-section

In [139]:
df_2.xs('G1') # cross-section: the rows under outer key 'G1'

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.641806,-0.9051
2,-0.391157,1.028293
3,-1.972605,-0.866885


In [141]:
df_2.xs(1, level='Num') # to grab the rows with Num == 1 across all groups

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.641806,-0.9051
G2,0.720788,-1.223082


# Missing values

In [5]:
# Small frame with missing values: one NaN in A, two in B, none in C
df_mv = pd.DataFrame(
    dict(
        A=[1, 2, np.nan],
        B=[5, np.nan, np.nan],
        C=[1, 2, 3],
    )
)

df_mv

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [11]:
# Drop every row that contains at least one NA (rows are the default axis)
df_mv.dropna(axis='index')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [12]:
# Drop every column that contains at least one NA
df_mv.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


## Replacing missing values .fillna()

In [15]:
# Replace every missing value with the same placeholder
df_mv.fillna('FILL')

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,FILL,2
2,FILL,FILL,3


In [17]:
# Fill the NAs of column A with the mean of that column's values
df_mv['A'].fillna(df_mv['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64