In [59]:
import numpy as np
import pandas as pd

In [60]:
labels = ['a', 'b', 'c']
my_data = [10, 20, 30]
arr = np.array(my_data)
d = {'a':10, 'b':20, 'c':30}

In [61]:
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [62]:
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [63]:
pd.Series(my_data, labels)

a    10
b    20
c    30
dtype: int64

In [64]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [65]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [66]:
d

{'a': 10, 'b': 20, 'c': 30}

In [67]:
pd.Series(data=labels)

0    a
1    b
2    c
dtype: object

In [68]:
ser1 = pd.Series([1, 2, 3, 4], ['USA', 'Germany', 'Portugal', 'Japan'])
ser2 = pd.Series([1, 2, 5, 4], ['USA', 'Germany', 'Italy', 'Japan'])

In [69]:
ser1

USA         1
Germany     2
Portugal    3
Japan       4
dtype: int64

In [70]:
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [71]:
ser1['USA']

1

In [72]:
ser1 + ser2

Germany     4.0
Italy       NaN
Japan       8.0
Portugal    NaN
USA         2.0
dtype: float64

# DataFrames

In [73]:
from numpy.random import randn

In [74]:
np.random.seed(101)

In [75]:
df = pd.DataFrame(randn(5, 4), ['A', 'B', 'C', 'D', 'E'], ['W', 'X', 'Y', 'Z'])

In [76]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [77]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [78]:
type(df['W'])

pandas.core.series.Series

In [79]:
type(df)

pandas.core.frame.DataFrame

In [80]:
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [81]:
df[['W', 'Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [82]:
df['new'] = df['W'] + df['Y']

In [83]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [84]:
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [85]:
df.drop('new', axis=1, inplace=True)

The `axis` parameter is set to 0 by default and regards to rows. It must be set to 1 if you want to work with columns instead.

If you want to drop rows or columns from the dataframe permanently, the `inplace` parameter must be set to `True`. Otherwise the drop operation will yield a different dataframe so that the original's data is preserved.

In [86]:
df.drop('E')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [87]:
df.shape

(5, 4)

### Working with Rows and subsets

In [88]:
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [89]:
df.iloc[2] #for seelcting row 'C'

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [90]:
df.loc['B', 'Y']

-0.8480769834036315

In [91]:
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


# Conditional selection

In [92]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [93]:
booldf = df > 0

In [94]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [95]:
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [96]:
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [97]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [98]:
df[df['W'] > 0] #this is the kind of conditional selection that is most commonly used

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [99]:
df[df['Z'] < 0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [100]:
resultdf = df[df['W'] > 0]

In [101]:
resultdf['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [102]:
#or in one step:
df[df['W'] > 0]['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [103]:
df[df['W'] > 0][['Y', 'X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [105]:
boolser = df['W'] > 0
result = df[boolser]
boolser

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [111]:
mycols = ['Y', 'X']
result[mycols]
#which is equivalent to df[df['W'] > 0][['Y', 'X']] -- but this takes up less memory due to avoiding the usage of variables

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [110]:
df[df['W'] > 0][['Y', 'X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [112]:
df[(df['W'] > 0) and (df['Y'])] #this yields an error:
#The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

Python's normal `and` operator can't take into account a series of boolean values compared to another series of boolean values.

The `and` operator only works with single boolean values on each side, but not with expressions.

You actually have to use the `&` operator for comparing logical expressions.

In [113]:
df[(df['W'] > 0) & (df['Y'])]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [126]:
df[(df['W'] > 0) | (df['Y'])]

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,UK
B,0.651118,-0.319318,-0.848077,0.605965,HU
C,-2.018168,0.740122,0.528813,-0.589001,IT
D,0.188695,-0.758872,-0.933237,0.955057,BE
E,0.190794,1.978757,2.605967,0.683509,NO


# Indeces

In [123]:
df.reset_index() #also doesn't occur in place unless you specify it so

Unnamed: 0,index,W,X,Y,Z,States
0,A,2.70685,0.628133,0.907969,0.503826,UK
1,B,0.651118,-0.319318,-0.848077,0.605965,HU
2,C,-2.018168,0.740122,0.528813,-0.589001,IT
3,D,0.188695,-0.758872,-0.933237,0.955057,BE
4,E,0.190794,1.978757,2.605967,0.683509,NO


In [117]:
'UK HU IT BE NO'.split()

['UK', 'HU', 'IT', 'BE', 'NO']

In [118]:
newind = 'UK HU IT BE NO'.split()

In [119]:
df['States'] = newind #to add a new _column_

In [120]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,UK
B,0.651118,-0.319318,-0.848077,0.605965,HU
C,-2.018168,0.740122,0.528813,-0.589001,IT
D,0.188695,-0.758872,-0.933237,0.955057,BE
E,0.190794,1.978757,2.605967,0.683509,NO


In [124]:
df.set_index('States') #to set a column and its values as indeces, also doesn't accur in place by default

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
UK,2.70685,0.628133,0.907969,0.503826
HU,0.651118,-0.319318,-0.848077,0.605965
IT,-2.018168,0.740122,0.528813,-0.589001
BE,0.188695,-0.758872,-0.933237,0.955057
NO,0.190794,1.978757,2.605967,0.683509


In [125]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,UK
B,0.651118,-0.319318,-0.848077,0.605965,HU
C,-2.018168,0.740122,0.528813,-0.589001,IT
D,0.188695,-0.758872,-0.933237,0.955057,BE
E,0.190794,1.978757,2.605967,0.683509,NO


# Multi-level indeces

In [128]:
#index levels
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]
hier_index = list(zip(outside, inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [130]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [131]:
df = pd.DataFrame(randn(6, 2), hier_index, ['A', 'B'])

In [132]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [133]:
df.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [134]:
df.loc['G1'].loc[1]

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [136]:
df.index.names = ['Groups', 'Numbers']

In [137]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Numbers,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [141]:
df.loc['G2'].loc[2]['B']

0.07295967531703869

### Cross-section

In [144]:
df.xs('G1')

Unnamed: 0_level_0,A,B
Numbers,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [145]:
df.xs(1, level='Numbers')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502
