# Pandas Basics II Walkthrough

In [2]:
import numpy as np
import pandas as pd

### Boolean comparisons

Binary comparison methods such as eq, ne, lt, gt, le and ge.<br>
Behaviour is vectorized<br>
np.nan == np.nan returns False

In [3]:
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=['a','b','c']),
    'two': pd.Series(np.random.randn(4), index=['a','b','c','d']),
    'three': pd.Series(np.random.randn(3), index=['b','c','d'])
})

In [4]:
df2 = df.copy()

In [5]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [6]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


Reductions empty, any(), all(), and bool() provide a way to summarize a boolean result

In [8]:
(df >0).all()

one      False
two      False
three    False
dtype: bool

In [9]:
(df>0).any()

one       True
two       True
three    False
dtype: bool

In [10]:
(df>0).any().any()

True

In [11]:
pd.Series([True]).bool()

True

In [12]:
pd.Series([False]).bool()

False

In [13]:
pd.DataFrame([[True]]).bool()

True

In [14]:
pd.DataFrame([[False]]).bool()

False

### Objects comparison

In [15]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [16]:
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [18]:
(df + df == df * 2).all().all()

False

In [19]:
(df+df == df*2).all()

one      False
two       True
three    False
dtype: bool

In [20]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


B/c np.nan != np.nan

In [21]:
(df + df).equals(df*2)

True

### Descriptive statistics

In [22]:
# Aggregation for each column
df.mean(0)

one      0.859778
two      0.006298
three   -0.228552
dtype: float64

In [23]:
# Aggregation for each index
df.mean(1)

a    1.011445
b   -0.205802
c    0.422317
d   -0.376783
dtype: float64

In [24]:
ts_stand = (df-df.mean())/df.std()
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

### Describe

In [25]:
series = pd.Series(np.random.randn(1000))

In [26]:
series[::2] = np.nan

In [27]:
series.describe()

count    500.000000
mean       0.037538
std        0.982561
min       -2.840320
25%       -0.570099
50%       -0.012676
75%        0.665829
max        2.917202
dtype: float64

In [28]:
frame = pd.DataFrame(np.random.randn(1000,5), columns=['a','b','c','d','e'])

In [29]:
frame.iloc[::2] = np.nan

In [30]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,-0.075493,-0.003721,0.029487,-0.019606,0.047021
std,1.049202,0.991421,0.96607,1.0059,0.960998
min,-3.820209,-2.870594,-3.292138,-2.870851,-2.533345
25%,-0.782613,-0.640324,-0.627556,-0.724994,-0.584944
50%,-0.058789,-0.068484,0.014046,-0.053328,0.005571
75%,0.589815,0.634175,0.708706,0.713283,0.70326
max,2.880524,3.212123,2.852934,2.833185,3.314329


In [31]:
s = pd.Series(['a','a','b','a','a',np.nan,'c','d','a'])

In [32]:
s.describe()

count     8
unique    4
top       a
freq      5
dtype: object

### Index of min/max values

In [33]:
s1 = pd.Series(np.random.randn(5))

In [34]:
s1

0    0.484060
1    0.290634
2   -0.081149
3   -0.590166
4   -1.283004
dtype: float64

In [35]:
s1.idxmin(), s1.idxmax()

(4, 0)

In [36]:
df1 = pd.DataFrame(np.random.randn(5,3), columns=['A','B','C'])

In [37]:
df1

Unnamed: 0,A,B,C
0,1.06405,0.999515,0.210281
1,-0.252243,-0.108103,-0.179053
2,0.885041,-1.897723,0.276348
3,-1.097103,0.281372,0.182141
4,-1.086968,-1.364391,-2.513464


In [38]:
df1.idxmin(axis=0)

A    3
B    2
C    4
dtype: int64

In [39]:
df1.idxmax(axis=1)

0    A
1    B
2    A
3    B
4    A
dtype: object

### Iterations

In [40]:
df = pd.DataFrame({
    'col1': np.random.randn(3),
    'col2': np.random.randn(3)}, index=['a','b','c'])

In [41]:
for col in df:
    print(col)

col1
col2


To iterate over DataFrame rows:<br>
    -  items(): over (key, value) pairs
    -  iterrows(): over rows as (index, series) pairs
    -  itertuples(): over rows as namedtuples (faster than iterrows)

### items

In [43]:
df = pd.DataFrame({
    'a': [1,2,3],
    'b': ['a','b','c']
})

In [44]:
for label, ser in df.items():
    print(label)
    print(ser)

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


### iterrows

In [47]:
for row_index, row in df.iterrows():
    print(row_index, row, sep='\n')

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


### itertuple

In [48]:
for row in df.itertuples():
    print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')
