* Represents rectangular table of data and contains an ordered collection of columns. Columns can  have different value types. It has both row and column index. Data is stored as 1 or more 2-D blocks.

In [5]:
from pandas import DataFrame,Series
import numpy as np
import pandas as pd

In [6]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000,2001,2002,2001,2002,2003],
        'pop' : [1.5,1.7,3.6,2.4,2.9,3.2]}

In [47]:
f1 = DataFrame(data)

In [48]:
f1

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [49]:
f1.head() # Select first 5 rows

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [50]:
f1.info() # only prints so we can not assign it to variable.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
state    6 non-null object
year     6 non-null int64
pop      6 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 224.0+ bytes


* We can specify sequence of columns

In [51]:
f2 = DataFrame(data, columns=['year', 'state', 'pop'])

In [52]:
f2

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [53]:
f3 = DataFrame(data, columns = ['year', 'state', 'pop', 'debt'])

In [54]:
f3

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [55]:
f3.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [56]:
f3['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [57]:
f3.state

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

* `frame[column]` will always work.
* `frame.column` only works when the column name is valid python variable name

In [58]:
f5 = DataFrame(data, columns=['year', 'state', 'pop'], index=['one', 'two', 'three', 'four','five', 'six'])

In [59]:
f5

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


--------------

### Index
* We can think as immutable, ordered set.

In [60]:
i1 = pd.Index([2,3,5,7,11])

In [61]:
i1[1]

3

In [62]:
i1.size

5

In [63]:
i1.shape

(5,)

In [64]:
i1.ndim

1

In [65]:
i1.dtype

dtype('int64')

In [66]:
i2 = pd.Index([1,3,5,7,9])

In [67]:
i1

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [68]:
i2

Int64Index([1, 3, 5, 7, 9], dtype='int64')

In [69]:
i1 & i2

Int64Index([3, 5, 7], dtype='int64')

In [70]:
i1 | i2

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

### `loc`
* To retrieve rows by position or name.

In [71]:
f5.loc['three']

year     2002
state    Ohio
pop       3.6
Name: three, dtype: object

In [72]:
f3

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [73]:
f3['debt'] = 16.5

In [74]:
f3

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,16.5
1,2001,Ohio,1.7,16.5
2,2002,Ohio,3.6,16.5
3,2001,Nevada,2.4,16.5
4,2002,Nevada,2.9,16.5
5,2003,Nevada,3.2,16.5


In [75]:
f3['debt'] = np.arange(6.)

In [76]:
f3

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


* Assigning series to column, its label will be realigned exactly to index of DataFrame.
* Assigning list, array to a column, value length must be equal to length of dataframe.

In [77]:
s1 = Series([-1.2,-1.5,-1.7], index=['two', 'four', 'five'])

In [78]:
s1

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [79]:
f5

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


In [80]:
f5['debt'] = s1

In [81]:
f5

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


* Assigning column that does not exist will create new column. `del` will delete column.

In [82]:
del f5['debt']

In [83]:
f5

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


* Column returned via indexing is view on underlying data, not a copy. This any in place modification to the Series will be reflected in DataFrame.
* Column can be explicitly copy using `copy` method of Series.

#### Nested dict

In [84]:
d2 = {'Nevada': {2001 : 2.4, 2002 : 2.9}, 'ohio' : {2000 : 1.5, 2001 : 1.7 , 2002 : 3.6}}

In [85]:
f7 = DataFrame(d2)

In [86]:
f7

Unnamed: 0,Nevada,ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


* Outer dict is column, inner dict keys are row indices.

#### Transpose DataFrame

In [87]:
f7.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
ohio,1.5,1.7,3.6


------------

In [88]:
f5

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


In [89]:
f5.index.name = 'numbers'

In [90]:
f5.columns.name = 'all'

In [91]:
f5

all,year,state,pop
numbers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


In [92]:
f5.values # converting data frame to numpy array

array([[2000, 'Ohio', 1.5],
       [2001, 'Ohio', 1.7],
       [2002, 'Ohio', 3.6],
       [2001, 'Nevada', 2.4],
       [2002, 'Nevada', 2.9],
       [2003, 'Nevada', 3.2]], dtype=object)

![dataframe constructor](images/DataFrame.JPG)

### Index Object
* Responsible for holding the axis labels and other metadata.

In [93]:
s2 = Series(range(3), index=['a', 'b', 'c'])

In [94]:
s2

a    0
b    1
c    2
dtype: int64

In [95]:
i = s2.index

In [96]:
i

Index(['a', 'b', 'c'], dtype='object')

In [97]:
i[1:]

Index(['b', 'c'], dtype='object')

* index object are not mutable

In [98]:
i[2] = 'd' # TypeError

TypeError: Index does not support mutable operations

In [None]:
i2 = pd.Index(np.arange(3))

In [None]:
i2

In [None]:
s3 = Series([1.5, -2.5, 0], index=i2)

In [None]:
s3

In [None]:
s3.index is i2

In [None]:
f3

In [None]:
f3.columns

In [99]:
'debt' in f3.columns

True

![index methods](images/indexMethods.JPG)

### Reindexing
* Create new object with the data conformed to a new index.

In [100]:
s4 = Series([4.5,7.2,-5.3,3.6], index = ['d', 'b', 'a', 'c'])

In [101]:
s4

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [102]:
s5 = s4.reindex(['a', 'b', 'c', 'd'])

In [103]:
s5

a   -5.3
b    7.2
c    3.6
d    4.5
dtype: float64

#### `ffill` Forward fills the values

In [104]:
s6 = pd.Series(['blue', 'purple', 'yellow'], index = [0,2,4])

In [105]:
s6

0      blue
2    purple
4    yellow
dtype: object

In [106]:
s7 = s6.reindex(range(6), method='ffill')

In [107]:
s7

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

* In dataframe `reindex` can alter the row index, columns or both.

In [108]:
f9 = DataFrame(np.arange(9).reshape((3,3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'Califonia'])

In [109]:
f9

Unnamed: 0,Ohio,Texas,Califonia
a,0,1,2
c,3,4,5
d,6,7,8


In [110]:
f9 = f9.reindex(['a', 'b', 'c', 'd'])

In [111]:
f9.reindex(columns=['Texas', 'Utah','Califonia'])

Unnamed: 0,Texas,Utah,Califonia
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [112]:
f9.loc[['a', 'b'], ['Ohio']]

Unnamed: 0,Ohio
a,0.0
b,


![reindex methods](images/reindex.JPG)

In [113]:
s8 = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [114]:
s8

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [115]:
s8.drop('c')

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [116]:
s8.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [117]:
f10 = DataFrame(np.arange(16).reshape((4,4)), index = ['Ohio', 'Colorado', 'Utah', 'New York'], 
                columns=['one', 'two', 'three', 'four'])

In [118]:
f10

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [119]:
f10.drop(['Colorado', 'Utah']) # Drop values from row

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
New York,12,13,14,15


In [120]:
f10.drop('two', axis=1) # Drop value from column

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [121]:
f10.drop(['one', 'three'], axis='columns') # Drop value from column

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


* `drop` can also works in-place.

In [122]:
f10.drop('Utah', inplace = True)

In [123]:
f10

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,12,13,14,15


In [124]:
f10 = DataFrame(np.arange(16).reshape((4,4)), index = ['Ohio', 'Colorado', 'Utah', 'New York'], 
                columns=['one', 'two', 'three', 'four'])

In [125]:
f10

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [126]:
f10['two'] # passing single element selects column

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [127]:
f10[['three', 'two']] # passing list selects columns

Unnamed: 0,three,two
Ohio,2,1
Colorado,6,5
Utah,10,9
New York,14,13


In [128]:
f10[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [129]:
f10[f10['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [130]:
f10 < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [131]:
f10[f10 < 5] = 0

In [132]:
f10

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### `loc` `iloc`
* Instead of `df.loc[:, 'col1']` we can use `df['col1']` to select column. This work for single columns and lists of columns but not for column slices (as it is reserved for rows slices).
* Also same task can be accomplished by `df.col1` (Works for label that does not contain spaces or special characters)

In [259]:
f10.loc['Colorado']

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int32

In [260]:
f10.loc['Colorado', ['two', 'four']]

two     5
four    7
Name: Colorado, dtype: int32

In [261]:
f10.iloc[2,[3,1]]

four    11
two      9
Name: Utah, dtype: int32

In [262]:
f10.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [263]:
f10.iloc[[1,2], [3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [133]:
f10

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [135]:
f10.iloc[:, np.r_[1:3, 0, 3]] # select via combination of slice and stand alone columns.

Unnamed: 0,two,three,one,four
Ohio,0,0,0,0
Colorado,5,6,0,7
Utah,9,10,8,11
New York,13,14,12,15


In [136]:
f10.nunique() # Number of unique values

one      3
two      4
three    4
four     4
dtype: int64

In [137]:
f10.one.quantile(0.75)

9.0

In [265]:
f10.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [266]:
f10.iloc[:, :3][f10.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


![indexing methods](images/indexing.JPG)

In [267]:
f11 = DataFrame(np.arange(9).reshape((3,3)), columns=list('bcd'), 
                index=['Ohio', 'Texas', 'Colorado'])

In [268]:
f12 = DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'), 
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [269]:
f11

Unnamed: 0,b,c,d
Ohio,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [270]:
f12

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [271]:
f11 + f12

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


* Alignment is performed on both the rows and the columns

### Arithmetic methods with fill values
* In differently indexed object, we might want to fill with a special value like 0 when axis label is found on one object but not in other

In [272]:
f11.add(f12, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


In [273]:
fill = f11.stack().mean()

In [274]:
fill

4.0

In [275]:
f11.add(f12, fill_value=fill)

Unnamed: 0,b,c,d,e
Colorado,10.0,11.0,12.0,
Ohio,3.0,5.0,6.0,9.0
Oregon,13.0,,14.0,15.0
Texas,9.0,8.0,12.0,12.0
Utah,4.0,,5.0,6.0


![arithmatic methods](images/arithmatic.jpg)

In [276]:
1 / f11

Unnamed: 0,b,c,d
Ohio,inf,1.0,0.5
Texas,0.333333,0.25,0.2
Colorado,0.166667,0.142857,0.125


In [277]:
f11.rdiv(1) # Method starting with r that has argument flipped

Unnamed: 0,b,c,d
Ohio,inf,1.0,0.5
Texas,0.333333,0.25,0.2
Colorado,0.166667,0.142857,0.125


* we can even specify fill_value in reindexing.

In [278]:
f11.reindex(columns=f12.columns, fill_value = 0)

Unnamed: 0,b,d,e
Ohio,0,2,0
Texas,3,5,0
Colorado,6,8,0


-------------

In [279]:
a1 = np.arange(12.).reshape((3,4))

In [280]:
a1

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [281]:
a1 - a1[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [282]:
f12

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [283]:
s9 = f12.iloc[0]

In [284]:
s9

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [285]:
f12 - s9 
# by default arithmatic between DataFrame ans series matches index of series on dataframe's columns

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [286]:
s10 = Series(range(3), index = ['b', 'e', 'f'])

In [287]:
f12 + s10

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


* Index value not found either in Dataframe's column or series's  index the object will be reindexed to form the union.
* To broadcast over the columns, matching on the rows we should use arithmetic methods

In [288]:
s11 = f12['d']

In [289]:
s11

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [290]:
f12

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [291]:
f12.sub(s11, axis = 'index') # match the dataframe's row index. Also axis = 0 work

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [292]:
f13 = DataFrame(np.random.randn(4,3), columns=list('bde'), 
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [293]:
f13

Unnamed: 0,b,d,e
Utah,1.037847,0.393508,-0.158811
Ohio,-0.460552,1.08878,-1.870152
Texas,-0.19889,1.604937,0.712415
Oregon,-0.622282,-0.305594,0.314257


In [294]:
f13.abs()

Unnamed: 0,b,d,e
Utah,1.037847,0.393508,0.158811
Ohio,0.460552,1.08878,1.870152
Texas,0.19889,1.604937,0.712415
Oregon,0.622282,0.305594,0.314257


In [295]:
fun = lambda x: x.max() - x.min()

In [296]:
f13.apply(fun) # invoked on each column

b    1.660129
d    1.910531
e    2.582567
dtype: float64

In [297]:
f13.apply(fun, axis = 'columns') # invoked once per row

Utah      1.196658
Ohio      2.958932
Texas     1.803827
Oregon    0.936538
dtype: float64

In [298]:
def fun1(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

In [299]:
f13.apply(fun1)

Unnamed: 0,b,d,e
min,-0.622282,-0.305594,-1.870152
max,1.037847,1.604937,0.712415


### `applymap`
* element wise function

In [300]:
f13

Unnamed: 0,b,d,e
Utah,1.037847,0.393508,-0.158811
Ohio,-0.460552,1.08878,-1.870152
Texas,-0.19889,1.604937,0.712415
Oregon,-0.622282,-0.305594,0.314257


In [301]:
f13.applymap(abs)

Unnamed: 0,b,d,e
Utah,1.037847,0.393508,0.158811
Ohio,0.460552,1.08878,1.870152
Texas,0.19889,1.604937,0.712415
Oregon,0.622282,0.305594,0.314257


In [302]:
f13.sort_index()

Unnamed: 0,b,d,e
Ohio,-0.460552,1.08878,-1.870152
Oregon,-0.622282,-0.305594,0.314257
Texas,-0.19889,1.604937,0.712415
Utah,1.037847,0.393508,-0.158811


In [303]:
f13.sort_index(axis=1)

Unnamed: 0,b,d,e
Utah,1.037847,0.393508,-0.158811
Ohio,-0.460552,1.08878,-1.870152
Texas,-0.19889,1.604937,0.712415
Oregon,-0.622282,-0.305594,0.314257


In [304]:
f13.sort_index(axis=1, ascending=False)

Unnamed: 0,e,d,b
Utah,-0.158811,0.393508,1.037847
Ohio,-1.870152,1.08878,-0.460552
Texas,0.712415,1.604937,-0.19889
Oregon,0.314257,-0.305594,-0.622282


* We can use values of one or more columns as the sort keys

In [305]:
f13.sort_values(by = 'b') # we can supply parameter `ascending=False/True`

Unnamed: 0,b,d,e
Oregon,-0.622282,-0.305594,0.314257
Ohio,-0.460552,1.08878,-1.870152
Texas,-0.19889,1.604937,0.712415
Utah,1.037847,0.393508,-0.158811


In [306]:
f13.sort_values(by = ['b', 'e']) # sort by multiple columns

Unnamed: 0,b,d,e
Oregon,-0.622282,-0.305594,0.314257
Ohio,-0.460552,1.08878,-1.870152
Texas,-0.19889,1.604937,0.712415
Utah,1.037847,0.393508,-0.158811


### Ranking
* Assigns ranks from one through the number of valid data points in an array

In [307]:
f14 = DataFrame({'b': [4.3,7,-3,2], 'a':[0,1,0,1]})

In [308]:
f14

Unnamed: 0,b,a
0,4.3,0
1,7.0,1
2,-3.0,0
3,2.0,1


In [309]:
f14.rank()

Unnamed: 0,b,a
0,3.0,1.5
1,4.0,3.5
2,1.0,1.5
3,2.0,3.5


In [310]:
f14.rank(axis='columns')

Unnamed: 0,b,a
0,2.0,1.0
1,2.0,1.0
2,1.0,2.0
3,2.0,1.0


In [311]:
f15 = DataFrame(np.random.randn(4,3), index=['a', 'a', 'b', 'b'])

In [312]:
f15

Unnamed: 0,0,1,2
a,1.661425,-0.364101,-0.051599
a,0.067854,-0.298088,0.667973
b,1.390843,-2.323534,0.90751
b,-1.07872,-1.067751,-0.17413


In [313]:
f15.loc['a']

Unnamed: 0,0,1,2
a,1.661425,-0.364101,-0.051599
a,0.067854,-0.298088,0.667973


In [314]:
f16 = DataFrame([[1.4, np.nan], [7.1,-4.5],[np.nan, np.nan], [0.75,-1.3]],
                index=['a','b','c','d'], columns=['one', 'two'])

In [315]:
f16

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [316]:
f16.sum()

one    9.25
two   -5.80
dtype: float64

In [317]:
f16.sum(axis = 'columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

* NaN values excluded unless the entire row is NaN.

In [318]:
f16.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [319]:
f16.idxmax() # return index where such thing occurs

one    b
two    d
dtype: object

In [320]:
f16.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [321]:
f16.describe() # Produce multiple summary statistics for every column. 
# By defualt produces only for numeric columns, if you need for object columne pass option include = ['O']

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


![statistics methods](images/statistics.jpg)

### Iterate

In [322]:
f17 = DataFrame(np.arange(12).reshape((3,4)), columns=['a', 'b', 'c', 'd'], index=['row1', 'row2', 'row3'])

In [323]:
f17

Unnamed: 0,a,b,c,d
row1,0,1,2,3
row2,4,5,6,7
row3,8,9,10,11


In [324]:
for val in f17: # iterate over name of columns
   print(val) 

a
b
c
d


#### `iterrows()`

In [325]:
for label, row in f17.iterrows():
    print(label)
    print(row)

row1
a    0
b    1
c    2
d    3
Name: row1, dtype: int32
row2
a    4
b    5
c    6
d    7
Name: row2, dtype: int32
row3
a     8
b     9
c    10
d    11
Name: row3, dtype: int32


In [326]:
for label, row in f17.iterrows():
    print(label, row['b'])

row1 1
row2 5
row3 9


### Add column

In [327]:
for label, row in f17.iterrows():
     f17.loc[label, "sum"] = row['a'] + row['b'] + row['c']   

In [328]:
f17

Unnamed: 0,a,b,c,d,sum
row1,0,1,2,3,3.0
row2,4,5,6,7,15.0
row3,8,9,10,11,27.0


### `max()`
* `numeric_only = True`
* Find max value for numeric only columns 

### `Series.str.split(n, expand)`
* `n` maximum number of splits allowed
    - `n=1` = method will make single split at first whitespace
* `expand = True`: expand series of list into dataframe. By default it returns series of list 

###  `Series.str.rsplit(n, expand)`
* Allows to split from end of line.

#### `pct_change()`
* Compare every element with its prior element and compute change in percentage.
* By default it is operate on column

In [329]:
c = pd.Series([1,2,3,4,5,4])
c.pct_change()

0         NaN
1    1.000000
2    0.500000
3    0.333333
4    0.250000
5   -0.200000
dtype: float64

### Correlation and covariance
* Covariance is applied on series data. `cov` is used to compute covariance between series object.
* When we apply covariance method to dataframe, iit will compute cov between all the columns

In [330]:
ss1 = pd.Series(np.random.randn(10))
ss2 = pd.Series(np.random.randn(10))

In [331]:
ss1.cov(ss2)

-0.3402992235546995

In [332]:
dff1 = pd.DataFrame(np.random.randn(10,5), columns=['a','b','c','d','e'])

In [333]:
dff1.cov()

Unnamed: 0,a,b,c,d,e
a,1.9184,-0.789379,0.348296,-0.292793,-0.795197
b,-0.789379,1.454336,-0.320595,0.382102,0.240742
c,0.348296,-0.320595,1.368073,-0.495051,0.233062
d,-0.292793,0.382102,-0.495051,0.963001,-0.044814
e,-0.795197,0.240742,0.233062,-0.044814,0.919205


In [334]:
dff1['a'].cov(dff1['b'])

-0.7893787484305493

* Its symmetric matrix

* Correlation shows the linear relationship between any two array of values. There are multiple method to compute correlation like pearson, spearman and kendall.
* If in one array values change how much it changes in other array.

In [335]:
dff1['a'].corr(dff1['b'])

-0.47258890061044684

In [336]:
dff1.corr()

Unnamed: 0,a,b,c,d,e
a,1.0,-0.472589,0.214993,-0.215416,-0.598824
b,-0.472589,1.0,-0.227285,0.322875,0.208216
c,0.214993,-0.227285,1.0,-0.431303,0.207831
d,-0.215416,0.322875,-0.431303,1.0,-0.047631
e,-0.598824,0.208216,0.207831,-0.047631,1.0


#### `corrwith`
* Takes two dataframe and output correlation value of similar name columns

In [337]:
dff3 = pd.DataFrame(np.random.randn(5,3), columns=['A', 'B', 'C'])
dff4 = pd.DataFrame(np.random.randn(5,3), columns=['A', 'B', 'C'])

In [338]:
dff3.corrwith(dff4)

A   -0.594401
B    0.700632
C   -0.317237
dtype: float64

* Reading strings as csv


In [2]:
from io import StringIO # Allows to read string as regular csv

In [3]:
csv_data = \
'''A,B,C,D
1,2,3,4
5,6,,8
10,11,12,'''

In [4]:
temp_df = pd.read_csv(StringIO(csv_data))

In [5]:
temp_df

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,10,11,12.0,


In [344]:
my_df = pd.read_csv('data/sales.csv')

In [345]:
my_df

Unnamed: 0,month,eggs,salt,spam
0,Jan,47,12.0,17
1,Feb,110,50.0,31
2,Mar,221,89.0,72
3,Apr,77,87.0,20
4,May,132,,52
5,Jun,205,60.0,55


In [346]:
my_df.salt > 60

0    False
1    False
2     True
3     True
4    False
5    False
Name: salt, dtype: bool

In [347]:
my_df[my_df.salt > 60] # filtering data

Unnamed: 0,month,eggs,salt,spam
2,Mar,221,89.0,72
3,Apr,77,87.0,20


In [348]:
my_df['bacon'] = [0,0,50,60,70,80]

In [349]:
my_df

Unnamed: 0,month,eggs,salt,spam,bacon
0,Jan,47,12.0,17,0
1,Feb,110,50.0,31,0
2,Mar,221,89.0,72,50
3,Apr,77,87.0,20,60
4,May,132,,52,70
5,Jun,205,60.0,55,80


In [352]:
my_df.loc[:, my_df.all()] # all column which has all values non-zero (read again non-zero, not non-null)

Unnamed: 0,month,eggs,salt,spam
0,Jan,47,12.0,17
1,Feb,110,50.0,31
2,Mar,221,89.0,72
3,Apr,77,87.0,20
4,May,132,,52
5,Jun,205,60.0,55


In [354]:
my_df.loc[:, my_df.any()] # column with at least single non-zero value

Unnamed: 0,month,eggs,salt,spam,bacon
0,Jan,47,12.0,17,0
1,Feb,110,50.0,31,0
2,Mar,221,89.0,72,50
3,Apr,77,87.0,20,60
4,May,132,,52,70
5,Jun,205,60.0,55,80


In [355]:
my_df.loc[:, my_df.isnull().any()] # any value has to be nan

Unnamed: 0,salt
0,12.0
1,50.0
2,89.0
3,87.0
4,
5,60.0


In [356]:
my_df.loc[:, my_df.notnull().all()]

Unnamed: 0,month,eggs,spam,bacon
0,Jan,47,17,0
1,Feb,110,31,0
2,Mar,221,72,50
3,Apr,77,20,60
4,May,132,52,70
5,Jun,205,55,80
