In [1]:
import numpy as np
import pandas as pd

In [11]:
# A Series has two main attributes: `values` (the data) and `index` (the labels).
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
# .values exposes the underlying data as a NumPy array.
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [6]:
# .index holds the labels; since none were supplied, it is a default RangeIndex.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
# Supply explicit string labels instead of relying on the default integer index.
obj2 = pd.Series([4, 7, -5, 3], index=list('dbac'))
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
# The index now consists of the string labels passed at construction.
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [12]:
# Label-based lookup: returns the scalar stored under label 'd'.
obj2['d']

4

In [13]:
# Boolean-mask filtering keeps only the positive entries; their labels are preserved.
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

#### Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values.

In [18]:
# State -> population mapping used to build the Series examples below.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}

In [19]:
# Building a Series from a dict uses the dict keys as the index labels.
obj3 = pd.Series(sdata)

In [20]:
# Display the Series built from `sdata`.
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [23]:
# When an explicit index is given, only the requested labels are kept:
# 'California' is absent from `sdata` and becomes NaN, while 'Utah' is dropped.
new_states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(data=sdata, index=new_states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [27]:
# Building from an existing Series with a new index aligns on labels,
# exactly as it does when building from the dict.
obj5 = pd.Series(data=obj3, index=new_states)
obj5

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [28]:
# Element-wise missing-value check (method form): True where the value is NaN.
obj5.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [29]:
# Top-level function form of the same check; gives the same result as obj5.isnull().
pd.isnull(obj5)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [34]:
# Arithmetic aligns on index labels: values with matching labels are added,
# and a label missing (or NaN) on either side yields NaN in the result.
obj6 = obj3.add(obj5)
obj6

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [35]:
# Convert the Series to a plain dict: index labels become keys, NaN entries are kept.
x = dict(obj6)
x

{'California': nan,
 'Ohio': 70000.0,
 'Oregon': 32000.0,
 'Texas': 142000.0,
 'Utah': nan}

#### Both the Series object itself and its index have a `name` attribute

In [37]:
# Label the Series itself and its index; both names show up in the repr.
obj3.name = 'Population'
obj3.index.name = 'States'
obj3

States
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
Name: Population, dtype: int64

- A DataFrame represents a rectangular table of data 
- contains an ordered collection of columns
- each column can be a different value type (numeric, string, boolean, etc.)
- DataFrame has both a row and column index
- A DataFrame can be constructed from a dictionary of equal-length lists or NumPy arrays.
- DataFrame has three components - values, columns (column index), index (row index)
- Index objects are immutable, so an individual label cannot be reassigned in place (e.g. `index[2] = 'j'` raises `TypeError`).

In [39]:
# Column-oriented input: each key becomes a column, each list that column's values.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [41]:
# .values returns the data as a 2-D NumPy array; because the columns hold
# strings, ints and floats, the array falls back to dtype=object.
frame.values

array([['Ohio', 2000, 1.5],
       ['Ohio', 2001, 1.7],
       ['Ohio', 2002, 3.6],
       ['Nevada', 2001, 2.4],
       ['Nevada', 2002, 2.9],
       ['Nevada', 2003, 3.2]], dtype=object)

In [42]:
# Column labels (the column index).
frame.columns

Index(['state', 'year', 'pop'], dtype='object')

In [43]:
# Row labels; a default RangeIndex because no index was supplied.
frame.index

RangeIndex(start=0, stop=6, step=1)

In [40]:
# Passing `columns` returns the data with the columns in the requested order.
pd.DataFrame(frame, columns = ['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [48]:
# Same data, but with an explicit column order and string row labels.
frame2 = pd.DataFrame(
    data,
    index=list('abcdef'),
    columns=['year', 'state', 'pop'],
)
frame2

Unnamed: 0,year,state,pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,3.6
d,2001,Nevada,2.4
e,2002,Nevada,2.9
f,2003,Nevada,3.2


In [49]:
# Keep a reference to frame2's row Index so it can be inspected on its own.
ind2 = frame2.index
ind2

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [51]:
# Index objects are immutable: item assignment raises TypeError.
# The error is caught and displayed so the demonstration does not abort a
# Restart & Run All of the notebook.
try:
    ind2[2] = 'j'
except TypeError as err:
    print('TypeError:', err)

TypeError: Index does not support mutable operations

In [61]:
# 4x4 frame of random integers drawn from [0, 15). The draw is unseeded, so the
# values change on every run. `np` is imported in the notebook's top import cell.
frame = pd.DataFrame(np.random.randint(0, 15, size=(4, 4)), columns=list('abcd'))
frame

Unnamed: 0,a,b,c,d
0,3,5,1,5
1,6,13,10,5
2,2,3,2,8
3,2,4,10,1


In [68]:
def func(f):
    """Return the spread (max - min) of the values in a Series."""
    return f.max() - f.min()


# With the default axis=0, func receives each column in turn,
# so the result has one spread per column label.
frame.apply(func)  # same as frame.apply(func, axis=0)

a     4
b    10
c     9
d     7
dtype: int64

In [69]:
# axis=1 passes each row to func instead of each column,
# so the result has one max-min spread per row label.
frame.apply(func, axis = 1)

0    4
1    8
2    6
3    9
dtype: int64

In [71]:
# 4x4 frame of standard-normal draws with string row labels
# (unseeded, so the values differ on every run).
frame = pd.DataFrame(
    np.random.randn(4, 4),
    index=list('pqrs'),
    columns=list('abcd'),
)
frame

Unnamed: 0,a,b,c,d
p,1.053615,1.226579,-0.242647,-0.516314
q,-0.710173,0.178997,-0.907461,0.414521
r,0.630429,0.658163,-0.062238,2.124874
s,0.044364,-0.526079,0.11838,-1.020644


In [75]:
def format1(x):
    """Format a number as a string with two decimal places."""
    return '%.2f' % x


# Apply format1 to every element of the DataFrame. Mapping column by column
# avoids DataFrame.applymap, which is deprecated since pandas 2.1 (renamed to
# DataFrame.map), while still working on older pandas versions.
frame.apply(lambda col: col.map(format1))

Unnamed: 0,a,b,c,d
p,1.05,1.23,-0.24,-0.52
q,-0.71,0.18,-0.91,0.41
r,0.63,0.66,-0.06,2.12
s,0.04,-0.53,0.12,-1.02


In [76]:
# Sorting and ranking: a small frame whose row and column labels are
# deliberately out of alphabetical order.
frame = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['s', 'p', 'q', 'r'],
    columns=['b', 'a', 'c', 'd'],
)
frame

Unnamed: 0,b,a,c,d
s,0,1,2,3
p,4,5,6,7
q,8,9,10,11
r,12,13,14,15


In [77]:
# Sort the rows by index label in ascending order: s, p, q, r -> p, q, r, s.
frame.sort_index() 

Unnamed: 0,b,a,c,d
p,4,5,6,7
q,8,9,10,11
r,12,13,14,15
s,0,1,2,3


In [81]:
# axis=1 sorts the column labels (not the row labels); ascending=False
# reverses the order, so the columns come out as d, c, b, a.
frame.sort_index(axis = 1, ascending = False)

Unnamed: 0,d,c,b,a
s,3,2,0,1
p,7,6,4,5
q,11,10,8,9
r,15,14,12,13


In [83]:
# 4x4 frame of random integers drawn from [0, 15) with string row labels
# (unseeded, so the values differ on every run).
frame = pd.DataFrame(
    np.random.randint(0, 15, size=(4, 4)),
    index=list('pqrs'),
    columns=list('abcd'),
)
frame

Unnamed: 0,a,b,c,d
p,11,9,7,0
q,12,7,4,6
r,13,8,8,6
s,2,12,3,14


In [86]:
# `by` is a required argument of DataFrame.sort_values. Rows are ordered by the
# ascending values of column 'a'; a new frame is returned and `frame` is unchanged.
frame.sort_values(by='a')

Unnamed: 0,a,b,c,d
s,2,12,3,14
p,11,9,7,0
q,12,7,4,6
r,13,8,8,6


In [87]:
# `frame` still has its original row order: sort_values did not modify it in place.
frame

Unnamed: 0,a,b,c,d
p,11,9,7,0
q,12,7,4,6
r,13,8,8,6
s,2,12,3,14


In [89]:
# Default axis=0: values are ranked down the rows, i.e. separately within each
# column. Tied values share the average rank (the two 6s in column d both get 2.5).
frame.rank()

Unnamed: 0,a,b,c,d
p,2.0,3.0,3.0,1.0
q,3.0,1.0,2.0,2.5
r,4.0,2.0,4.0,2.5
s,1.0,4.0,1.0,4.0


In [92]:
# axis=1: values are ranked across the columns, i.e. separately within each row.
frame.rank(axis = 1)

Unnamed: 0,a,b,c,d
p,4.0,3.0,2.0,1.0
q,4.0,3.0,1.0,2.0
r,4.0,2.5,2.5,1.0
s,1.0,3.0,2.0,4.0


In [95]:
# axis='columns' is the string alias for axis=1 and gives the same ranks.
frame.rank(axis = 'columns')

Unnamed: 0,a,b,c,d
p,4.0,3.0,2.0,1.0
q,4.0,3.0,1.0,2.0
r,4.0,2.5,2.5,1.0
s,1.0,3.0,2.0,4.0


In [97]:
# Tied values share the mean of the ranks they occupy (the default
# method='average'): the two 7s both get 6.5 and the two 4s both get 4.5.
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [98]:
# method='first' breaks ties by order of appearance: the two 7s get ranks
# 6.0 and 7.0 instead of sharing 6.5.
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64