Lecture 16 Index Objects

In [1]:
import numpy as np

from pandas import Series,DataFrame

import pandas as pd

In [2]:
#Let's learn/review about Index Objects
my_ser = Series([1,2,3,4],index=['A','B','C','D'])

#Get the index
my_index = my_ser.index

In [3]:
#Show
my_index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [4]:
#Can grab index ranges
my_index[2:]

Index(['C', 'D'], dtype='object')

In [5]:
#What happens if we try to change an index value?
try:
    my_index[0] = 'Z'
except TypeError:
    print("Indexes are immutable")

Indexes are immutable


Lecture 17 Reindexing

In [6]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn

In [7]:
#Lets create a new series
ser1 = Series([1,2,3,4],index=['A','B','C','D'])

In [8]:
#Show
ser1

A    1
B    2
C    3
D    4
dtype: int64

In [9]:
#Call reindex to rearrange the data to a new index
ser2 = ser1.reindex(['A','B','C','D','E','F'])

In [10]:
#Show
ser2

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
dtype: float64

In [11]:
# We can alos fill in values for new indexes
ser2.reindex(['A','B','C','D','E','F','G'],fill_value=0)

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
G    0.0
dtype: float64

In [12]:
#Using a particular method for filling values
ser3 = Series(['USA','Mexico','Canada'],index=[0,5,10])

#Show
ser3

0        USA
5     Mexico
10    Canada
dtype: object

In [13]:
#Can use a forward fill for interploating values vetween indices 
ser3.reindex(range(15),method='ffill')

0        USA
1        USA
2        USA
3        USA
4        USA
5     Mexico
6     Mexico
7     Mexico
8     Mexico
9     Mexico
10    Canada
11    Canada
12    Canada
13    Canada
14    Canada
dtype: object

In [14]:
#Reindexing rows, columns or both

#Lets make a datafram ewith some random values
dframe = DataFrame(randn(25).reshape((5,5)),index=['A','B','D','E','F'],columns=['col1','col2','col3','col4','col5'])

#Show
dframe

Unnamed: 0,col1,col2,col3,col4,col5
A,-0.344278,0.137461,-0.958799,1.015198,-0.590886
B,-1.058102,-0.925154,-0.177267,0.704394,1.132233
D,-0.314893,1.404737,0.669112,-1.830567,-0.725002
E,-0.172489,-2.131559,0.501908,-0.650544,-0.337863
F,-0.480472,0.926991,-0.535477,-0.936948,-0.446615


In [15]:
#Notice we forgot 'C' , lets reindex it into dframe
dframe2 = dframe.reindex(['A','B','C','D','E','F'])


In [16]:
#Can also explicitly reindex columns
new_columns = ['col1','col2','col3','col4','col5','col6']

dframe2.reindex(columns=new_columns)

Unnamed: 0,col1,col2,col3,col4,col5,col6
A,-0.344278,0.137461,-0.958799,1.015198,-0.590886,
B,-1.058102,-0.925154,-0.177267,0.704394,1.132233,
C,,,,,,
D,-0.314893,1.404737,0.669112,-1.830567,-0.725002,
E,-0.172489,-2.131559,0.501908,-0.650544,-0.337863,
F,-0.480472,0.926991,-0.535477,-0.936948,-0.446615,


In [17]:
#Reindex quickly using the label-indexing with ix (we'll see this more in the future)

#Show original
dframe

Unnamed: 0,col1,col2,col3,col4,col5
A,-0.344278,0.137461,-0.958799,1.015198,-0.590886
B,-1.058102,-0.925154,-0.177267,0.704394,1.132233
D,-0.314893,1.404737,0.669112,-1.830567,-0.725002
E,-0.172489,-2.131559,0.501908,-0.650544,-0.337863
F,-0.480472,0.926991,-0.535477,-0.936948,-0.446615


In [18]:
dframe.ix[['A','B','C','D','E','F'],new_columns]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,col1,col2,col3,col4,col5,col6
A,-0.344278,0.137461,-0.958799,1.015198,-0.590886,
B,-1.058102,-0.925154,-0.177267,0.704394,1.132233,
C,,,,,,
D,-0.314893,1.404737,0.669112,-1.830567,-0.725002,
E,-0.172489,-2.131559,0.501908,-0.650544,-0.337863,
F,-0.480472,0.926991,-0.535477,-0.936948,-0.446615,


Lecture 18 Drop Entry

In [19]:
ser1 = Series(np.arange(3),index=['a','b','c'])

#Show
ser1

a    0
b    1
c    2
dtype: int32

In [20]:
ser1.drop("b")

a    0
c    2
dtype: int32

In [24]:
dframe1=DataFrame(np.arange(9).reshape(3,3),index=["SF","LA","NY"],columns=["population","Size","Year"])

In [26]:
dframe1

Unnamed: 0,population,Size,Year
SF,0,1,2
LA,3,4,5
NY,6,7,8


In [28]:
dframe1.drop("LA")

Unnamed: 0,population,Size,Year
SF,0,1,2
NY,6,7,8


In [29]:
dframe1

Unnamed: 0,population,Size,Year
SF,0,1,2
LA,3,4,5
NY,6,7,8


In [30]:
dframe3=dframe1.drop("LA")

In [31]:
dframe3

Unnamed: 0,population,Size,Year
SF,0,1,2
NY,6,7,8


In [32]:
dframe

Unnamed: 0,col1,col2,col3,col4,col5
A,-0.344278,0.137461,-0.958799,1.015198,-0.590886
B,-1.058102,-0.925154,-0.177267,0.704394,1.132233
D,-0.314893,1.404737,0.669112,-1.830567,-0.725002
E,-0.172489,-2.131559,0.501908,-0.650544,-0.337863
F,-0.480472,0.926991,-0.535477,-0.936948,-0.446615


In [35]:
dframe.drop("col5",axis=1)

Unnamed: 0,col1,col2,col3,col4
A,-0.344278,0.137461,-0.958799,1.015198
B,-1.058102,-0.925154,-0.177267,0.704394
D,-0.314893,1.404737,0.669112,-1.830567
E,-0.172489,-2.131559,0.501908,-0.650544
F,-0.480472,0.926991,-0.535477,-0.936948


Lecture 19 selecting Entries

In [37]:
#Lets try some Series indexing
ser1 = Series(np.arange(3),index=['A','B','C'])

#multiply all values by 2, to avoid confusion in future
ser1 = 2*ser1

#Show
ser1 

A    0
B    2
C    4
dtype: int32

In [39]:
ser1["B"]

2

In [41]:
ser1[2]

4

In [42]:
ser1[0:3]

A    0
B    2
C    4
dtype: int32

In [43]:
ser1[["A","B"]]

A    0
B    2
dtype: int32

In [45]:
ser1[ser1>3]

C    4
dtype: int32

In [46]:
ser1[ser1>3]=10

In [47]:
ser1

A     0
B     2
C    10
dtype: int32

In [48]:

dframe = DataFrame(np.arange(25).reshape((5,5)),index=['NYC','LA','SF','DC','Chi'],columns=['A','B','C','D','E'])

#Show
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [49]:
dframe["B"]

NYC     1
LA      6
SF     11
DC     16
Chi    21
Name: B, dtype: int32

In [51]:
dframe[["C","E"]]

Unnamed: 0,C,E
NYC,2,4
LA,7,9
SF,12,14
DC,17,19
Chi,22,24


In [53]:
dframe[dframe["C"]>8]

Unnamed: 0,A,B,C,D,E
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [55]:
dframe>10

Unnamed: 0,A,B,C,D,E
NYC,False,False,False,False,False
LA,False,False,False,False,False
SF,False,True,True,True,True
DC,True,True,True,True,True
Chi,True,True,True,True,True


In [61]:
dframe.loc["LA"]

A    5
B    6
C    7
D    8
E    9
Name: LA, dtype: int32

Lecture 20 Data Allignment

In [62]:
ser1 = Series([0,1,2],index=['A','B','C'])
ser2 = Series([3,4,5,6],index=['A','B','C','D'])

In [66]:
ser1+ser2

A    3.0
B    5.0
C    7.0
D    NaN
dtype: float64

In [67]:
dframe1 = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),index=['NYC','LA'])

In [68]:
dframe2 = DataFrame(np.arange(9).reshape(3,3),columns=list('ADC'),index=['NYC','SF','LA'])

In [69]:
dframe1+dframe2

Unnamed: 0,A,B,C,D
LA,8.0,,,
NYC,0.0,,,
SF,,,,


In [71]:
dframe1.add(dframe2,fill_value=0)

Unnamed: 0,A,B,C,D
LA,8.0,3.0,8.0,7.0
NYC,0.0,1.0,2.0,1.0
SF,3.0,,5.0,4.0


In [74]:
ser3 = dframe2.ix[0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [76]:
ser3

A    0
D    1
C    2
Name: NYC, dtype: int32

In [77]:
dframe2-ser3

Unnamed: 0,A,D,C
NYC,0,0,0
SF,3,3,3
LA,6,6,6


In [79]:
dframe2

Unnamed: 0,A,D,C
NYC,0,1,2
SF,3,4,5
LA,6,7,8


Lecture 21 Rank and Sort

In [80]:
ser1 = Series(range(3),index=['C','A','B'])

In [81]:
ser1

C    0
A    1
B    2
dtype: int64

In [84]:
ser1.sort_index()

A    1
B    2
C    0
dtype: int64

In [88]:
ser1.sort_values()

C    0
A    1
B    2
dtype: int64

In [89]:
ser2=Series(randn(10))

In [91]:
ser2

0    0.795121
1   -0.959819
2    1.399124
3   -0.184147
4    0.795492
5   -1.621650
6    0.209891
7    2.633049
8   -0.456619
9    0.529136
dtype: float64

In [94]:
ser2.sort_values()

5   -1.621650
1   -0.959819
8   -0.456619
3   -0.184147
6    0.209891
9    0.529136
0    0.795121
4    0.795492
2    1.399124
7    2.633049
dtype: float64

In [96]:
ser2

0    0.795121
1   -0.959819
2    1.399124
3   -0.184147
4    0.795492
5   -1.621650
6    0.209891
7    2.633049
8   -0.456619
9    0.529136
dtype: float64

In [98]:
ser2.rank()

0     7.0
1     2.0
2     9.0
3     4.0
4     8.0
5     1.0
6     5.0
7    10.0
8     3.0
9     6.0
dtype: float64

Lecture 22  Summary Statistics

b'hello world\n'

[]