# Pandas - indexing

## outlines

* index objects
* reindex
* drop
* selecting entries

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
from numpy import random
import webbrowser

In [2]:
# resource
web = 'https://github.com/jmportilla/Udemy-notes/blob/master/Lec%2017%20-Reindexing.ipynb'
webbrowser.open(web)

True

## index objects

In [3]:
my_ser = Series([1,2,3,4,5,6,7],index = [i for i in 'ABCDEFG'])
print(my_ser.index[::2])

# What happens if we try to change an index value?
my_index[0] = 'Z'

Index(['A', 'C', 'E', 'G'], dtype='object')


NameError: name 'my_index' is not defined

## reindex

In [7]:
# reindex
my_ser1 = my_ser.reindex(list('abcdABCD'))
print(my_ser1)

my_ser1 = my_ser.reindex(list('abcdABCD'),fill_value=0)
print(my_ser1)

a    NaN
b    NaN
c    NaN
d    NaN
A    1.0
B    2.0
C    3.0
D    4.0
dtype: float64
a    0
b    0
c    0
d    0
A    1
B    2
C    3
D    4
dtype: int64


In [9]:
# Using a particular method for filling values
ser3 = Series(['USA','Mexico','Canada'],index=[0,5,10])
print(ser3)

ser3.reindex(range(15),method='ffill')

0        USA
5     Mexico
10    Canada
dtype: object


0        USA
1        USA
2        USA
3        USA
4        USA
5     Mexico
6     Mexico
7     Mexico
8     Mexico
9     Mexico
10    Canada
11    Canada
12    Canada
13    Canada
14    Canada
dtype: object

In [10]:
dframe = DataFrame(random.randn(25).reshape((5,5)),index=['A','B','D','E','F'],columns=['col1','col2','col3','col4','col5'])
print(dframe)

# reindex rows
print(dframe.reindex(['A','B','C','D','E','F']))

# reindex columns
new_col = ['col1','col2','col3','col4','col5','col6']
print(dframe.reindex(columns = new_col))

dframe2 = dframe.reindex(['A','B','C','D','E','F'],columns = new_col)
dframe2

       col1      col2      col3      col4      col5
A -0.861665 -0.402682  1.926392 -1.405678  0.722662
B -2.696452 -0.166941  0.155188 -0.646116  0.078719
D -0.804511  0.101769 -1.213501 -0.871649 -0.044645
E  0.731290 -0.406944  0.401388 -1.004399  0.918269
F -0.670621 -0.336149 -1.185226  0.292012  0.483642
       col1      col2      col3      col4      col5
A -0.861665 -0.402682  1.926392 -1.405678  0.722662
B -2.696452 -0.166941  0.155188 -0.646116  0.078719
C       NaN       NaN       NaN       NaN       NaN
D -0.804511  0.101769 -1.213501 -0.871649 -0.044645
E  0.731290 -0.406944  0.401388 -1.004399  0.918269
F -0.670621 -0.336149 -1.185226  0.292012  0.483642
       col1      col2      col3      col4      col5  col6
A -0.861665 -0.402682  1.926392 -1.405678  0.722662   NaN
B -2.696452 -0.166941  0.155188 -0.646116  0.078719   NaN
D -0.804511  0.101769 -1.213501 -0.871649 -0.044645   NaN
E  0.731290 -0.406944  0.401388 -1.004399  0.918269   NaN
F -0.670621 -0.336149 -1.185226  0

Unnamed: 0,col1,col2,col3,col4,col5,col6
A,-0.861665,-0.402682,1.926392,-1.405678,0.722662,
B,-2.696452,-0.166941,0.155188,-0.646116,0.078719,
C,,,,,,
D,-0.804511,0.101769,-1.213501,-0.871649,-0.044645,
E,0.73129,-0.406944,0.401388,-1.004399,0.918269,
F,-0.670621,-0.336149,-1.185226,0.292012,0.483642,


## drop

In [12]:
ser1 = Series(np.arange(3),index = [i for i in 'abc'])
print(ser1)

# drop series
ser1.drop('a')

a    0
b    1
c    2
dtype: int32


b    1
c    2
dtype: int32

In [17]:
# drop dataframe
dframe = DataFrame(random.randn(9).reshape(3,-1),index = ['SF','LA','NY'],columns=['pop','size','year'])
print(dframe)
print(np.where(dframe>0,dframe,0))

print(dframe.drop(['SF','LA'])) # drop row(s)
print(dframe.drop('size',axis = 1)) # drop column(s)

         pop      size      year
SF  0.727150  0.526433 -0.685447
LA  0.399062 -0.877264 -1.429410
NY -1.243379  0.918087 -0.715180
[[0.72715008 0.52643273 0.        ]
 [0.39906228 0.         0.        ]
 [0.         0.91808662 0.        ]]
         pop      size     year
NY -1.243379  0.918087 -0.71518
         pop      year
SF  0.727150 -0.685447
LA  0.399062 -1.429410
NY -1.243379 -0.715180


## selecting entries

resource : https://github.com/jmportilla/Udemy-notes/blob/master/Lec%2019%20-%20Selecting%20Entries.ipynb

In [20]:
# series
ser1 = Series(np.arange(5),index = list('ABCDE'))
ser1 = ser1**2

print(ser1[1])
print(ser1[2:5])
print(ser1['C'])
print(ser1[['A','C','D']]) # by index(es)
print(ser1[ser1 > 5]) # by condition

1
C     4
D     9
E    16
dtype: int32
4
A    0
C    4
D    9
dtype: int32
D     9
E    16
dtype: int32


In [22]:
# dataframe
dframe = DataFrame(np.arange(25).reshape((5,5)),index=['NYC','LA','SF','DC','Chi'],columns=['A','B','C','D','E'])
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [28]:
# select column(s)
print(dframe['B'])
print(dframe.B)
print(dframe[dframe.columns[1]])
print(dframe[['B','D']])

NYC     1
LA      6
SF     11
DC     16
Chi    21
Name: B, dtype: int32
NYC     1
LA      6
SF     11
DC     16
Chi    21
Name: B, dtype: int32
      B   D
NYC   1   3
LA    6   8
SF   11  13
DC   16  18
Chi  21  23
NYC     1
LA      6
SF     11
DC     16
Chi    21
Name: B, dtype: int32


In [30]:
print(dframe[dframe['B'] > 8])
dframe > 10

      A   B   C   D   E
SF   10  11  12  13  14
DC   15  16  17  18  19
Chi  20  21  22  23  24


Unnamed: 0,A,B,C,D,E
NYC,False,False,False,False,False
LA,False,False,False,False,False
SF,False,True,True,True,True
DC,True,True,True,True,True
Chi,True,True,True,True,True


In [33]:
# select by index 
print(dframe.iloc[3])
print(dframe.loc['LA'])

A    15
B    16
C    17
D    18
E    19
Name: DC, dtype: int32
A    5
B    6
C    7
D    8
E    9
Name: LA, dtype: int32


In [37]:
# select specific value
dframe['B'].iloc[3]

16