# Data Indexing and Selection

## Data Selection in Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
# A Series works like a dictionary: every index label maps to one value.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=['a', 'b', 'c', 'd'],
)

data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [3]:
data['b']

0.5

In [4]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [6]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [7]:
# z = zip(['a', 'b'], [1,2])
# list(z)

In [10]:
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [11]:
# Series as a one-dimensional array
# Slicing by explicit index (labels): unlike a plain Python slice, the stop label is part of the result
data['a' : 'c'] # final index is included

a    0.25
b    0.50
c    0.75
dtype: float64

In [12]:
# Slicing by implicit index (integer positions): follows the usual Python half-open convention
data[0:2] # final index is excluded

a    0.25
b    0.50
dtype: float64

In [13]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [15]:
# Masking: each comparison yields a Boolean Series; combining the two with &
# and indexing with the result keeps only the entries strictly between 0.3 and 0.8.
data[data.gt(0.3) & data.lt(0.8)]

b    0.50
c    0.75
dtype: float64

In [16]:
# Fancy indexing: a list of labels selects several entries at once

data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

## Indexers (loc, iloc)

1. **loc** -> this attribute is used to perform indexing and slicing that always refers to the explicit index

2. **iloc** -> this attribute is used to perform indexing and slicing that always refers to the implicit (integer position) index

In [21]:
# Integer labels (1, 3, 5) that differ from the positions (0, 1, 2), so the
# explicit and implicit indexes can be told apart in the cells that follow.
data = pd.Series(data=['a', 'b', 'c'], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [20]:
# Indexing with [] uses the explicit index: 1 is treated as a label here, not as a position
data[1]

'a'

In [22]:
# Slicing with [] uses the implicit index: 1:3 means positions 1 and 2, not labels 1 to 3
data[1:3]

3    b
5    c
dtype: object

In [24]:
#explicit
data.loc[1]

'a'

In [26]:
#explicit
data.loc[1:5]

1    a
3    b
5    c
dtype: object

In [27]:
data.iloc[1]

'b'

In [30]:
data.iloc[1:4]

3    b
5    c
dtype: object

## Data Selection in DataFrame:

In [32]:
# DataFrame as a dictionary: each column is a Series, and the columns share
# one row index (the city names).
population = pd.Series(
    [36787623, 45678234, 55678643, 44567412, 23455412],
    index=['Delhi', 'Mumbai', 'Bangalore', 'Chennai', 'Goa'],
)

area = pd.Series(
    [456782, 459874, 234564, 654345, 887345],
    index=['Delhi', 'Mumbai', 'Bangalore', 'Chennai', 'Goa'],
)

data = pd.DataFrame(dict(area=area, population=population))

data

Unnamed: 0,area,population
Delhi,456782,36787623
Mumbai,459874,45678234
Bangalore,234564,55678643
Chennai,654345,44567412
Goa,887345,23455412


In [33]:
data['area']

Delhi        456782
Mumbai       459874
Bangalore    234564
Chennai      654345
Goa          887345
Name: area, dtype: int64

In [36]:
data.area

Delhi        456782
Mumbai       459874
Bangalore    234564
Chennai      654345
Goa          887345
Name: area, dtype: int64

In [37]:
# Attribute-style and dictionary-style access select the same column.
# Compare the contents with .equals(): whether two accesses hand back the very
# same Python object (`is`) is an internal caching detail of pandas, not a
# guarantee that holds across versions.
data.area.equals(data['area'])

True

In [38]:
data

Unnamed: 0,area,population
Delhi,456782,36787623
Mumbai,459874,45678234
Bangalore,234564,55678643
Chennai,654345,44567412
Goa,887345,23455412


In [39]:
# Add a derived column: element-wise division of two columns, aligned on the row index.
data['density'] = data['population'].div(data['area'])
data

Unnamed: 0,area,population,density
Delhi,456782,36787623,80.536499
Mumbai,459874,45678234,99.327716
Bangalore,234564,55678643,237.370794
Chennai,654345,44567412,68.10996
Goa,887345,23455412,26.43325


In [40]:
# DataFrame as a two-dimensional array
# to_numpy() is the accessor the pandas documentation recommends over the
# older .values attribute; for this all-numeric frame it returns the same array.
data.to_numpy()

array([[4.56782000e+05, 3.67876230e+07, 8.05364988e+01],
       [4.59874000e+05, 4.56782340e+07, 9.93277159e+01],
       [2.34564000e+05, 5.56786430e+07, 2.37370794e+02],
       [6.54345000e+05, 4.45674120e+07, 6.81099603e+01],
       [8.87345000e+05, 2.34554120e+07, 2.64332498e+01]])

In [41]:
#Transpose
data.T

Unnamed: 0,Delhi,Mumbai,Bangalore,Chennai,Goa
area,456782.0,459874.0,234564.0,654345.0,887345.0
population,36787620.0,45678230.0,55678640.0,44567410.0,23455410.0
density,80.5365,99.32772,237.3708,68.10996,26.43325


In [42]:
# A single integer index on the underlying array returns one row
# (to_numpy() is the recommended replacement for .values).
data.to_numpy()[0]

array([4.56782000e+05, 3.67876230e+07, 8.05364988e+01])

In [45]:
data

Unnamed: 0,area,population,density
Delhi,456782,36787623,80.536499
Mumbai,459874,45678234,99.327716
Bangalore,234564,55678643,237.370794
Chennai,654345,44567412,68.10996
Goa,887345,23455412,26.43325


In [44]:
data.iloc[:3, :2]

Unnamed: 0,area,population
Delhi,456782,36787623
Mumbai,459874,45678234
Bangalore,234564,55678643


In [46]:
data.loc[:'Bangalore', :'population']

Unnamed: 0,area,population
Delhi,456782,36787623
Mumbai,459874,45678234
Bangalore,234564,55678643


In [48]:
# Masking: a Boolean Series inside [] keeps the rows where the condition holds

data[data.density > 100]

Unnamed: 0,area,population,density
Bangalore,234564,55678643,237.370794


In [50]:
# Masking combined with fancy indexing: .loc takes a Boolean row mask
# together with a list of column labels

data.loc[data.density > 100, ['area', 'population']]

Unnamed: 0,area,population
Bangalore,234564,55678643


In [51]:
data

Unnamed: 0,area,population,density
Delhi,456782,36787623,80.536499
Mumbai,459874,45678234,99.327716
Bangalore,234564,55678643,237.370794
Chennai,654345,44567412,68.10996
Goa,887345,23455412,26.43325


In [52]:
# Assign through iloc: row position 0, column position 2 (Delhi's density).
# This changes `data` itself, so every later cell sees 90 instead of the computed value.
data.iloc[0,2] = 90
data

Unnamed: 0,area,population,density
Delhi,456782,36787623,90.0
Mumbai,459874,45678234,99.327716
Bangalore,234564,55678643,237.370794
Chennai,654345,44567412,68.10996
Goa,887345,23455412,26.43325


In [53]:
# A label slice inside [] selects rows, and the stop label ('Chennai') is included
data['Delhi': 'Chennai']

Unnamed: 0,area,population,density
Delhi,456782,36787623,90.0
Mumbai,459874,45678234,99.327716
Bangalore,234564,55678643,237.370794
Chennai,654345,44567412,68.10996


In [55]:
# With [] on a DataFrame, a single key refers to a column, while a slice refers to rows;
# 1:3 picks the rows at positions 1 and 2
data[1:3]

Unnamed: 0,area,population,density
Mumbai,459874,45678234,99.327716
Bangalore,234564,55678643,237.370794


## Operating on Data in Pandas

### Ufuncs: Index Preservation

In [56]:
# Index alignment
# Seed NumPy's global random generator first so the integers drawn here
# (and in the cells that follow) are repeatable.
np.random.seed(0)

ser = pd.Series(np.random.randint(low=0, high=10, size=4))
ser

0    5
1    0
2    3
3    3
dtype: int64

In [57]:
# 3x4 frame of random integers in [0, 10), columns labelled A to D
df = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(3, 4)),
    columns=list('ABCD'),
)

df

Unnamed: 0,A,B,C,D
0,7,9,3,5
1,2,4,7,6
2,8,8,1,6


In [58]:
np.exp(ser)

0    148.413159
1      1.000000
2     20.085537
3     20.085537
dtype: float64

In [59]:
np.sin(df)

Unnamed: 0,A,B,C,D
0,0.656987,0.412118,0.14112,-0.958924
1,0.909297,-0.756802,0.656987,-0.279415
2,0.989358,0.989358,0.841471,-0.279415


In [60]:
# Index alignment in Series
# The two Series deliberately cover different cities: 'Delhi' appears only in
# population and 'Chennai' only in area.
population = pd.Series(dict(
    Delhi=36787623,
    Mumbai=45678234,
    Bangalore=55678643,
    Goa=23455412,
))

area = pd.Series(dict(
    Mumbai=459874,
    Bangalore=234564,
    Chennai=654345,
    Goa=887345,
))

In [61]:
population / area

Bangalore    237.370794
Chennai             NaN
Delhi               NaN
Goa           26.433250
Mumbai        99.327716
dtype: float64

In [65]:
# Labels 1 and 2 exist in both Series; 0 exists only in A and 3 only in B.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])

A.add(B)

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [67]:
# Index alignment in DataFrame
# 2x2 frame of random integers in [0, 20) with columns A and B
A = pd.DataFrame(
    np.random.randint(low=0, high=20, size=(2, 2)),
    columns=list('AB'),
)
A

Unnamed: 0,A,B
0,13,8
1,9,19


In [68]:
# 3x3 frame of random integers in [0, 10): one more row and one more column than A
B = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(3, 3)),
    columns=list('ABC'),
)
B

Unnamed: 0,A,B,C
0,0,3,5
1,0,2,3
2,8,1,3


In [69]:
A + B

Unnamed: 0,A,B,C
0,13.0,11.0,
1,9.0,21.0,
2,,,


## Handling Missing Data:

- Python - null, NaN(Not a Number), NA

### Missing Data in Pandas

- In Pandas, we use sentinels(identifiers, indicators) to represent missing data, which are already-existing
    Python null values: <br>
    - the special floating-point **NaN** value
    - Python **None** object
    
- We can't use **None** in any arbitrary NumPy/Pandas array. It can only be used in arrays with data type "object"
  (i.e., array of Python objects)
  
- The dtype = object means the best common type representation NumPy could infer for the contents of the array is that they are Python objects.


In [74]:
# Time the same sum over 100000 elements stored once as Python objects and once as native integers.
# %timeit is an IPython line magic, so this cell only runs inside IPython/Jupyter.
for dtype in ['object', 'int']:
    print('dtype: ', dtype)
    %timeit np.arange(100000, dtype = dtype).sum()
    print()

dtype:  object
5.21 ms ± 65.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

dtype:  int
90.4 µs ± 1.36 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)



In [75]:
# None is a Python object, so an array that holds it needs dtype=object
vals1 = np.array([1, None, 3, 4], dtype=object)

vals1

array([1, None, 3, 4], dtype=object)

In [77]:
# Summing an object array falls back to Python-level addition, and int + None
# is undefined, so the reduction raises TypeError. Catch it and show the
# message so the notebook still runs top to bottom.
try:
    vals1.sum()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [79]:
# NaN: missing numerical data
# np.nan is a floating-point value, so mixing it with integers yields a float array.
vals = np.array([1.0, np.nan, 3.0, 4.0])
vals

array([ 1., nan,  3.,  4.])

In [80]:
vals.dtype

dtype('float64')

In [81]:
1 + np.nan

nan

In [82]:
0 * np.nan

nan

In [83]:
vals.sum(), vals.min(), vals.max()

(nan, nan, nan)

In [85]:
np.nansum(vals), np.nanmin(vals), np.nanmax(vals)

(8.0, 1.0, 4.0)

In [86]:
# NaN and None in pandas: both show up as NaN in the result, which has dtype float64

pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [87]:
# Start from an integer Series so the next cell can show what inserting a missing value does
x = pd.Series([0, 1], dtype=int)
x

0    0
1    1
dtype: int64

In [89]:
# Assigning None to an integer Series: the output shows the value stored as NaN
# and the dtype changed from int64 to float64.
# NOTE(review): newer pandas releases may warn about or refuse this implicit upcast — confirm on the installed version.
x[0] = None
x

0    NaN
1    1.0
dtype: float64

| Datatype | Conversion (Final Array) | NA sentinel value |
|----------|--------------------------|-------------------|
| float    | No change                | `np.nan`          |
| object   | No change                | `None` or `np.nan` |
| integer  | float64                  | `np.nan`          |
| boolean  | object                   | `None` or `np.nan` |

In [90]:
#String data is always stored with an object data type(dtype)