In [1]:
import numpy as np
import pandas as pd

In [5]:
# Seed NumPy's global RNG first: every random array and frame in this notebook
# is drawn from np.random.*, so without a seed no output can be reproduced on
# Restart & Run All.
np.random.seed(42)

# Draw 20 samples from the standard normal distribution.
np.random.randn(20)

array([-0.39823528, -0.96532169, -0.52570468, -0.17572398,  0.51410546,
       -1.06685703, -1.53865127, -1.19545406, -1.17690611,  0.42128141,
       -0.25001667,  1.70953792, -0.90064069,  0.88329734, -0.79598893,
       -1.15617693, -0.20815681,  0.21660305, -0.26946178,  0.31499662])

### This notebook contains:

1. Reindexing
2. Iteration
3. Sorting

#### Reindex:

In [12]:
N = 20

# Demo frame with N rows: a daily date column, an evenly spaced float column,
# uniform noise, a random category label and a normally distributed value.
df = pd.DataFrame(dict(
    A=pd.date_range(start='2016-01-01', periods=N, freq='D'),
    x=np.linspace(0, N - 1, N),
    y=np.random.rand(N),
    C=np.random.choice(['Low', 'Medium', 'High'], N).tolist(),
    D=np.random.normal(100, 10, size=N).tolist(),
))


In [13]:
# Preview the first 10 rows instead of dumping the whole 20-row frame.
df.head(10)

Unnamed: 0,A,x,y,C,D
0,2016-01-01,0.0,0.498043,High,91.707868
1,2016-01-02,1.0,0.670991,Medium,103.072072
2,2016-01-03,2.0,0.156646,Medium,97.477883
3,2016-01-04,3.0,0.925508,High,86.466103
4,2016-01-05,4.0,0.42661,Medium,97.829052
5,2016-01-06,5.0,0.206221,Low,94.23649
6,2016-01-07,6.0,0.845332,Medium,93.541017
7,2016-01-08,7.0,0.173912,Medium,89.571298
8,2016-01-09,8.0,0.659497,Low,94.143039
9,2016-01-10,9.0,0.771583,Low,107.694251


In [9]:
# Keep only row labels 0, 2 and 5 and the listed columns, in that order.
# 'B' is not a column of df, so reindex() creates it filled with NaN.
df_reindexed = df.reindex(
    index=[0, 2, 5],
    columns=['A', 'C', 'x', 'B'],
)
df_reindexed

Unnamed: 0,A,C,x,B
0,2016-01-01,Medium,0.0,
2,2016-01-03,Low,2.0,
5,2016-01-06,Medium,5.0,


#### Reindex to Align with Other Objects:
- You may wish to reindex an object so that its axes carry the same labels as another object. `reindex_like()` does exactly that, as the following example shows.

In [43]:
# Two standard-normal frames with identical columns but different lengths
# (10 rows and 7 rows), built in that order.
df1, df2 = (
    pd.DataFrame(np.random.randn(n_rows, 3), columns=['col1', 'col2', 'col3'])
    for n_rows in (10, 7)
)

In [44]:
# Plain-text dump of both frames, separated by a blank line.
print(f"{df1}\n\n {df2}")

       col1      col2      col3
0  1.308366 -1.528125 -0.079834
1  1.201075 -1.103043  0.457057
2  0.075997  0.668486 -0.386426
3  0.500540 -0.868716 -0.338443
4  1.199710  2.033317  1.208576
5  1.010220  0.728021 -1.919534
6  0.180958  1.826585  0.629860
7 -1.410900  0.653569  0.714591
8 -0.638579 -0.812091 -2.273840
9  0.250929  0.108451 -0.240414

        col1      col2      col3
0  1.845962  1.854167  0.142560
1 -0.600763  1.785330  0.203487
2  1.367148  1.788548  0.570978
3  1.156453  1.298873  2.003049
4 -1.838076 -0.199573  0.685970
5  1.886359 -0.131227  1.173164
6  0.230933  0.926423  0.429071


In [45]:
# Returns a new frame holding df1's data on df2's row and column labels;
# the result is only displayed, df1 itself is left unchanged.
df1.reindex_like(other=df2)

Unnamed: 0,col1,col2,col3
0,1.308366,-1.528125,-0.079834
1,1.201075,-1.103043,0.457057
2,0.075997,0.668486,-0.386426
3,0.50054,-0.868716,-0.338443
4,1.19971,2.033317,1.208576
5,1.01022,0.728021,-1.919534
6,0.180958,1.826585,0.62986


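- Note − `reindex_like()` returns a new DataFrame that holds df1's data on df2's labels; df1 itself is not modified. Rows 7–9 of df1 are dropped from the result because df2 has no such labels.
- The column names must match: any column label that exists only in the other object appears in the result as a column filled entirely with NaN.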

#### Filling while ReIndexing:

reindex() takes an optional `method` parameter that selects how newly introduced (missing) labels are filled. The accepted values are −

- pad/ffill − Fill values forward
- bfill/backfill − Fill values backward
- nearest − Fill from the nearest index values

In [50]:
# A 6-row and a 2-row standard-normal frame sharing the same column labels.
df1 = pd.DataFrame(
    data=np.random.randn(6, 3),
    columns=['col1', 'col2', 'col3'],
)
df2 = pd.DataFrame(
    data=np.random.randn(2, 3),
    columns=['col1', 'col2', 'col3'],
)

# Plain-text dump of both frames with a blank line in between.
print(f"{df1} \n\n{df2}")

       col1      col2      col3
0  0.833018 -2.254979  0.607336
1 -1.348292 -0.118565  0.189711
2 -0.303905  0.704835 -0.215613
3  0.538122 -1.505112 -0.171509
4 -0.170281  0.696513  1.303210
5  0.549031  0.102140  1.177211 

       col1      col2      col3
0  0.352769  0.234358  0.001132
1 -0.672503 -0.151957  0.660089


In [51]:
# No fill method: row labels 2-5 exist only in df1, so they come back as NaN.
df2.reindex_like(other=df1)

Unnamed: 0,col1,col2,col3
0,0.352769,0.234358,0.001132
1,-0.672503,-0.151957,0.660089
2,,,
3,,,
4,,,
5,,,


In [54]:
# Forward-fill while reindexing: the row labels missing from df2 (2-5) take
# the values of the last preceding df2 row, so the last 4 rows are padded.
# reindex() is called directly because the `method` argument of
# reindex_like() is deprecated in recent pandas; reindexing to df1's index
# and columns is equivalent to reindex_like(df1).
df2.reindex(index=df1.index, columns=df1.columns, method='ffill')

Unnamed: 0,col1,col2,col3
0,0.352769,0.234358,0.001132
1,-0.672503,-0.151957,0.660089
2,-0.672503,-0.151957,0.660089
3,-0.672503,-0.151957,0.660089
4,-0.672503,-0.151957,0.660089
5,-0.672503,-0.151957,0.660089


#### Limits on Filling while Reindexing:
- The `limit` argument provides additional control over filling while reindexing: it sets the maximum number of consecutive missing labels that may be filled, and any further gaps stay NaN. The following example illustrates this −

In [62]:
# Plain-text dump of both frames with a blank line in between.
print(df1, df2, sep=' \n\n')

       col1      col2      col3
0  0.833018 -2.254979  0.607336
1 -1.348292 -0.118565  0.189711
2 -0.303905  0.704835 -0.215613
3  0.538122 -1.505112 -0.171509
4 -0.170281  0.696513  1.303210
5  0.549031  0.102140  1.177211 

       col1      col2      col3
0  0.352769  0.234358  0.001132
1 -0.672503 -0.151957  0.660089


In [63]:
# Baseline without filling: row labels 2-5 are absent from df2 and become NaN.
df2.reindex_like(other=df1)

Unnamed: 0,col1,col2,col3
0,0.352769,0.234358,0.001132
1,-0.672503,-0.151957,0.660089
2,,,
3,,,
4,,,
5,,,


In [67]:
# Forward-fill with a cap: limit=2 allows at most two consecutive missing
# labels to be filled, so only the 3rd and 4th rows are filled and the
# 5th and 6th stay NaN.
# reindex() is called directly because the `method` argument of
# reindex_like() is deprecated in recent pandas; reindexing to df1's index
# and columns is equivalent to reindex_like(df1).
df2.reindex(index=df1.index, columns=df1.columns, method='ffill', limit=2)

Unnamed: 0,col1,col2,col3
0,0.352769,0.234358,0.001132
1,-0.672503,-0.151957,0.660089
2,-0.672503,-0.151957,0.660089
3,-0.672503,-0.151957,0.660089
4,,,
5,,,


#### Renaming:
- The rename() method allows you to relabel an axis based on some mapping (a dict or Series) or an arbitrary function.

In [68]:
# 4x3 frame of uniform random numbers with one column per player name.
df1 = pd.DataFrame(
    data=np.random.rand(4, 3),
    columns=['Lebron', 'Kobe', 'Jordan'],
)
df1

Unnamed: 0,Lebron,Kobe,Jordan
0,0.79334,0.85082,0.394686
1,0.410081,0.433451,0.077502
2,0.345829,0.179031,0.31307
3,0.144579,0.612777,0.353816


In [79]:
# Relabel rows 0 and 3; labels not present in the mapping (1, 2) are kept.
df1.rename({0: 'scoring', 3: 'assists'}, axis='index')

Unnamed: 0,Lebron,Kobe,Jordan
scoring,0.79334,0.85082,0.394686
1,0.410081,0.433451,0.077502
2,0.345829,0.179031,0.31307
assists,0.144579,0.612777,0.353816


In [82]:
# Relabel selected rows and all three columns in a single call.
df1.rename(
    index={0: 'scoring', 3: 'assists'},
    columns=dict(Lebron='shaq', Kobe='tim', Jordan='dirk'),
)

Unnamed: 0,shaq,tim,dirk
scoring,0.79334,0.85082,0.394686
1,0.410081,0.433451,0.077502
2,0.345829,0.179031,0.31307
assists,0.144579,0.612777,0.353816


#### Iterations:

Basic iteration (for i in object) produces −

- Series − values
- DataFrame − column names

In [96]:
# Ten uniform random numbers on the default 0..9 integer index.
series = pd.Series(data=np.random.rand(10))
print(f"{series}")

0    0.784595
1    0.982448
2    0.216233
3    0.940872
4    0.028058
5    0.445189
6    0.356673
7    0.825339
8    0.043268
9    0.626996
dtype: float64


In [97]:
# Plain iteration over a Series yields its values (not the index labels);
# the label half of each (label, value) pair is discarded here.
for _, value in series.items():
    print(value)

0.7845949756302487
0.9824481005421983
0.21623289754170505
0.9408723391229071
0.028057650159049374
0.445188920576401
0.35667338072328303
0.8253393939628432
0.04326785565080071
0.6269958068534035


In [85]:
N = 20

# Rebuild the demo frame: daily dates, an evenly spaced float column, uniform
# noise, a random category label and a normally distributed value.
df = pd.DataFrame({
    'A': pd.date_range('2016-01-01', periods=N, freq='D'),
    'x': np.linspace(start=0, stop=N - 1, num=N),
    'y': np.random.rand(N),
    'C': np.random.choice(['Low', 'Medium', 'High'], size=N).tolist(),
    'D': np.random.normal(loc=100, scale=10, size=N).tolist(),
})

# Show only the first five rows.
df.head(n=5)

Unnamed: 0,A,x,y,C,D
0,2016-01-01,0.0,0.441011,High,126.876135
1,2016-01-02,1.0,0.891528,High,88.974203
2,2016-01-03,2.0,0.410852,High,101.710789
3,2016-01-04,3.0,0.363477,High,95.596814
4,2016-01-05,4.0,0.743511,High,83.073901


In [87]:
# Plain iteration over a DataFrame yields its column labels, one per pass.
for column_name in df.columns:
    print(column_name)

A
x
y
C
D


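To iterate over the columns or rows of a DataFrame, we can use the following functions −

- iteritems() − iterate over the columns as (column label, Series) pairs; this method is called items() in current pandas (iteritems() was deprecated in 1.5 and removed in 2.0)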
- iterrows() − iterate over the rows as (index,series) pairs
- itertuples() − iterate over the rows as namedtuples

#### iteritems():
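- Iterates over the columns, yielding one (key, value) pair per column: the column label is the key and the column's values, as a Series object, are the value. Note: `iteritems()` was removed in pandas 2.0; `items()` is the equivalent method.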

In [88]:
# Small 4x3 standard-normal frame used by the iteration examples.
df = pd.DataFrame(
    data=np.random.randn(4, 3),
    columns=['col1', 'col2', 'col3'],
)
df

Unnamed: 0,col1,col2,col3
0,1.215496,1.668803,-0.179922
1,0.696261,-0.967084,-0.150198
2,-0.569769,-0.848942,-0.000369
3,-1.472419,0.538067,0.429168


In [93]:
# Iterate over the columns as (column label, Series) pairs.
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
# items() is the equivalent method and yields the same pairs.
for column_label, column in df.items():
    print(column_label, '\n', column)

col1 
 0    1.215496
1    0.696261
2   -0.569769
3   -1.472419
Name: col1, dtype: float64
col2 
 0    1.668803
1   -0.967084
2   -0.848942
3    0.538067
Name: col2, dtype: float64
col3 
 0   -0.179922
1   -0.150198
2   -0.000369
3    0.429168
Name: col3, dtype: float64


#### iterrows():
- iterrows() returns the iterator yielding each index value along with a series containing the data in each row.

In [100]:
# Display the frame again for reference.
df

Unnamed: 0,col1,col2,col3
0,1.215496,1.668803,-0.179922
1,0.696261,-0.967084,-0.150198
2,-0.569769,-0.848942,-0.000369
3,-1.472419,0.538067,0.429168


In [99]:
# iterrows() yields (row label, row) pairs, each row being a Series keyed by
# the column names.
for row_label, row in df.iterrows():
    print(row_label, row, sep=' \n ')

0 
 col1    1.215496
col2    1.668803
col3   -0.179922
Name: 0, dtype: float64
1 
 col1    0.696261
col2   -0.967084
col3   -0.150198
Name: 1, dtype: float64
2 
 col1   -0.569769
col2   -0.848942
col3   -0.000369
Name: 2, dtype: float64
3 
 col1   -1.472419
col2    0.538067
col3    0.429168
Name: 3, dtype: float64


#### itertuples():
- itertuples() returns an iterator that yields one namedtuple per row. The first element of each tuple is the row's index value, and the remaining elements are the row's values.

In [101]:
# Display the frame again for reference.
df

Unnamed: 0,col1,col2,col3
0,1.215496,1.668803,-0.179922
1,0.696261,-0.967084,-0.150198
2,-0.569769,-0.848942,-0.000369
3,-1.472419,0.538067,0.429168


In [102]:
# One namedtuple per row; the defaults are spelled out: the index is included
# as the first field and the tuple type is named 'Pandas'.
for record in df.itertuples(index=True, name='Pandas'):
    print(record)

Pandas(Index=0, col1=1.2154958017781943, col2=1.6688027059098096, col3=-0.17992170590072346)
Pandas(Index=1, col1=0.6962613405789798, col2=-0.9670843780001301, col3=-0.15019813687516287)
Pandas(Index=2, col1=-0.5697691078786499, col2=-0.8489420845452744, col3=-0.00036874642022073005)
Pandas(Index=3, col1=-1.4724189431223151, col2=0.5380671768262403, col3=0.4291678954192638)


#### Sorting

In [106]:
# Frame whose row labels and column labels are both deliberately out of order.
unsorted = pd.DataFrame(
    data=np.random.rand(10, 2),
    index=[1, 4, 2, 8, 0, 7, 3, 9, 5, 6],
    columns=['col2', 'col1'],
)
unsorted

Unnamed: 0,col2,col1
1,0.43876,0.151048
4,0.11989,0.873191
2,0.239297,0.525
8,0.590302,0.636905
0,0.690583,0.561937
7,0.475185,0.55975
3,0.149218,0.352278
9,0.576309,0.974716
5,0.617092,0.550997
6,0.762977,0.955398


#### Sorting by Label:
- The sort_index() method sorts a DataFrame by its labels: the `axis` argument selects row labels or column labels, and `ascending` sets the sort order.
- By default, sorting is done on row labels in ascending order.

In [113]:
# Sort by row label, smallest first (these are the defaults, spelled out).
unsorted.sort_index(axis=0, ascending=True)

Unnamed: 0,col2,col1
0,0.690583,0.561937
1,0.43876,0.151048
2,0.239297,0.525
3,0.149218,0.352278
4,0.11989,0.873191
5,0.617092,0.550997
6,0.762977,0.955398
7,0.475185,0.55975
8,0.590302,0.636905
9,0.576309,0.974716


In [119]:
# Sort by column label instead of row label: col1 now comes before col2.
unsorted.sort_index(axis='columns')

Unnamed: 0,col1,col2
1,0.151048,0.43876
4,0.873191,0.11989
2,0.525,0.239297
8,0.636905,0.590302
0,0.561937,0.690583
7,0.55975,0.475185
3,0.352278,0.149218
9,0.974716,0.576309
5,0.550997,0.617092
6,0.955398,0.762977


In [118]:
# Sort by row label, largest first.
unsorted.sort_index(axis='index', ascending=False)

Unnamed: 0,col2,col1
9,0.576309,0.974716
8,0.590302,0.636905
7,0.475185,0.55975
6,0.762977,0.955398
5,0.617092,0.550997
4,0.11989,0.873191
3,0.149218,0.352278
2,0.239297,0.525
1,0.43876,0.151048
0,0.690583,0.561937


#### Sorting by Value:
- sort_values() is the method for sorting by values. 
- It accepts a 'by' argument: the name (or list of names) of the column(s) whose values determine the sort order.

In [6]:
# Small integer frame with repeated values in col1, so that sorting on a
# second column is needed to break ties.
unsorted_df = pd.DataFrame(dict(
    col1=[9, 7, 1, 4, 8, 2, 1, 0, 2],
    col2=[7, 3, 5, 8, 1, 5, 9, 6, 4],
))
unsorted_df

Unnamed: 0,col1,col2
0,9,7
1,7,3
2,1,5
3,4,8
4,8,1
5,2,5
6,1,9
7,0,6
8,2,4


In [8]:
# Order the rows by col1, largest value first.
unsorted_df.sort_values('col1', ascending=False)

Unnamed: 0,col1,col2
0,9,7
4,8,1
1,7,3
3,4,8
5,2,5
8,2,4
2,1,5
6,1,9
7,0,6


In [128]:
# Order the rows by col1 and break ties with col2, both ascending.
unsorted_df.sort_values(['col1', 'col2'], ascending=True)

Unnamed: 0,col1,col2
7,0,6
2,1,5
6,1,9
8,2,4
5,2,5
3,4,8
1,7,3
4,8,1
0,9,7
