# Sorting the data

- For sorting a DataFrame we have 2 approaches.
    1. `sort_index`:- It will sort the data based on the index.
    2. `sort_values`:- It will sort the data based on the given column or list of columns. Sorting by a list of columns is called composite sorting. 


In [1]:
# Create a data frame for orders.

import json 
import pandas as pd 

In [4]:
# Location of the JSON file that describes the columns of every retail_db table.
file_path = 'E:/Projects/Data_Engineering/Data-Engineering/data/retail_db/schemas.json'

# Read the schema inside a context manager so the file handle is closed as soon
# as parsing finishes (a bare open() passed to json.load is never closed explicitly).
with open(file_path) as schema_file:
    schema = json.load(schema_file)

# get column details.

def get_column_name(schema, tableName, sortingKey='column_position'):
    """Return the column names of ``tableName`` ordered by ``sortingKey``.

    ``schema`` is indexed by table name and yields that table's column
    descriptions; each description is indexed by ``'column_name'`` and by
    ``sortingKey``. The schema itself is not modified.
    """
    def sort_field(column):
        # Value each column description is ordered by.
        return column[sortingKey]

    # Work on a copy so the order stored in the schema stays untouched.
    ordered_columns = list(schema[tableName])
    ordered_columns.sort(key=sort_field)

    names = []
    for column in ordered_columns:
        names.append(column['column_name'])
    return names


# Column names of the 'orders' table, ordered by the default sorting key ('column_position').
orders_column_names = get_column_name(schema, 'orders')

In [7]:
# Raw orders data file; its column names are supplied from the schema via `names`.
orders_data_file_path = 'E:/Projects/Data_Engineering/Data-Engineering/data/retail_db/orders/part-00000'

# Passing `names` already makes pandas treat the file as header-less;
# header=None only spells that out explicitly.
orders = pd.read_csv(orders_data_file_path, header=None, names=orders_column_names)

# Last expression of the cell: displays the DataFrame in the notebook.
orders

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [8]:
# Sort the rows by their index labels.
# The result is returned as the cell's value; it is not assigned back to `orders`.

orders.sort_index()

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [9]:
# Sort by the values of a column instead of the index.
# Here the rows are ordered by order_customer_id.
# Sorting is ascending by default; pass ascending=False to reverse it.

orders.sort_values(by='order_customer_id')

Unnamed: 0,order_id,order_date,order_customer_id,order_status
22944,22945,2013-12-13 00:00:00.0,1,COMPLETE
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD
...,...,...,...,...
42914,42915,2014-04-16 00:00:00.0,12434,COMPLETE
13543,13544,2013-10-16 00:00:00.0,12434,PENDING
5302,5303,2013-08-26 00:00:00.0,12434,PENDING
41642,41643,2014-04-08 00:00:00.0,12435,PENDING


In [10]:
# Same sort on order_customer_id, but in descending order.

orders.sort_values(by='order_customer_id', ascending=False)

Unnamed: 0,order_id,order_date,order_customer_id,order_status
41642,41643,2014-04-08 00:00:00.0,12435,PENDING
61628,61629,2013-12-21 00:00:00.0,12435,CANCELED
4798,4799,2013-08-23 00:00:00.0,12434,PENDING_PAYMENT
5302,5303,2013-08-26 00:00:00.0,12434,PENDING
1867,1868,2013-08-03 00:00:00.0,12434,CLOSED
...,...,...,...,...
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD


In [11]:
# To sort on more than one column, pass a list of column names to sort_values.
# The columns are applied in list order: rows are sorted by the first column,
# and rows that tie on it are then ordered by the next column.
# Every column is sorted in ascending order unless we say otherwise.
# A different order per column can be requested by passing a list of
# True/False values as `ascending`.


sorting_column_list = ['order_customer_id', 'order_date']
orders.sort_values(by=sorting_column_list)

Unnamed: 0,order_id,order_date,order_customer_id,order_status
22944,22945,2013-12-13 00:00:00.0,1,COMPLETE
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
...,...,...,...,...
61776,61777,2013-12-26 00:00:00.0,12434,COMPLETE
42914,42915,2014-04-16 00:00:00.0,12434,COMPLETE
51799,51800,2014-06-14 00:00:00.0,12434,ON_HOLD
61628,61629,2013-12-21 00:00:00.0,12435,CANCELED


In [12]:
# Composite sort with a different direction per column: the rows are ordered by
# customer id ascending (True), and within each customer by order date
# descending (False). The flags pair up positionally with sorting_column_list.

orders.sort_values(by=sorting_column_list, ascending=[True, False])

Unnamed: 0,order_id,order_date,order_customer_id,order_status
22944,22945,2013-12-13 00:00:00.0,1,COMPLETE
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD
...,...,...,...,...
5302,5303,2013-08-26 00:00:00.0,12434,PENDING
4798,4799,2013-08-23 00:00:00.0,12434,PENDING_PAYMENT
1867,1868,2013-08-03 00:00:00.0,12434,CLOSED
41642,41643,2014-04-08 00:00:00.0,12435,PENDING
