# Basic - Indexing, Labelling and Ordering
We'll be using some data from AirBnB for this example:\
https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data

In [1]:
import pandas as pd

# Read the NYC AirBnB listings (the CSV is expected in the working directory)
# and preview the first two rows.
df = pd.read_csv('AB_NYC_2019.csv')
df.head(n=2)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355


## Indexing
The word "index" means different things depending on the context. In Pandas, the index is the column of labels on the left, which identifies each row. By default, the index is generated by counting up from zero. But in this data, we can see that the database index (usually called the primary key), `id`, would also be a good choice.

In [2]:
# Promote the listing id to the row index; set_index returns a new frame,
# so df itself keeps its default 0..n-1 index.
df2 = df.set_index(keys='id')
df2.head(n=3)

Unnamed: 0_level_0,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365


In [4]:
# Look up a single cell by index label (the listing id) and column name.
# .loc makes the label-based access explicit; attribute-style access
# (df2.name) can be shadowed by pandas attributes/methods of the same name.
df2.loc[2539, 'name']

'Clean & quiet apt home by the park'

In [6]:
# Average every numeric column within each room type; the grouping column
# becomes the index of the result.
df3 = (
    df
    .groupby(by='room_type')
    .mean(numeric_only=True)
)
df3

Unnamed: 0_level_0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
room_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Entire home/apt,18438180.0,61755930.0,40.728649,-73.960696,211.794246,8.506907,22.842418,1.306578,10.698335,111.920304
Private room,19468930.0,72475140.0,40.729208,-73.942924,89.780973,5.3779,24.112962,1.445209,3.227717,111.203933
Shared room,23003780.0,102624100.0,40.730514,-73.943343,70.127586,6.475,16.6,1.471726,4.662931,162.000862


In [7]:
# After the groupby, the room_type values are the index labels of df3.
df3.index

Index(['Entire home/apt', 'Private room', 'Shared room'], dtype='object', name='room_type')

In [8]:
# Go back to the default 0..n-1 index; the old index (room_type) is kept
# as a regular column. This returns a new frame and leaves df3 unchanged.
df3.reset_index(drop=False)

Unnamed: 0,room_type,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Entire home/apt,18438180.0,61755930.0,40.728649,-73.960696,211.794246,8.506907,22.842418,1.306578,10.698335,111.920304
1,Private room,19468930.0,72475140.0,40.729208,-73.942924,89.780973,5.3779,24.112962,1.445209,3.227717,111.203933
2,Shared room,23003780.0,102624100.0,40.730514,-73.943343,70.127586,6.475,16.6,1.471726,4.662931,162.000862


In [10]:
# drop=True discards the old index (room_type) instead of keeping it as a column.
df3.reset_index(drop=True)

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,18438180.0,61755930.0,40.728649,-73.960696,211.794246,8.506907,22.842418,1.306578,10.698335,111.920304
1,19468930.0,72475140.0,40.729208,-73.942924,89.780973,5.3779,24.112962,1.445209,3.227717,111.203933
2,23003780.0,102624100.0,40.730514,-73.943343,70.127586,6.475,16.6,1.471726,4.662931,162.000862


## Sorting
After setting an index, I almost always call `sort_index`. If I want the DataFrame sorted by the values in one or more columns instead, I use `sort_values`.

In [13]:
# Sort rows by their index labels (room_type) in descending order.
df3.sort_index(ascending=False)

Unnamed: 0_level_0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
room_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Shared room,23003780.0,102624100.0,40.730514,-73.943343,70.127586,6.475,16.6,1.471726,4.662931,162.000862
Private room,19468930.0,72475140.0,40.729208,-73.942924,89.780973,5.3779,24.112962,1.445209,3.227717,111.203933
Entire home/apt,18438180.0,61755930.0,40.728649,-73.960696,211.794246,8.506907,22.842418,1.306578,10.698335,111.920304


In [17]:
# Sort by borough first, then by host name within each borough
# (both ascending by default).
(
    df
    .sort_values(by=['neighbourhood_group', 'host_name'])
    .head(n=3)
)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
35913,28522394,The Spot,215277711,Aaron,Bronx,Van Nest,40.83988,-73.86978,Entire home/apt,300,1,0,,,1,365
32010,24991133,From home to home,91554527,Aboubakar,Bronx,Highbridge,40.83413,-73.92918,Private room,50,3,23,2019-04-07,1.64,1,188
4226,2772111,It's very warm and friendly.,14176488,Ada Azra,Bronx,Fordham,40.86705,-73.88545,Shared room,55,7,10,2018-10-13,0.16,1,365


In [18]:
# List the distinct values that appear in neighbourhood_group.
df['neighbourhood_group'].unique()

array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
      dtype=object)

In [19]:
# Count how many rows fall into each neighbourhood_group.
df['neighbourhood_group'].value_counts()

neighbourhood_group
Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: count, dtype: int64

In [22]:
# Each sort key can have its own direction: boroughs descending,
# host names ascending within each borough.
(
    df
    .sort_values(
        by=['neighbourhood_group', 'host_name'],
        ascending=[False, True],
    )
    .head(n=3)
)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
4079,2611458,close to Manhattan country setting,13373889,Aaron,Staten Island,Concord,40.60375,-74.08065,Private room,129,1,40,2018-10-14,0.85,2,86
16714,13370393,Charming.,13373889,Aaron,Staten Island,Concord,40.60556,-74.08274,Entire home/apt,150,7,1,2018-11-04,0.12,2,83
24922,19970350,Newly renovated clean and Cozy Private room,15344412,Abe,Staten Island,New Springville,40.58085,-74.15443,Private room,43,10,0,,,3,89


## Rank
Like sorting, but with collision detection: rows that share the same value are given the same rank.

In [25]:
# Most expensive listings first; show only the columns relevant to ranking.
dfp = df.sort_values(by='price', ascending=False)
dfp.loc[:, ['id', 'host_name', 'price']].head(n=3)

Unnamed: 0,id,host_name,price
29238,22436899,Jelena,10000
9151,7003697,Kathrine,10000
17692,13894339,Erin,10000


In [29]:
# Add a price_rank column. ascending=False ranks from the highest price down,
# so a higher price means a lower rank number. Tied prices share one rank, and
# method='max' gives every member of a tie the largest rank in that group --
# hence 3.0 for the three listings priced at 10000.
# The rank is computed from dfp's own price column so the assignment does not
# depend on index alignment with df.
dfp['price_rank'] = dfp['price'].rank(method='max', ascending=False)
dfp[['id', 'host_name', 'price', 'price_rank']].head(5)

Unnamed: 0,id,host_name,price,price_rank
29238,22436899,Jelena,10000,3.0
9151,7003697,Kathrine,10000,3.0
17692,13894339,Erin,10000,3.0
6530,4737930,Olson,9999,6.0
40433,31340283,Matt,9999,6.0


## Recap
- set_index
- reset_index
- sort_values
- sort_index
- unique
- value_counts
- rank