# ================ Advanced Indexing ================

In [2]:
import pandas as pd
import numpy as np

## ----------- PIVOT -----------
> Return reshaped DataFrame organized by given index / column values

> A pivot table is used to summarize and aggregate the data inside a DataFrame

> DataFrame.pivot(*, columns, index=<no_default>, values=<no_default>)

In [6]:
# Long-format weather readings: one row per (date, city) observation.
df1 = pd.DataFrame(
    data=[
        ("5/1/2017", "new york", 65, 56),
        ("5/2/2017", "new york", 66, 58),
        ("5/3/2017", "new york", 68, 60),
        ("5/1/2017", "mumbai", 75, 80),
        ("5/2/2017", "mumbai", 78, 83),
        ("5/3/2017", "mumbai", 82, 85),
        ("5/1/2017", "beijing", 80, 26),
        ("5/2/2017", "beijing", 77, 30),
        ("5/3/2017", "beijing", 79, 35),
    ],
    columns=["date", "city", "temperature", "humidity"],
)
df1

Unnamed: 0,date,city,temperature,humidity
0,5/1/2017,new york,65,56
1,5/2/2017,new york,66,58
2,5/3/2017,new york,68,60
3,5/1/2017,mumbai,75,80
4,5/2/2017,mumbai,78,83
5,5/3/2017,mumbai,82,85
6,5/1/2017,beijing,80,26
7,5/2/2017,beijing,77,30
8,5/3/2017,beijing,79,35


In [4]:
# df1.set_index('date')

In [76]:
# df1.reset_index(inplace=True)
# df1

In [7]:
# Reshape long -> wide: dates become the rows and cities the columns.
# No `values` is given, so both temperature and humidity are spread per city.
df1.pivot(columns='city', index='date')

Unnamed: 0_level_0,temperature,temperature,temperature,humidity,humidity,humidity
city,beijing,mumbai,new york,beijing,mumbai,new york
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
5/1/2017,80,75,65,26,80,56
5/2/2017,77,78,66,30,83,58
5/3/2017,79,82,68,35,85,60


In [8]:
# Same layout as pivot(), but built through pivot_table, which aggregates the cells.
pd.pivot_table(df1, index='date', columns='city')

Unnamed: 0_level_0,humidity,humidity,humidity,temperature,temperature,temperature
city,beijing,mumbai,new york,beijing,mumbai,new york
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
5/1/2017,26.0,80.0,56.0,80.0,75.0,65.0
5/2/2017,30.0,83.0,58.0,77.0,78.0,66.0
5/3/2017,35.0,85.0,60.0,79.0,82.0,68.0


In [9]:
# Restrict the pivot table to a single measurement column.
df1.pivot_table(values='temperature', index='date', columns='city')

city,beijing,mumbai,new york
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5/1/2017,80.0,75.0,65.0
5/2/2017,77.0,78.0,66.0
5/3/2017,79.0,82.0,68.0


In [10]:
df1.pivot_table(index='city', columns='date')
# default: aggfunc='mean'

Unnamed: 0_level_0,humidity,humidity,humidity,temperature,temperature,temperature
date,5/1/2017,5/2/2017,5/3/2017,5/1/2017,5/2/2017,5/3/2017
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
beijing,26.0,30.0,35.0,80.0,77.0,79.0
mumbai,80.0,83.0,85.0,75.0,78.0,82.0
new york,56.0,58.0,60.0,65.0,66.0,68.0


##### aggfunc='sum'

In [11]:
# Aggregate each (city, date) cell by summing instead of averaging.
df1.pivot_table(aggfunc='sum', index='city', columns='date')

Unnamed: 0_level_0,humidity,humidity,humidity,temperature,temperature,temperature
date,5/1/2017,5/2/2017,5/3/2017,5/1/2017,5/2/2017,5/3/2017
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
beijing,26,30,35,80,77,79
mumbai,80,83,85,75,78,82
new york,56,58,60,65,66,68


##### margins=True

In [12]:
# margins=True appends an 'All' row and an 'All' column holding the overall aggregates.
df1.pivot_table(margins=True, index='city', columns='date')

Unnamed: 0_level_0,humidity,humidity,humidity,humidity,temperature,temperature,temperature,temperature
date,5/1/2017,5/2/2017,5/3/2017,All,5/1/2017,5/2/2017,5/3/2017,All
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
beijing,26.0,30.0,35.0,30.333333,80.0,77.0,79.0,78.666667
mumbai,80.0,83.0,85.0,82.666667,75.0,78.0,82.0,78.333333
new york,56.0,58.0,60.0,58.0,65.0,66.0,68.0,66.333333
All,54.0,57.0,60.0,57.0,73.333333,73.666667,76.333333,74.444444


##### Grouper

In [13]:
# Parse the date strings into datetimes so they can be grouped by calendar period.
# The strings are month/day/year ("5/1/2017" is May 1st); the explicit format removes
# the month-first vs. day-first ambiguity instead of relying on inference.
df1['date'] = pd.to_datetime(df1['date'], format="%m/%d/%Y")

In [14]:
# Group the datetime 'date' column into month-end buckets, then pivot per city.
df1.pivot_table(
    index=pd.Grouper(key="date", freq="ME"),
    columns='city',
)

Unnamed: 0_level_0,humidity,humidity,humidity,temperature,temperature,temperature
city,beijing,mumbai,new york,beijing,mumbai,new york
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2017-05-31,30.333333,82.666667,58.0,78.666667,78.333333,66.333333


## ----------- MELT -----------
> Unpivot a DataFrame from wide to long format, optionally leaving identifiers set

> This function is useful to massage a DataFrame into a format where one or more columns are identifier variables (id_vars), while all other columns, considered measured variables (value_vars), are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'

> DataFrame.melt(id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None, ignore_index=True)

In [21]:
# Wide-format temperatures: one row per day, one column per city.
df2 = pd.read_csv(filepath_or_buffer='weather.csv')
df2

Unnamed: 0,day,chicago,chennai,berlin
0,Monday,32,75,41
1,Tuesday,30,77,43
2,Wednesday,28,75,45
3,Thursday,22,82,38
4,Friday,30,83,30
5,Saturday,20,81,45
6,Sunday,25,77,47


In [None]:
# id_vars: the column(s) to keep as-is; every other column is unpivoted into
# 'variable' / 'value' pairs.
df3 = df2.melt(id_vars=['day'])
df3

Unnamed: 0,day,variable,value
0,Monday,chicago,32
1,Tuesday,chicago,30
2,Wednesday,chicago,28
3,Thursday,chicago,22
4,Friday,chicago,30
5,Saturday,chicago,20
6,Sunday,chicago,25
7,Monday,chennai,75
8,Tuesday,chennai,77
9,Wednesday,chennai,75


In [17]:
# Keep only the melted rows that came from the 'chicago' column.
df3.loc[df3['variable'] == 'chicago']

Unnamed: 0,day,variable,value
0,Monday,chicago,32
1,Tuesday,chicago,30
2,Wednesday,chicago,28
3,Thursday,chicago,22
4,Friday,chicago,30
5,Saturday,chicago,20
6,Sunday,chicago,25


In [19]:
# Same melt, but with meaningful names for the generated columns.
# Note: this rebinds df3, replacing the 'variable'/'value' version built earlier.
df3 = df2.melt(
    id_vars=['day'],
    var_name='city',
    value_name='temperature',
)
df3

Unnamed: 0,day,city,temperature
0,Monday,chicago,32
1,Tuesday,chicago,30
2,Wednesday,chicago,28
3,Thursday,chicago,22
4,Friday,chicago,30
5,Saturday,chicago,20
6,Sunday,chicago,25
7,Monday,chennai,75
8,Tuesday,chennai,77
9,Wednesday,chennai,75


## ----------- MULTI-INDEX -----------
> A MultiIndex (hierarchical index) allows you to have multiple levels of indexing on rows or columns.

> Think of it like nested dimensions

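> pandas.MultiIndex.from_tuples(tuples, sortorder=None, names=None)

> A MultiIndex can also be produced by passing `keys` to pandas.concat(objs, *, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=None)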

In [82]:
# (country, year) pairs; with no column names given they land in columns 0 and 1.
data = [
    (country, year)
    for country in ('USA', 'India')
    for year in (2023, 2024)
]
df7 = pd.DataFrame(data)
df7

Unnamed: 0,0,1
0,USA,2023
1,USA,2024
2,India,2023
3,India,2024


In [83]:
# Build two-level column labels: each tuple is the (outer, inner) label of one column.
# NOTE(review): both tuples are identical, so the two columns end up sharing the same
# ("Country", "Year") label - probably not what was intended; confirm the desired labels.
index = pd.MultiIndex.from_tuples(
    [("Country", "Year"),
    ("Country", "Year")]
    )
df7 = pd.DataFrame(data, columns=index)
df7

Unnamed: 0_level_0,Country,Country
Unnamed: 0_level_1,Year,Year.1
0,USA,2023
1,USA,2024
2,India,2023
3,India,2024


In [84]:
# Row MultiIndex built from the (country, year) tuples, with named levels.
index = pd.MultiIndex.from_tuples(data, names=["Country", "Year"])
print(index)
# Keep the sales figures under their own name. Rebinding `data` here would make this
# cell fail when run a second time (from_tuples would receive plain integers) and would
# break re-running the earlier cells that expect `data` to hold the tuples.
sales = [100, 120, 80, 90]
df8 = pd.DataFrame({"Sales": sales}, index=index)

df8

MultiIndex([(  'USA', 2023),
            (  'USA', 2024),
            ('India', 2023),
            ('India', 2024)],
           names=['Country', 'Year'])


Unnamed: 0_level_0,Unnamed: 1_level_0,Sales
Country,Year,Unnamed: 2_level_1
USA,2023,100
USA,2024,120
India,2023,80
India,2024,90


In [85]:
# Select by the outer level only: every year recorded for the USA.
df8.loc['USA']

Unnamed: 0_level_0,Sales
Year,Unnamed: 1_level_1
2023,100
2024,120


In [None]:
# A full (Country, Year) key pins down exactly one row, returned as a Series.
df8.loc[('India', 2024)]

Sales    90
Name: (India, 2024), dtype: int64

In [None]:
# Cross-section on the inner level: every country's row for 2023.
df8.xs(key=2023, level='Year')

Unnamed: 0_level_0,Sales
Country,Unnamed: 1_level_1
USA,100
India,80


## ----------- STACK -----------
> Return a reshaped DataFrame or Series having a multi-level index with one or more new inner-most levels compared to the current DataFrame

> DataFrame.stack(level=-1, dropna=<no_default>, sort=<no_default>, future_stack=False)

> level = -1 ==> inner-most level

In [23]:
# The first two spreadsheet rows form a two-level column header (metric, company).
# NOTE(review): the date column is read as ordinary data (it shows up under
# 'Unnamed: 0_level_0' / 'Company'), which is why stack() below emits an extra
# 'Company' row per date - consider index_col=0; confirm against the workbook layout.
df4 = pd.read_excel(io='stocks.xlsx', header=[0, 1])
df4

Unnamed: 0_level_0,Unnamed: 0_level_0,Price,Price,Price,Price to earnings ratio (P/E),Price to earnings ratio (P/E),Price to earnings ratio (P/E)
Unnamed: 0_level_1,Company,Facebook,Google,Microsoft,Facebook,Google,Microsoft
0,2017-06-05,155,955,66,37.1,32.0,30.31
1,2017-06-06,150,987,69,36.98,31.3,30.56
2,2017-06-07,153,963,62,36.78,31.7,30.46
3,2017-06-08,155,1000,61,36.11,31.2,30.11
4,2017-06-09,156,1012,66,37.07,30.0,31.0


In [24]:
# Move the innermost column level (the company names) into the row index.
df4_stacked = df4.stack(level=-1, future_stack=True)
df4_stacked

Unnamed: 0,Unnamed: 1,Unnamed: 0_level_0,Price,Price to earnings ratio (P/E)
0,Company,2017-06-05,,
0,Facebook,NaT,155.0,37.1
0,Google,NaT,955.0,32.0
0,Microsoft,NaT,66.0,30.31
1,Company,2017-06-06,,
1,Facebook,NaT,150.0,36.98
1,Google,NaT,987.0,31.3
1,Microsoft,NaT,69.0,30.56
2,Company,2017-06-07,,
2,Facebook,NaT,153.0,36.78


In [None]:
# Stack the outermost column level (the metric names) instead of the company names.
df4.stack(level=0, future_stack=True)

Unnamed: 0,Unnamed: 1,Company,Facebook,Google,Microsoft
0,Unnamed: 0_level_0,2017-06-05,,,
0,Price,NaT,155.0,955.0,66.0
0,Price to earnings ratio (P/E),NaT,37.1,32.0,30.31
1,Unnamed: 0_level_0,2017-06-06,,,
1,Price,NaT,150.0,987.0,69.0
1,Price to earnings ratio (P/E),NaT,36.98,31.3,30.56
2,Unnamed: 0_level_0,2017-06-07,,,
2,Price,NaT,153.0,963.0,62.0
2,Price to earnings ratio (P/E),NaT,36.78,31.7,30.46
3,Unnamed: 0_level_0,2017-06-08,,,


In [26]:
# Reverse the stack: pivot the innermost row-index level back into the columns.
df4_stacked.unstack(level=-1)

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 0_level_0,Price,Price,Price,Price,Price to earnings ratio (P/E),Price to earnings ratio (P/E),Price to earnings ratio (P/E),Price to earnings ratio (P/E)
Unnamed: 0_level_1,Company,Facebook,Google,Microsoft,Company,Facebook,Google,Microsoft,Company,Facebook,Google,Microsoft
0,2017-06-05,NaT,NaT,NaT,,155.0,955.0,66.0,,37.1,32.0,30.31
1,2017-06-06,NaT,NaT,NaT,,150.0,987.0,69.0,,36.98,31.3,30.56
2,2017-06-07,NaT,NaT,NaT,,153.0,963.0,62.0,,36.78,31.7,30.46
3,2017-06-08,NaT,NaT,NaT,,155.0,1000.0,61.0,,36.11,31.2,30.11
4,2017-06-09,NaT,NaT,NaT,,156.0,1012.0,66.0,,37.07,30.0,31.0


In [27]:
# Three header rows this time, giving a three-level column index.
df5 = pd.read_excel(io='stocks_levels.xlsx', header=[0, 1, 2])
df5

Unnamed: 0_level_0,Unnamed: 0_level_0,Price Ratios,Price Ratios,Price Ratios,Price Ratios,Price Ratios,Price Ratios,Income Statement,Income Statement,Income Statement,Income Statement,Income Statement,Income Statement
Unnamed: 0_level_1,Unnamed: 0_level_1,Price,Price,Price,Price to earnings ratio (P/E),Price to earnings ratio (P/E),Price to earnings ratio (P/E),Net Sales,Net Sales,Net Sales,Net Profit,Net Profit,Net Profit
Unnamed: 0_level_2,Company,Facebook,Google,Microsoft,Facebook,Google,Microsoft,Facebook,Google,Microsoft,Facebook,Google,Microsoft
0,Q1 2016,155,955,66,37.1,32.0,30.31,2.6,20,18.7,0.8,5.43,4.56
1,Q2 2016,150,987,69,36.98,31.3,30.56,3.1,22,21.3,0.97,5.89,5.1
2,Q3 2016,153,963,62,36.78,31.7,30.46,4.3,24,21.45,1.2,6.1,5.43
3,Q4 2016,155,1000,61,36.11,31.2,30.11,6.7,26,21.88,1.67,6.5,5.89
4,Q1 2017,156,1012,66,37.07,30.0,31.0,8.1,31,22.34,2.03,6.4,6.09


In [28]:
# Default stack: the innermost column level (company names) moves to the rows.
df5.stack(level=-1, future_stack=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0_level_0,Price Ratios,Price Ratios,Income Statement,Income Statement
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 0_level_1.1,Price,Price to earnings ratio (P/E),Net Sales,Net Profit
0,Company,Q1 2016,,,,
0,Facebook,,155.0,37.1,2.6,0.8
0,Google,,955.0,32.0,20.0,5.43
0,Microsoft,,66.0,30.31,18.7,4.56
1,Company,Q2 2016,,,,
1,Facebook,,150.0,36.98,3.1,0.97
1,Google,,987.0,31.3,22.0,5.89
1,Microsoft,,69.0,30.56,21.3,5.1
2,Company,Q3 2016,,,,
2,Facebook,,153.0,36.78,4.3,1.2


In [30]:
# With three header levels (0, 1, 2), level=2 is the innermost one, so this gives
# the same result as the default level=-1 used in the previous cell.
df5.stack(level=2, future_stack=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0_level_0,Price Ratios,Price Ratios,Income Statement,Income Statement
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 0_level_1.1,Price,Price to earnings ratio (P/E),Net Sales,Net Profit
0,Company,Q1 2016,,,,
0,Facebook,,155.0,37.1,2.6,0.8
0,Google,,955.0,32.0,20.0,5.43
0,Microsoft,,66.0,30.31,18.7,4.56
1,Company,Q2 2016,,,,
1,Facebook,,150.0,36.98,3.1,0.97
1,Google,,987.0,31.3,22.0,5.89
1,Microsoft,,69.0,30.56,21.3,5.1
2,Company,Q3 2016,,,,
2,Facebook,,153.0,36.78,4.3,1.2


In [36]:
# Small frame with ordinary (single-level) column labels.
df_single_level_cols = pd.DataFrame(
    {'weight': [0, 2], 'height': [1, 3]},
    index=['cat', 'dog'],
)
df_single_level_cols

Unnamed: 0,weight,height
cat,0,1
dog,2,3


In [38]:
# With a single column level, stacking yields a Series indexed by (row label, column name).
df_single_level_cols.stack(level=-1, future_stack=True)

cat  weight    0
     height    1
dog  weight    2
     height    3
dtype: int64

In [39]:
# Two-level column labels: one outer group ('weight') with two inner units.
multicol1 = pd.MultiIndex.from_arrays(
    [['weight', 'weight'],
     ['kg', 'pounds']]
)
multicol1

MultiIndex([('weight',     'kg'),
            ('weight', 'pounds')],
           )

In [40]:
# Same animals, now with the two-level (measure, unit) column labels.
df_multi_level_cols1 = pd.DataFrame(
    data=[[1, 2], [2, 4]],
    columns=multicol1,
    index=['cat', 'dog'],
)
df_multi_level_cols1

Unnamed: 0_level_0,weight,weight
Unnamed: 0_level_1,kg,pounds
cat,1,2
dog,2,4


In [41]:
# Stacking the inner level (units) leaves a single 'weight' column.
df_multi_level_cols1.stack(level=-1, future_stack=True)

Unnamed: 0,Unnamed: 1,weight
cat,kg,1
cat,pounds,2
dog,kg,2
dog,pounds,4


In [42]:
# Two different outer labels, each with its own unit, so not every
# (measure, unit) combination exists.
multicol2 = pd.MultiIndex.from_arrays(
    [['weight', 'height'],
     ['kg', 'm']]
)
df_multi_level_cols2 = pd.DataFrame(
    data=[[1.0, 2.0], [3.0, 4.0]],
    columns=multicol2,
    index=['cat', 'dog'],
)
df_multi_level_cols2

Unnamed: 0_level_0,weight,height
Unnamed: 0_level_1,kg,m
cat,1.0,2.0
dog,3.0,4.0


In [49]:
# Stack the inner level (units); combinations that do not exist are filled with NaN.
df_multi_level_cols2.stack(level=-1, future_stack=True)

Unnamed: 0,Unnamed: 1,weight,height
cat,kg,1.0,
cat,m,,2.0
dog,kg,3.0,
dog,m,,4.0


In [44]:
# Stack the outer level (measures) instead, leaving the units as columns.
df_multi_level_cols2.stack(level=0, future_stack=True)

Unnamed: 0,Unnamed: 1,kg,m
cat,weight,1.0,
cat,height,,2.0
dog,weight,3.0,
dog,height,,4.0


In [50]:
# Stack both column levels at once: the result is a Series with a three-level index.
df_multi_level_cols2.stack(level=[0, 1], future_stack=True)

cat  weight  kg    1.0
     height  m     2.0
dog  weight  kg    3.0
     height  m     4.0
dtype: float64

## ----------- CROSSTAB -----------
> Compute a simple cross tabulation of two (or more) factors

> Builds a contingency table (also called a cross-tabulation): a table in matrix format that displays the multivariate frequency distribution of the variables

> pandas.crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False)

In [51]:
# Survey responses: one row per person.
df6 = pd.read_excel(io='survey.xls')
df6

Unnamed: 0,Name,Nationality,Gender,Age,Handedness
0,Kathy,USA,Female,23,Right
1,Linda,USA,Female,18,Right
2,Peter,USA,Male,19,Right
3,John,USA,Male,22,Left
4,Fatima,Bangadesh,Female,31,Left
5,Kadir,Bangadesh,Male,25,Left
6,Dhaval,India,Male,35,Left
7,Sudhir,India,Male,31,Left
8,Parvir,India,Male,37,Right
9,Yan,China,Female,52,Right


In [52]:
# Count respondents for every (nationality, handedness) combination.
pd.crosstab(index=df6['Nationality'], columns=df6['Handedness'])

Handedness,Left,Right
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangadesh,2,0
China,2,1
India,2,1
USA,1,3


In [53]:
# Same frequency count, broken down by gender instead of nationality.
pd.crosstab(index=df6['Gender'], columns=df6['Handedness'])

Handedness,Left,Right
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,2,3
Male,5,2


In [54]:
# margins=True adds an 'All' row and column with the totals.
pd.crosstab(
    index=df6['Nationality'],
    columns=df6['Gender'],
    margins=True,
)

Gender,Female,Male,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangadesh,1,1,2
China,2,1,3
India,0,3,3
USA,2,2,4
All,5,7,12


In [55]:
# Two column factors: Gender is the outer level, Handedness is nested inside it.
pd.crosstab(
    index=df6['Nationality'],
    columns=[df6['Gender'], df6['Handedness']],
    margins=True,
)

Gender,Female,Female,Male,Male,All
Handedness,Left,Right,Left,Right,Unnamed: 5_level_1
Nationality,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Bangadesh,1,0,1,0,2
China,1,1,1,0,3
India,0,0,2,1,3
USA,0,2,1,1,4
All,2,3,5,2,12


In [56]:
# Two row factors this time: rows are (Nationality, Gender) pairs.
pd.crosstab(
    index=[df6['Nationality'], df6['Gender']],
    columns=df6['Handedness'],
    margins=True,
)

Unnamed: 0_level_0,Handedness,Left,Right,All
Nationality,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bangadesh,Female,1,0,1
Bangadesh,Male,1,0,1
China,Female,1,1,2
China,Male,1,0,1
India,Male,2,1,3
USA,Female,0,2,2
USA,Male,1,1,2
All,,7,5,12


In [57]:
# normalize='index' turns the counts into fractions of each row's total.
pd.crosstab(
    index=df6['Nationality'],
    columns=df6['Gender'],
    normalize='index',
)

Gender,Female,Male
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangadesh,0.5,0.5
China,0.666667,0.333333
India,0.0,1.0
USA,0.5,0.5


In [58]:
# Mean age for each (nationality, gender) cell; combinations with no respondents are NaN.
# The string alias "mean" is the idiomatic pandas spelling and ignores missing ages,
# whereas np.average would turn a whole cell into NaN if any age were missing.
pd.crosstab(df6.Nationality, df6.Gender, values=df6.Age, aggfunc="mean")

Gender,Female,Male
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangadesh,31.0,25.0
China,55.0,43.0
India,,34.333333
USA,20.5,20.5
