In [1]:
# As we've seen, both Series and DataFrames can have indices applied to them. The index is essentially a row level label,
# and in pandas the rows correspond to axis zero. Indices can either be either autogenerated, such as when we create a new Series
# without an index, in which case we get numeric values, or they can be set explicitly, like when we use the dictionary object to
# create the series, or when we loaded data from the CSV file and set appropriate parameters. Another option for setting an index
# is to use the set_index() function. This function takes a list of columns and promotes those columns to an index. In this
# lecture we'll explore more about how indexes work in pandas.

In [2]:
# The set_index() function is a destructive process, and it doesn't keep the current index. 
# If you want to keep the current index, you need to manually create a new column and copy into 
# it values from the index attribute.

In [2]:
import pandas as pd
list = [{'name':'Tuba','age': 23, 'CGPA': 3.8},
        {'name':'Plabon','age': 25, 'CGPA': 3.4},
        {'name':'Nila','age': 23, 'CGPA': 3.2},
        {'name':'Maliha','age': 21, 'CGPA': 3.75},
        {'name':'Tara','age': 23, 'CGPA': 2.8},
        {'name':'Raima','age': 22, 'CGPA': 3.65},
        {'name':'Riya','age': 19, 'CGPA': 3.43}]

df = pd.DataFrame(list)
df

Unnamed: 0,name,age,CGPA
0,Tuba,23,3.8
1,Plabon,25,3.4
2,Nila,23,3.2
3,Maliha,21,3.75
4,Tara,23,2.8
5,Raima,22,3.65
6,Riya,19,3.43


In [4]:
# So we copy the indexed data into its own column
df['serial no'] = df.index
df

Unnamed: 0,name,age,CGPA,serial no
0,Tuba,23,3.8,0
1,Plabon,25,3.4,1
2,Nila,23,3.2,2
3,Maliha,21,3.75,3
4,Tara,23,2.8,4
5,Raima,22,3.65,5
6,Riya,19,3.43,6


In [4]:
# Then we set the index to another column

df = df.set_index('name')
df.head()

Unnamed: 0_level_0,index,age,CGPA
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tuba,0,23,3.8
Plabon,1,25,3.4
Nila,2,23,3.2
Maliha,3,21,3.75
Tara,4,23,2.8


In [5]:
# You'll see that when we create a new index from an existing column the index has a name, 
# which is the original name of the column.

# We can get rid of the index completely by calling the function reset_index(). This promotes the 
# index into a column and creates a default numbered index.

df = df.reset_index()
df

Unnamed: 0,name,index,age,CGPA
0,Tuba,0,23,3.8
1,Plabon,1,25,3.4
2,Nila,2,23,3.2
3,Maliha,3,21,3.75
4,Tara,4,23,2.8
5,Raima,5,22,3.65
6,Riya,6,19,3.43


In [10]:
list2 = [{'name':'Tuba','age': 23, 'CGPA': 3.8,'address':'Dhaka','study':'DIU'},
        {'name':'Plabon','age': 25, 'CGPA': 3.4,'address':'Dhaka','study':'AUST'},
        {'name':'Nila','age': 23, 'CGPA': 3.2,'address':'Khulna','study':'DIU'},
        {'name':'Maliha','age': 22, 'CGPA': 3.75,'address':'Dhaka','study':'NSU'},
        {'name':'Tara','age': 23, 'CGPA': 2.8,'address':'Khulna','study':'DIU'},
        {'name':'Raima','age': 22, 'CGPA': 3.65,'address':'Dhaka','study':'EWU'},
        {'name':'Rima','age': 22, 'CGPA': 3.45,'address':'Dhaka','study':'DIU'}
        ]

df2 = pd.DataFrame(list)
df2

Unnamed: 0,name,age,CGPA,address,study
0,Tuba,23,3.8,Dhaka,DIU
1,Plabon,25,3.4,Dhaka,AUST
2,Nila,23,3.2,Khulna,DIU
3,Maliha,22,3.75,Dhaka,NSU
4,Tara,23,2.8,Khulna,DIU
5,Raima,22,3.65,Dhaka,EWU
6,Rima,22,3.45,Dhaka,DIU


In [16]:
# I want to see a list of all the unique values in a given column. In this 
# DataFrame, we see that the possible values for the sum age are using the 
# unique function on the DataFrame. This is similar to the SQL distinct operator

# Here we can run unique on the sum level of our current DataFrame 

df2['age'].unique()

array([23, 25, 22], dtype=int64)

In [12]:
# We see that there are only three different values, 22, 23 and 25

In [15]:
df2[df2['age']==22]

Unnamed: 0,name,age,CGPA,address,study
3,Maliha,22,3.75,Dhaka,NSU
5,Raima,22,3.65,Dhaka,EWU
6,Rima,22,3.45,Dhaka,DIU


In [22]:
# a list of column names that we want to keep then project those and 
# assign the resulting DataFrame to our df2 variable.

columns_to_keep = ['name','age','study']
df2 = df2[columns_to_keep]
df2

Unnamed: 0,name,age,study
0,Tuba,23,DIU
1,Plabon,25,AUST
2,Nila,23,DIU
3,Maliha,22,NSU
4,Tara,23,DIU
5,Raima,22,EWU
6,Rima,22,DIU


In [27]:
# we can set multiple index too

list3= [{'name':'Tuba','age': 23, 'CGPA': 3.8,'address':'Dhaka','study':'DIU'},
        {'name':'Plabon','age': 25, 'CGPA': 3.4,'address':'Dhaka','study':'AUST'},
       
        {'name':'Maliha','age': 22, 'CGPA': 3.75,'address':'Dhaka','study':'NSU'},
        
        {'name':'Raima','age': 22, 'CGPA': 3.65,'address':'Dhaka','study':'EWU'},
        {'name':'Rima','age': 22, 'CGPA': 3.45,'address':'Dhaka','study':'DIU'}
        ]

df3=pd.DataFrame(list3)
df3

Unnamed: 0,name,age,CGPA,address,study
0,Tuba,23,3.8,Dhaka,DIU
1,Plabon,25,3.4,Dhaka,AUST
2,Maliha,22,3.75,Dhaka,NSU
3,Raima,22,3.65,Dhaka,EWU
4,Rima,22,3.45,Dhaka,DIU


In [29]:
df4 = df3.set_index(['address','age'])
df4

Unnamed: 0_level_0,Unnamed: 1_level_0,name,CGPA,study
address,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dhaka,23,Tuba,3.8,DIU
Dhaka,25,Plabon,3.4,AUST
Dhaka,22,Maliha,3.75,NSU
Dhaka,22,Raima,3.65,EWU
Dhaka,22,Rima,3.45,DIU


In [30]:
# An immediate question which comes up is how we can query this DataFrame. We saw previously that 
# the loc attribute of the DataFrame can take multiple arguments. And it could query both the 
# row and the columns. When you use a MultiIndex, you must provide the arguments in order by the 
# level you wish to query. Inside of the index, each column is called a level and the outermost 
# column is level zero. 

# If we want to see the student results from dhaka of age 22, the first argument would be dhaka and the second would be 22

In [43]:
list5= [{'name':'Tuba','age': 23, 'CGPA': 3.8,'address':'Dhaka','study':'DIU'},
        {'name':'Plabon','age': 25, 'CGPA': 3.4,'address':'Dhaka','study':'AUST'},
        {'name':'Plabi','age': 25, 'CGPA': 3.4,'address':'khulna','study':'AUST'},
        {'name':'Maliha','age': 22, 'CGPA': 3.75,'address':'Dhaka','study':'NSU'},
        
        {'name':'Raima','age': 22, 'CGPA': 3.65,'address':'Dhaka','study':'EWU'},
        {'name':'Rima','age': 22, 'CGPA': 3.45,'address':'Dhaka','study':'DIU'}
        ]

df5=pd.DataFrame(list5)
df5

Unnamed: 0,name,age,CGPA,address,study
0,Tuba,23,3.8,Dhaka,DIU
1,Plabon,25,3.4,Dhaka,AUST
2,Plabi,25,3.4,khulna,AUST
3,Maliha,22,3.75,Dhaka,NSU
4,Raima,22,3.65,Dhaka,EWU
5,Rima,22,3.45,Dhaka,DIU


In [44]:
df5= df5.set_index(['address','age'])
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,name,CGPA,study
address,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dhaka,23,Tuba,3.8,DIU
Dhaka,25,Plabon,3.4,AUST
khulna,25,Plabi,3.4,AUST
Dhaka,22,Maliha,3.75,NSU
Dhaka,22,Raima,3.65,EWU
Dhaka,22,Rima,3.45,DIU


In [47]:
df5.loc['Dhaka',22]


  df5.loc['Dhaka',22]


Unnamed: 0_level_0,Unnamed: 1_level_0,name,CGPA,study
address,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dhaka,22,Maliha,3.75,NSU
Dhaka,22,Raima,3.65,EWU
Dhaka,22,Rima,3.45,DIU


In [51]:
df5.loc[ [('Dhaka',22),  #student who are 22 and 25 years old and lives in dhaka
        ('Dhaka',25)] ]

Unnamed: 0_level_0,Unnamed: 1_level_0,name,CGPA,study
address,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dhaka,22,Maliha,3.75,NSU
Dhaka,22,Raima,3.65,EWU
Dhaka,22,Rima,3.45,DIU
Dhaka,25,Plabon,3.4,AUST
