This material should help make the ideas from the first meeting clearer:

In [2]:
# Reviewing how to make a df, friends, built on 5 lists. Each element in list is same type

import pandas as pd

# One list per future column; the element at position i of every list belongs to the same friend.
names = ["Tomás", "Pauline", "Pablo", "Bjork", "Alan", "Juana"]
ages = [32, 33, 28, 30, 32, 27]
woman = [False, True, False, False, False, True]
country = ["Chile", "Senegal", "Spain", "Norway", "Peru", "Peru"]
education = ["Bach", "Bach", "Master", "PhD", "Bach", "Master"]

# The dict maps each column name (the key) to the list that holds that column's values.
# The row numbers are produced automatically in the df table we make.
# Don't use a . in a column name; a space (like in 'born In') is allowed.
data = {
    'name': names,
    'age': ages,
    'girl': woman,
    'born In': country,
    'degree': education,
}

# Turn the dict into a DataFrame: each key becomes one column.
friends = pd.DataFrame.from_dict(data)

# Writing the name alone on the last line of the cell displays the table:
friends

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32,False,Chile,Bach
1,Pauline,33,True,Senegal,Bach
2,Pablo,28,False,Spain,Master
3,Bjork,30,False,Norway,PhD
4,Alan,32,False,Peru,Bach
5,Juana,27,True,Peru,Master


The result is what you expected, but you need to be sure of what data structure you have:

In [4]:
#what is it? Shows it is a data frame, not a list. Notice you do not type (data), that would show it is a 'dict'
type(friends)

pandas.core.frame.DataFrame

In [4]:
#this is good
friends.age

0    32
1    33
2    28
3    30
4    32
5    27
Name: age, dtype: int64

In [6]:
#what is it?
# This is a series, not a dataframe
# orig was a list, pandas turns list into a series
type(friends.age)

pandas.core.series.Series

In [5]:
#this is good
# This is a series too
friends['age']

0    32
1    33
2    28
3    30
4    32
5    27
Name: age, dtype: int64

In [7]:
#what is it?
type(friends['age'])

pandas.core.series.Series

In [8]:
#this is bad; iloc only understands integer positions, not column names
friends.iloc[['age']]

ValueError: invalid literal for int() with base 10: 'age'

In [9]:
#this is bad; loc with a single argument looks in the row index, and 'age' is a column name
friends.loc[['age']]

KeyError: "None of [['age']] are in the [index]"

In [None]:
#this is bad; without the inner [] the two names form a tuple, which is treated as ONE column name
friends['age','born In']

In [6]:
#this is good
# Uses [] for two columns, inside there is a list of names
# Don't use () - that is a tuple
friends[['age','born In']]

Unnamed: 0,age,born In
0,32,Chile
1,33,Senegal
2,28,Spain
3,30,Norway
4,32,Peru
5,27,Peru


In [None]:
# what is it?
type(friends[['age','born In']])

In [None]:
#this is bad; dot notation cannot be used when the column name has a space
friends.'born In'

In [7]:
#this is good
#loc tells what rows and columns you want
friends.loc[:,['age','born In']]

Unnamed: 0,age,born In
0,32,Chile
1,33,Senegal
2,28,Spain
3,30,Norway
4,32,Peru
5,27,Peru


In [10]:
type(friends.loc[:,['age','born In']])

pandas.core.frame.DataFrame

In [10]:
#this is bad - can't put a slice inside a list; write the slice without the inner [] as friends.loc[:,'age':'born In']
friends.loc[:,['age':'born In']]

SyntaxError: invalid syntax (<ipython-input-10-da35a4e2500b>, line 2)

In [None]:
#this is bad - does not use names in iloc. Loc can use names of columns
# iloc is for positions, not names; but allows slices when you correctly identify positions
friends.iloc[:,['age','born In']]

In [18]:
# this is good (but different)
friends.iloc[:,1:4]

Unnamed: 0,age,girl,born In
0,32,False,Chile
1,33,True,Senegal
2,28,False,Spain
3,30,False,Norway
4,32,False,Peru
5,27,True,Peru


In [12]:
# what is it?
type(friends.iloc[:,1:4])

pandas.core.frame.DataFrame

In [13]:
# this is good - can do list of indexes or column positions, just not names
#This first : means 'give all the rows' and then we identify the column positions
friends.iloc[:,[1,3]]

Unnamed: 0,age,born In
0,32,Chile
1,33,Senegal
2,28,Spain
3,30,Norway
4,32,Peru
5,27,Peru


In [14]:
#what is it?
type(friends.iloc[:,[1,3]])

pandas.core.frame.DataFrame

In [15]:
# This will give you a subset, those over 30
friends[friends.age>30]

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32,False,Chile,Bach
1,Pauline,33,True,Senegal,Bach
4,Alan,32,False,Peru,Bach


Some people like coding with the filter language:

In [16]:
# 
filter1=friends.age>30
friends[filter1]

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32,False,Chile,Bach
1,Pauline,33,True,Senegal,Bach
4,Alan,32,False,Peru,Bach


In [17]:
# where keeps every row but creates missing values: a row that does not meet the condition is filled with NaN (missing value)

friends.where(filter1)

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32.0,0.0,Chile,Bach
1,Pauline,33.0,1.0,Senegal,Bach
2,,,,,
3,,,,,
4,Alan,32.0,0.0,Peru,Bach
5,,,,,


In [18]:
# query takes the condition as a string and does not need the name of the dataframe; it returns only the rows that meet the condition, it does not keep the whole data like "where"
filter1a='age>30'
friends.query(filter1a)

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32,False,Chile,Bach
1,Pauline,33,True,Senegal,Bach
4,Alan,32,False,Peru,Bach


In [19]:
#Asking if these things are dataframes, will get true or false answer
# the \ at the end of lines 1 and 2 means the command continues on the next line; the commas put the three answers together in one tuple
isinstance(friends[filter1], pd.DataFrame), \
isinstance(friends.where(filter1), pd.DataFrame), \
isinstance(friends.query(filter1a), pd.DataFrame)

(True, True, True)

When you have Boolean values (True/False) you can simplify:

In [20]:
#from:
friends[friends.girl==False]

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32,False,Chile,Bach
2,Pablo,28,False,Spain,Master
3,Bjork,30,False,Norway,PhD
4,Alan,32,False,Peru,Bach


In [21]:
# to...
friends[~friends.girl]

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32,False,Chile,Bach
2,Pablo,28,False,Spain,Master
3,Bjork,30,False,Norway,PhD
4,Alan,32,False,Peru,Bach


You can have two filters:

In [22]:
# this will not work
friends[~friends.girl & friends.degree=='Bach']

  result = method(y)


TypeError: invalid type comparison

In [23]:
# this will (with parentheses)
friends[(~friends.girl) & (friends.degree=='Bach')]

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32,False,Chile,Bach
4,Alan,32,False,Peru,Bach


Other times you want a value once a filter has been applied:

In [24]:
# youngest male:
# Does not show you what you want to get - gives you all the boys
# The second part is not a filter: friends.age.min() is a single number computed on the whole data, not on any subset
friends[(~friends.girl) & (friends.age.min())] # this is wrong!

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32,False,Chile,Bach
2,Pablo,28,False,Spain,Master
3,Bjork,30,False,Norway,PhD
4,Alan,32,False,Peru,Bach


In [25]:
#NOTE: friends.age==friends.age.min() is a filter, it is comparing things
# Asking that both conditions are met at the same time; no row meets both, so you don't get anything

friends[(~friends.girl) & (friends.age==friends.age.min())] # this is wrong too!

Unnamed: 0,name,age,girl,born In,degree


In [26]:
# This is not looking at the subset of only males in the dataframe, instead looking at whole data
friends.age.min()

27

You got empty answer because there is no man aged 27.

In [27]:
# this is correct
# Tip: you can comment out parts of a command with # to find where in the command you made a mistake

friends[~friends.girl].age.min()

28

Once you know the right age, you have to put it in the right place:

In [28]:
# Gives you the full row / info of the youngest male in the dataframe
# It keeps every row whose age equals the age of the youngest boy, so if a girl is also 28
# this won't be correct because it will also give her row
friends[friends.age==friends[~friends.girl].age.min()]

Unnamed: 0,name,age,girl,born In,degree
2,Pablo,28,False,Spain,Master


In [29]:
# or
friends.where(friends.age==friends[~friends.girl].age.min())

Unnamed: 0,name,age,girl,born In,degree
0,,,,,
1,,,,,
2,Pablo,28.0,0.0,Spain,Master
3,,,,,
4,,,,,
5,,,,,


In [30]:
# or; dropna gets rid of missing value rows

friends.where(friends.age==friends[~friends.girl].age.min()).dropna()

Unnamed: 0,name,age,girl,born In,degree
2,Pablo,28.0,0.0,Spain,Master


The problem is that 'friends' is not subset, so the minimum age keeps being that of the whole data (27, the youngest woman):

In [31]:
# bad:
# the second where still uses all the friends data from the beginning,
# because the name of the dataframe must be written inside the condition
# We want two conditions: boys, and the youngest of the boys
friends.where(~friends.girl).where(friends.age==friends.age.min())

Unnamed: 0,name,age,girl,born In,degree
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,


That's the advantage of **query**:

In [32]:
# Operating on last subset, take name of column of last subset

friends.query('~girl').query('age==age.min()')

Unnamed: 0,name,age,girl,born In,degree
2,Pablo,28,False,Spain,Master


In [33]:
#but - make a copy of friends
# inplace=True means the command changes the dataframe itself instead of returning a new one
# using inplace on the copy lets you change the df students without interfering with the friends dataframe

students=friends.copy()

students.where(~students.girl,inplace=True) #real subset
students.where(students.age==students.age.min())

Unnamed: 0,name,age,girl,born In,degree
0,,,,,
1,,,,,
2,Pablo,28.0,0.0,Spain,Master
3,,,,,
4,,,,,
5,,,,,


Let's vary the data a little:

In [34]:
import pandas as pd

# Same friends as before; the only change is Pauline's age, now 28 (the same as Pablo's).
names = ["Tomás", "Pauline", "Pablo", "Bjork", "Alan", "Juana"]
ages = [32, 28, 28, 30, 32, 27]
woman = [False, True, False, False, False, True]
country = ["Chile", "Senegal", "Spain", "Norway", "Peru", "Peru"]
education = ["Bach", "Bach", "Master", "PhD", "Bach", "Master"]

# Column name (key) -> list with the values of that column.
data = {
    'name': names,
    'age': ages,
    'girl': woman,
    'born In': country,
    'degree': education,
}

# Build a second DataFrame from the new dict.
friends2 = pd.DataFrame.from_dict(data)

# Display the table:
friends2

Unnamed: 0,name,age,girl,born In,degree
0,Tomás,32,False,Chile,Bach
1,Pauline,28,True,Senegal,Bach
2,Pablo,28,False,Spain,Master
3,Bjork,30,False,Norway,PhD
4,Alan,32,False,Peru,Bach
5,Juana,27,True,Peru,Master


There is a girl with the same age as the youngest boy, then:

In [35]:
# You will get two answers for this, NOT youngest male in group
# keeping the row that has the youngest age of the boys, which is 28
# This is why you get a row with a girl, she has the age of the youngest boy

friends2.where(friends2.age==friends2[~friends2.girl].age.min()).dropna()

Unnamed: 0,name,age,girl,born In,degree
1,Pauline,28.0,1.0,Senegal,Bach
2,Pablo,28.0,0.0,Spain,Master


We need to go back to a previous strategy (two filters combined with &):

In [36]:
# bad implementation:
friends2.where(friends2.age==friends2[~friends2.girl].age.min() & friends2.girl==False).dropna()

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [37]:
# bad implementation:
friends2.where(friends2.age==friends2[~friends2.girl].age.min() & ~friends2.girl).dropna()

Unnamed: 0,name,age,girl,born In,degree


In [38]:
# just parentheses to make it work!
friends2.where((friends2.age==friends2[~friends2.girl].age.min()) & (~friends2.girl)).dropna()

Unnamed: 0,name,age,girl,born In,degree
2,Pablo,28.0,0.0,Spain,Master


This one still works!

In [39]:
# Simpler than where because each query works on the previous subset (the result on its left in the command)

friends2.query('~girl').query('age==age.min()')

Unnamed: 0,name,age,girl,born In,degree
2,Pablo,28,False,Spain,Master


In [None]:
# Work on a copy, so the changes made below do not touch friends2
students2=friends2.copy()

# inplace=True changes students2 itself: every row where girl is True is filled with NaN
students2.where(~students2.girl,inplace=True) #real subset
# the minimum age is now computed on students2 (the girls' ages are NaN); dropna removes the NaN rows
students2.where(students2.age==students2.age.min()).dropna()