In [1]:
#when to use iloc, loc, ix
#When to use apply, map, applymap
import pandas as pd
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
#lets start with map method
#which is a series method
#suppose i want to create a dummy variable of sex with 0 and 1 instead of male and female
#map allows you to use an existing value of a series to a different set of values
#lets create a new column
train['Sex_Num'] = train.Sex.map({'female':0, 'male':1})

In [3]:
#lets compare rows 0 to 4 of Sex and Sex_Num
train.loc[0:4, ['Sex', 'Sex_Num']]

Unnamed: 0,Sex,Sex_Num
0,male,1
1,female,0
2,female,0
3,female,0
4,male,1


In [4]:
#there are more things you could do with map,
#shown above is the best we do with map


In [5]:
#Now lets look into Apply
#Apply is both a series and dataframe method
#Lets look for Apply for Series Method
#Apply applies a function to each element in a series
#Suppose i want to create a new column to get the total length of Names column
#how many characters are there in the Name column
train['Name_length'] = train.Name.apply(len)

In [7]:
train.loc[0:4, ['Name', 'Name_length']]#this line compares two columns

Unnamed: 0,Name,Name_length
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44
4,"Allen, Mr. William Henry",24


In [None]:
#another example of 'apply'
#the Fare column needs to be rounded off

In [8]:
#we can apply as a numpy function too
import numpy as np
train['Fare_Round'] = train.Fare.apply(np.ceil)

In [9]:
train.loc[0:4, ['Fare', 'Fare_Round']]

Unnamed: 0,Fare,Fare_Round
0,7.25,8.0
1,71.2833,72.0
2,7.925,8.0
3,53.1,54.0
4,8.05,9.0


In [10]:
#Another example of 'apply'
#lets extract the last name of each name given in name column
#the part before the comma
#since we are dealing with str
train.Name.str.split(',').head()

0                           [Braund,  Mr. Owen Harris]
1    [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                            [Heikkinen,  Miss. Laina]
3      [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                          [Allen,  Mr. William Henry]
Name: Name, dtype: object

In [11]:
#original table of name column is string format
#above code has converted into list of strings
#first element now is Braund
#Second element now is Mr.Own....
#i want to apply above code and extract first element in the string
train.Name.str.split(',').apply(lambda x:x[0]).head()

0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Name, dtype: object

In [13]:
#so far we been using apply with series 
#now lets use apply with dataframe method
#it applies function to either of axis
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [15]:
drinks.loc[:, 'beer_servings' : 'wine_servings'].apply(max, axis = 0)
#here axis = 0 is moving from top to bottom ways
#it finds the max values of each column
#if axis = 1, then it will be row wise max, from left to right

Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0,0,0
1,89,132,54
2,25,0,14
3,245,138,312
4,217,57,45
...,...,...,...
188,333,100,3
189,111,2,1
190,6,0,0
191,32,19,4


In [17]:
#sometimes i want to know which column has the maximum value
drinks.loc[:, 'beer_servings' : 'wine_servings'].apply(np.argmax, axis = 1)

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return bound(*args, **kwds)


0        beer_servings
1      spirit_servings
2        beer_servings
3        wine_servings
4        beer_servings
            ...       
188      beer_servings
189      beer_servings
190      beer_servings
191      beer_servings
192      beer_servings
Length: 193, dtype: object

In [None]:
#above explains, row wise, 
#afghanistan: has no values, hence the first column is selected byitself, beer servings
#Albania: Spirit Servings is the highest
#and so on

In [19]:
#ApplyMap
#it applies a function to every element of a dataframe
#it doesnt apply to any direction(left to right or top to bottom), it applies to all
#it will change every element in the dataframe
drinks.loc[:, 'beer_servings' : 'wine_servings'].applymap(float)
#this function applies to every element in the dataframe


Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0.0,0.0,0.0
1,89.0,132.0,54.0
2,25.0,0.0,14.0
3,245.0,138.0,312.0
4,217.0,57.0,45.0
...,...,...,...
188,333.0,100.0,3.0
189,111.0,2.0,1.0
190,6.0,0.0,0.0
191,32.0,19.0,4.0


In [20]:
#but remember above floating numbers are just temporary
#to make it permanent:
drinks.loc[:, 'beer_servings' : 'wine_servings'] = drinks.loc[:, 'beer_servings' : 'wine_servings'].applymap(float)

In [21]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0.0,0.0,0.0,0.0,Asia
1,Albania,89.0,132.0,54.0,4.9,Europe
2,Algeria,25.0,0.0,14.0,0.7,Africa
3,Andorra,245.0,138.0,312.0,12.4,Europe
4,Angola,217.0,57.0,45.0,5.9,Africa


In [None]:
#therefore from int to float it has been saved permanently