# Selecting Data: loc(), iloc[]

In [434]:
import pandas as pd
import numpy as np
df = pd.read_csv('csv files/players_20.csv')
# df

# set index
df.set_index('short_name', inplace=True)

# new index becomes short_name column
# df

# select columns
df = df[["long_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]
df

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid
...,...,...,...,...,...,...,...
Shao Shuai,邵帅,22,1997-03-10,186,79,China PR,Beijing Renhe FC
Xiao Mingjie,Mingjie Xiao,22,1997-01-01,177,66,China PR,Shanghai SIPG FC
Zhang Wei,张威,19,2000-05-16,186,75,China PR,Hebei China Fortune FC
Wang Haijian,汪海健,18,2000-08-02,185,74,China PR,Shanghai Greenland Shenhua FC


# 1 loc()

## 1.1 Selecting with a single value

In [435]:
# get all data on L. Messi
df.loc['L. Messi']

long_name      Lionel Andrés Messi Cuccittini
age                                        32
dob                                1987-06-24
height_cm                                 170
weight_kg                                  72
nationality                         Argentina
club                             FC Barcelona
Name: L. Messi, dtype: object

In [436]:
# get the height of L. Messi
df.loc['L. Messi', 'height_cm']

170

In [437]:
# get the height of Cristiano Ronaldo
df.loc['Cristiano Ronaldo', 'height_cm']

187

In [438]:
# get all rows inside the 'height_cm' column
df.loc[:, 'height_cm']

'''
: (colon) is a special symbol that selects all element inside a row or column
'''

'\n: (colon) is a special symbol that selects all element inside a row or column\n'

In [439]:
# get all the columns that correspond to the index 'L. Messi'
df.loc['L. Messi', :]

# same result as df.loc['L. Messi']

long_name      Lionel Andrés Messi Cuccittini
age                                        32
dob                                1987-06-24
height_cm                                 170
weight_kg                                  72
nationality                         Argentina
club                             FC Barcelona
Name: L. Messi, dtype: object

## 1.2 Selecting with a list of values 

In [440]:
# get all data about Messi and Ronaldo
df.loc[['L. Messi', 'Cristiano Ronaldo']]

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus


In [441]:
# get the height of Messi and Ronaldo
df.loc[['L. Messi', 'Cristiano Ronaldo'], 'height_cm']

short_name
L. Messi             170
Cristiano Ronaldo    187
Name: height_cm, dtype: int64

In [442]:
# get height and weight of Messi
df.loc['L. Messi', ['height_cm', 'weight_kg']]

height_cm    170
weight_kg     72
Name: L. Messi, dtype: object

In [443]:
# get the height and weight of Messi and Ronaldo
df.loc[['L. Messi', 'Cristiano Ronaldo'], ['height_cm', 'weight_kg']]

Unnamed: 0_level_0,height_cm,weight_kg
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1
L. Messi,170,72
Cristiano Ronaldo,187,83


## 1.3 Selecting a range of data with a slice

In [444]:
# slice column labels
players = ['L. Messi', 'Cristiano Ronaldo']
df.loc[players, 'age':'club']

Unnamed: 0_level_0,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
L. Messi,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,34,1985-02-05,187,83,Portugal,Juventus


In [445]:
# slice index labels
columns = ['age', 'dob', 'height_cm', 'weight_kg']

#get top1 and top10 player
df.loc[df.index[:10], columns]
# or 
df.loc['L. Messi':'M. Salah', columns]


# df.index[:10] gets first 10 index elements

Unnamed: 0_level_0,age,dob,height_cm,weight_kg
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
L. Messi,32,1987-06-24,170,72
Cristiano Ronaldo,34,1985-02-05,187,83
Neymar Jr,27,1992-02-05,175,68
J. Oblak,26,1993-01-07,188,87
E. Hazard,28,1991-01-07,175,74
K. De Bruyne,28,1991-06-28,181,70
M. ter Stegen,27,1992-04-30,187,85
V. van Dijk,27,1991-07-08,193,92
L. Modrić,33,1985-09-09,172,66
M. Salah,27,1992-06-15,175,71


## 1.4 Selecting with conditions

In [446]:
# one condition: select player with height above 180cm
columns = ['age', 'dob', 'height_cm', 'weight_kg']

df.loc[ df['height_cm']>180, columns]

Unnamed: 0_level_0,age,dob,height_cm,weight_kg
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cristiano Ronaldo,34,1985-02-05,187,83
J. Oblak,26,1993-01-07,188,87
K. De Bruyne,28,1991-06-28,181,70
M. ter Stegen,27,1992-04-30,187,85
V. van Dijk,27,1991-07-08,193,92
...,...,...,...,...
P. Martin,20,1999-01-05,188,84
Shao Shuai,22,1997-03-10,186,79
Zhang Wei,19,2000-05-16,186,75
Wang Haijian,18,2000-08-02,185,74


In [447]:
# multiple conditions: heigt above 180cm and from Argentina
df.loc[ (df['height_cm']>180) & (df['nationality'] == 'Argentina') , :]

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M. Icardi,Mauro Emanuel Icardi Rivero,26,1993-02-19,181,75,Argentina,Inter
G. Higuaín,Gonzalo Gerardo Higuaín,31,1987-12-10,186,89,Argentina,Juventus
E. Garay,Ezequiel Marcelo Garay,32,1986-10-10,189,90,Argentina,Valencia CF
N. Otamendi,Nicolás Hernán Otamendi,31,1988-02-12,183,81,Argentina,Manchester City
G. Rulli,Gerónimo Rulli,27,1992-05-20,189,84,Argentina,Montpellier HSC
...,...,...,...,...,...,...,...
T. Durso,Tomás Durso,20,1999-02-26,185,80,Argentina,Gimnasia y Esgrima La Plata
J. Hass,Joaquín Hass,21,1998-03-27,186,88,Argentina,Club Atlético Colón
R. Ferrario,Rafael Ferrario,19,2000-04-30,186,76,Argentina,Club Atlético Huracán
L. Finochietto,Leandro Finochietto,22,1997-04-25,186,77,Argentina,Argentinos Juniors


# 2  iloc[]

In [448]:
df = pd.read_csv('csv files/players_20.csv')
# df

# new index becomes short_name column
# df

# select columns
df = df[["long_name", "short_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]
df

Unnamed: 0,long_name,short_name,age,dob,height_cm,weight_kg,nationality,club
0,Lionel Andrés Messi Cuccittini,L. Messi,32,1987-06-24,170,72,Argentina,FC Barcelona
1,Cristiano Ronaldo dos Santos Aveiro,Cristiano Ronaldo,34,1985-02-05,187,83,Portugal,Juventus
2,Neymar da Silva Santos Junior,Neymar Jr,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
3,Jan Oblak,J. Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
4,Eden Hazard,E. Hazard,28,1991-01-07,175,74,Belgium,Real Madrid
...,...,...,...,...,...,...,...,...
18273,邵帅,Shao Shuai,22,1997-03-10,186,79,China PR,Beijing Renhe FC
18274,Mingjie Xiao,Xiao Mingjie,22,1997-01-01,177,66,China PR,Shanghai SIPG FC
18275,张威,Zhang Wei,19,2000-05-16,186,75,China PR,Hebei China Fortune FC
18276,汪海健,Wang Haijian,18,2000-08-02,185,74,China PR,Shanghai Greenland Shenhua FC


## 2.1 Selecting a range of data with a slice (1.3 but with iloc[])

In [449]:
# slice column labels
df.iloc[0:2, 0:3]

Unnamed: 0,long_name,short_name,age
0,Lionel Andrés Messi Cuccittini,L. Messi,32
1,Cristiano Ronaldo dos Santos Aveiro,Cristiano Ronaldo,34


## 2.2 Selecting with conditions (1.4 but with iloc[])

In [450]:
# one condition: select player with ehgih above 180cm

df.iloc[ list(df['height_cm']>180), 2:6]

Unnamed: 0,age,dob,height_cm,weight_kg
1,34,1985-02-05,187,83
3,26,1993-01-07,188,87
5,28,1991-06-28,181,70
6,27,1992-04-30,187,85
7,27,1991-07-08,193,92
...,...,...,...,...
18272,20,1999-01-05,188,84
18273,22,1997-03-10,186,79
18275,19,2000-05-16,186,75
18276,18,2000-08-02,185,74


In [451]:
# multiple conditions: heigt above 180cm and from Argentina
df.iloc[ list((df['height_cm']>180) & (df['nationality'] == 'Argentina')) , :]

Unnamed: 0,long_name,short_name,age,dob,height_cm,weight_kg,nationality,club
93,Mauro Emanuel Icardi Rivero,M. Icardi,26,1993-02-19,181,75,Argentina,Inter
98,Gonzalo Gerardo Higuaín,G. Higuaín,31,1987-12-10,186,89,Argentina,Juventus
210,Ezequiel Marcelo Garay,E. Garay,32,1986-10-10,189,90,Argentina,Valencia CF
224,Nicolás Hernán Otamendi,N. Otamendi,31,1988-02-12,183,81,Argentina,Manchester City
342,Gerónimo Rulli,G. Rulli,27,1992-05-20,189,84,Argentina,Montpellier HSC
...,...,...,...,...,...,...,...,...
17412,Tomás Durso,T. Durso,20,1999-02-26,185,80,Argentina,Gimnasia y Esgrima La Plata
17526,Joaquín Hass,J. Hass,21,1998-03-27,186,88,Argentina,Club Atlético Colón
17657,Rafael Ferrario,R. Ferrario,19,2000-04-30,186,76,Argentina,Club Atlético Huracán
17712,Leandro Finochietto,L. Finochietto,22,1997-04-25,186,77,Argentina,Argentinos Juniors


# 3 Setting a New Value

In [452]:
df = pd.read_csv('csv files/players_20.csv')

# set index
df.set_index('short_name', inplace=True)

# select columns
df = df[["long_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]

## 3.1 Set value to one cell

In [453]:
# update L. Messi height
df.loc["L. Messi", 'height_cm'] = 175
df.loc["L. Messi", 'height_cm']

175

## 3.2 Set value to entire column 

In [454]:
# set height of all players to 190
df.loc[:, 'height_cm'] = 190
df.loc[:, 'height_cm']

short_name
L. Messi             190
Cristiano Ronaldo    190
Neymar Jr            190
J. Oblak             190
E. Hazard            190
                    ... 
Shao Shuai           190
Xiao Mingjie         190
Zhang Wei            190
Wang Haijian         190
Pan Ximing           190
Name: height_cm, Length: 18278, dtype: int64

## 3.3 Set value to entire row 

In [455]:
df

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,190,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,190,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,190,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,190,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,190,74,Belgium,Real Madrid
...,...,...,...,...,...,...,...
Shao Shuai,邵帅,22,1997-03-10,190,79,China PR,Beijing Renhe FC
Xiao Mingjie,Mingjie Xiao,22,1997-01-01,190,66,China PR,Shanghai SIPG FC
Zhang Wei,张威,19,2000-05-16,190,75,China PR,Hebei China Fortune FC
Wang Haijian,汪海健,18,2000-08-02,190,74,China PR,Shanghai Greenland Shenhua FC


In [456]:
# get all columns that correspond to player ranked last in FIFA

'''
negative index (-1) got the last player by index
'''

df.iloc[-1, :]

long_name                         潘喜明
age                                26
dob                        1993-01-11
height_cm                         190
weight_kg                          78
nationality                  China PR
club           Hebei China Fortune FC
Name: Pan Ximing, dtype: object

In [457]:
df.iloc[-1, :] = np.nan
df.iloc[-1, :]

long_name      NaN
age            NaN
dob            NaN
height_cm      NaN
weight_kg      NaN
nationality    NaN
club           NaN
Name: Pan Ximing, dtype: object

## 3.4 Set value to multiple cells

In [458]:
# set value for all items matching the list of labels
df.loc[['L. Messi', 'Cristiano Ronaldo'], 'height_cm'] = 175
df.loc[['L. Messi', 'Cristiano Ronaldo'], 'height_cm']

short_name
L. Messi             175.0
Cristiano Ronaldo    175.0
Name: height_cm, dtype: float64

## 3.5 Set value for rows matching a condition

In [459]:
# set value for rows matching a condition
columns = ['age', 'dob', 'height_cm', 'weight_kg']
df.loc[df['height_cm'] > 180, columns] = 0
df

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32.0,1987-06-24,175.0,72.0,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34.0,1985-02-05,175.0,83.0,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,0.0,0,0.0,0.0,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,0.0,0,0.0,0.0,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,0.0,0,0.0,0.0,Belgium,Real Madrid
...,...,...,...,...,...,...,...
Shao Shuai,邵帅,0.0,0,0.0,0.0,China PR,Beijing Renhe FC
Xiao Mingjie,Mingjie Xiao,0.0,0,0.0,0.0,China PR,Shanghai SIPG FC
Zhang Wei,张威,0.0,0,0.0,0.0,China PR,Hebei China Fortune FC
Wang Haijian,汪海健,0.0,0,0.0,0.0,China PR,Shanghai Greenland Shenhua FC


# 4 Drop Rows or Columns: drop()

In [460]:
df = pd.read_csv('csv files/players_20.csv')

# set index
df.set_index('short_name', inplace=True)

# select columns
df = df[["long_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]

## 4.1 Drop Rows

In [461]:
# drop one row

# axis parameter method
'''
axis paremeter lets drop() knows if its a row or column. 
0 represents rows and 1 represents columns
'''
df.drop('L. Messi', axis=0)

#index parameter method
df.drop(index=['L. Messi'])
#df

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid
K. De Bruyne,Kevin De Bruyne,28,1991-06-28,181,70,Belgium,Manchester City
...,...,...,...,...,...,...,...
Shao Shuai,邵帅,22,1997-03-10,186,79,China PR,Beijing Renhe FC
Xiao Mingjie,Mingjie Xiao,22,1997-01-01,177,66,China PR,Shanghai SIPG FC
Zhang Wei,张威,19,2000-05-16,186,75,China PR,Hebei China Fortune FC
Wang Haijian,汪海健,18,2000-08-02,185,74,China PR,Shanghai Greenland Shenhua FC


In [462]:
# drop two or more rows and update data (inpalce=True)
df.drop(index=['L. Messi', 'Cristiano Ronaldo'], inplace=True)
df

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid
K. De Bruyne,Kevin De Bruyne,28,1991-06-28,181,70,Belgium,Manchester City
M. ter Stegen,Marc-André ter Stegen,27,1992-04-30,187,85,Germany,FC Barcelona
...,...,...,...,...,...,...,...
Shao Shuai,邵帅,22,1997-03-10,186,79,China PR,Beijing Renhe FC
Xiao Mingjie,Mingjie Xiao,22,1997-01-01,177,66,China PR,Shanghai SIPG FC
Zhang Wei,张威,19,2000-05-16,186,75,China PR,Hebei China Fortune FC
Wang Haijian,汪海健,18,2000-08-02,185,74,China PR,Shanghai Greenland Shenhua FC


## 4.2 Drop Columns

In [463]:
df = pd.read_csv('csv files/players_20.csv')

# set index
df.set_index('short_name', inplace=True)

# select columns
df = df[["long_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]

In [464]:
# drop one column

# axis parameter
df.drop('long_name', axis=1)

# columns parameter
df.drop(columns=['long_name'])

Unnamed: 0_level_0,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
L. Messi,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,34,1985-02-05,187,83,Portugal,Juventus
Neymar Jr,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,28,1991-01-07,175,74,Belgium,Real Madrid
...,...,...,...,...,...,...
Shao Shuai,22,1997-03-10,186,79,China PR,Beijing Renhe FC
Xiao Mingjie,22,1997-01-01,177,66,China PR,Shanghai SIPG FC
Zhang Wei,19,2000-05-16,186,75,China PR,Hebei China Fortune FC
Wang Haijian,18,2000-08-02,185,74,China PR,Shanghai Greenland Shenhua FC


In [465]:
# drop column(s) by position/index (drop last and second to last column) and update dataframe
df.drop(columns=df.columns[[-1, -2]], inplace=True)


# or
df.drop(columns=['age', 'height_cm'], inplace=True)

df

Unnamed: 0_level_0,long_name,dob,weight_kg
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
L. Messi,Lionel Andrés Messi Cuccittini,1987-06-24,72
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,1985-02-05,83
Neymar Jr,Neymar da Silva Santos Junior,1992-02-05,68
J. Oblak,Jan Oblak,1993-01-07,87
E. Hazard,Eden Hazard,1991-01-07,74
...,...,...,...
Shao Shuai,邵帅,1997-03-10,79
Xiao Mingjie,Mingjie Xiao,1997-01-01,66
Zhang Wei,张威,2000-05-16,75
Wang Haijian,汪海健,2000-08-02,74


# 5 Create Random Samples: sample() 

In [527]:
df = pd.read_csv('csv files/players_20.csv')
df.set_index('short_name', inplace=True)
df = df[["long_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]

In [516]:
# extract 10 random elements from the "nationality column"
'''
random_state parameter is a way to "freeze" the randomness
'''

df['nationality'].sample(10, random_state=99)

short_name
R. Alvarado               Mexico
Qiao Wei                China PR
E. Paredes                 Chile
K. Stoyanov             Bulgaria
M. Chudý                Slovakia
D. Sundaram                India
R. Gagliardini             Italy
G. Debeljuh              Croatia
Kim Seung Yong    Korea Republic
M. Hansen                Denmark
Name: nationality, dtype: object

In [522]:
# extract random 20% sameple of the dataframe
df.sample(frac=0.2)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P. Terracciano,Pietro Terracciano,29,1990-03-08,193,78,Italy,Fiorentina
Yang Xiaotian,杨笑天,29,1990-03-26,178,72,China PR,Jiangsu Suning FC
Rubén Blanco,Rubén Blanco Veiga,23,1995-07-25,188,70,Spain,RC Celta
A. Warda,Amr Warda,25,1993-09-17,179,77,Egypt,Egypt
André Almeida,André Gomes Magalhaes de Almeida,28,1990-09-10,185,80,Portugal,SL Benfica
...,...,...,...,...,...,...,...
B. Vazquez,Brandon Vazquez,20,1998-10-14,191,79,United States,Atlanta United
Ham Seok Min,함석민 咸胜明,25,1994-02-14,190,85,Korea Republic,Gangwon FC
T. Costantini,Tommaso Costantini,23,1996-06-23,183,77,Italy,SPAL
M. Pfeifer,Manuel Pfeifer,19,1999-09-10,189,78,Austria,TSV Hartberg


In [526]:
# upsample: increase the sampling rate -> frac>1 ('\"replace" parameter has to be True for frac parameter > 1)
df.sample(frac=2, replace=True)

#doubled original number of rows

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
W. Britto,Willie Britto,22,1996-12-15,182,66,Ivory Coast,FC Zürich
A. Morgan,Ashtone Morgan,28,1991-02-09,180,75,Canada,Toronto FC
Zequinha,José Egas dos Santos Branco,32,1987-01-07,183,77,Portugal,Vitória de Setúbal
A. Chakraborty,Abhimoda Chakraborty,33,1985-10-27,189,79,India,India
A. Asiri,Ahmad Hassan Hassan Asiri,27,1991-11-14,177,70,Saudi Arabia,Al Taawoun
...,...,...,...,...,...,...,...
J. Grimmer,Jack Grimmer,25,1994-01-25,184,83,Scotland,Wycombe Wanderers
Madger Gomes,Madger Antonio Gomes Ajú,22,1997-02-01,180,74,Spain,Doncaster Rovers
L. Schellenberg,Lukas Schellenberg,18,2000-08-02,183,78,Germany,FC Ingolstadt 04
A. Rosati,Antonio Rosati,36,1983-06-26,195,88,Italy,Torino


# 6 Filter a Dataframe: qeuery() 

In [528]:
df = pd.read_csv('csv files/players_20.csv')
df.set_index('short_name', inplace=True)
df = df[["long_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]

In [549]:
# select players older than 34
df.query("age> 34")

#exercise: write the equivalent boolean slicing
df[df['age'] > 34].head()

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Z. Ibrahimović,Zlatan Ibrahimović,37,1981-10-03,195,95,Sweden,LA Galaxy
Pepe,Képler Laveran Lima Ferreira,36,1983-02-26,188,81,Portugal,FC Porto
G. Buffon,Gianluigi Buffon,41,1978-01-28,192,92,Italy,Juventus
Casillas,Iker Casillas Fernández,38,1981-05-20,185,84,Spain,FC Porto
F. Quagliarella,Fabio Quagliarella,36,1983-01-31,180,79,Italy,Sampdoria


In [548]:
# select players older than 34 from Italy
df.query("age >34 and nationality =='Italy'")

#exercise: boolean slicing equivalent
df[(df['age'] > 34) & (df['nationality'] == 'Italy')].head()

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G. Buffon,Gianluigi Buffon,41,1978-01-28,192,92,Italy,Juventus
F. Quagliarella,Fabio Quagliarella,36,1983-01-31,180,79,Italy,Sampdoria
D. De Rossi,Daniele De Rossi,35,1983-07-24,184,83,Italy,Boca Juniors
A. Mirante,Antonio Mirante,35,1983-07-08,193,79,Italy,Roma
A. Diamanti,Alessandro Diamanti,36,1983-05-02,180,75,Italy,Western United FC


In [567]:
# add a "not" operator to the first example
df.query("not(age> 34)").head()

#exercise: boolean slicing equivalent
df[~(df['age'] > 34)].head()


Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid


In [575]:
# convert height to meters and select those with height above 1.8
df.query("height_cm/100 > 1.8").head()

#exercise: boolean slicing equivalent
df[(df['height_cm']/100 > 1.8)].head()


Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
K. De Bruyne,Kevin De Bruyne,28,1991-06-28,181,70,Belgium,Manchester City
M. ter Stegen,Marc-André ter Stegen,27,1992-04-30,187,85,Germany,FC Barcelona
V. van Dijk,Virgil van Dijk,27,1991-07-08,193,92,Netherlands,Liverpool


In [581]:
df['dob'].dt.year

short_name
L. Messi             1987
Cristiano Ronaldo    1985
Neymar Jr            1992
J. Oblak             1993
E. Hazard            1991
                     ... 
Shao Shuai           1997
Xiao Mingjie         1997
Zhang Wei            2000
Wang Haijian         2000
Pan Ximing           1993
Name: dob, Length: 18278, dtype: int32

In [588]:
# select players that were born after 1990

#check data type
df.dtypes
# convert "dob" column to datetime type
df["dob"] = df["dob"].astype("datetime64[ns]")
# query
df.query("dob.dt.year > 1990")

#exercise: boolean slicing equivalent
df[df['dob'].dt.year > 1990]

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid
K. De Bruyne,Kevin De Bruyne,28,1991-06-28,181,70,Belgium,Manchester City
M. ter Stegen,Marc-André ter Stegen,27,1992-04-30,187,85,Germany,FC Barcelona
...,...,...,...,...,...,...,...
Shao Shuai,邵帅,22,1997-03-10,186,79,China PR,Beijing Renhe FC
Xiao Mingjie,Mingjie Xiao,22,1997-01-01,177,66,China PR,Shanghai SIPG FC
Zhang Wei,张威,19,2000-05-16,186,75,China PR,Hebei China Fortune FC
Wang Haijian,汪海健,18,2000-08-02,185,74,China PR,Shanghai Greenland Shenhua FC


# 7 apply() 

In [589]:
df = pd.read_csv('csv files/players_20.csv')
df.set_index('short_name', inplace=True)
df = df[["long_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]

In [591]:
'''
used to apply a function to a DataFrame or Series element-wise, row-wise, or column-wise.
'''

# use numpy function and apply it to series
df['age'].apply(np.sqrt)

short_name
L. Messi             5.656854
Cristiano Ronaldo    5.830952
Neymar Jr            5.196152
J. Oblak             5.099020
E. Hazard            5.291503
                       ...   
Shao Shuai           4.690416
Xiao Mingjie         4.690416
Zhang Wei            4.358899
Wang Haijian         4.242641
Pan Ximing           5.099020
Name: age, Length: 18278, dtype: float64

In [595]:
# create your own function and apply it to a dataframe

def calculate_bmi(dataframe):
    return dataframe['weight_kg'] / (dataframe['height_cm']/100)**2

df.apply(calculate_bmi, axis=1)

short_name
L. Messi             24.913495
Cristiano Ronaldo    23.735308
Neymar Jr            22.204082
J. Oblak             24.615211
E. Hazard            24.163265
                       ...    
Shao Shuai           22.835010
Xiao Mingjie         21.066743
Zhang Wei            21.678807
Wang Haijian         21.621622
Pan Ximing           23.547881
Length: 18278, dtype: float64

# 8 Lambda function 

In [596]:
df = pd.read_csv('csv files/players_20.csv')
df.set_index('short_name', inplace=True)
df = df[["long_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]

## 8.1 Basics

In [597]:
# basic function
def sum_values(a,b):
    return a+b

In [599]:
sum_values(2,3)

5

In [600]:
# lambda function (one liner)
sum_values_lambda = lambda a,b: a+b

In [605]:
sum_values_lambda(2,3)

5

## 8.2 apply() + Lambda function 

In [615]:
# use lambda function to convert "height_cm" series to meters
df['height_cm'].apply(lambda x: x/100)

#alternative
df['height_cm']/100

short_name
L. Messi             1.70
Cristiano Ronaldo    1.87
Neymar Jr            1.75
J. Oblak             1.88
E. Hazard            1.75
                     ... 
Shao Shuai           1.86
Xiao Mingjie         1.77
Zhang Wei            1.86
Wang Haijian         1.85
Pan Ximing           1.82
Name: height_cm, Length: 18278, dtype: float64

In [620]:
# use lambda to convert "long_name" series to upper case
df['long_name'].apply(lambda a: a.upper())

#alternative with str attribute
'''
cannot just do df['long_name'].upper() because df['long_name'] is a series 
and not a string unlike 'a' in lambda a: a.upper()
'''

df['long_name'].str.upper()

short_name
L. Messi                  LIONEL ANDRÉS MESSI CUCCITTINI
Cristiano Ronaldo    CRISTIANO RONALDO DOS SANTOS AVEIRO
Neymar Jr                  NEYMAR DA SILVA SANTOS JUNIOR
J. Oblak                                       JAN OBLAK
E. Hazard                                    EDEN HAZARD
                                    ...                 
Shao Shuai                                            邵帅
Xiao Mingjie                                MINGJIE XIAO
Zhang Wei                                             张威
Wang Haijian                                         汪海健
Pan Ximing                                           潘喜明
Name: long_name, Length: 18278, dtype: object

In [623]:
# use lambda to get the year or "dob" of series
df["dob"] = df["dob"].astype("datetime64[ns]")
df["dob"].apply(lambda x: x.year)

#alternative can be found in 6 Filter a Dataframe: qeuery()

short_name
L. Messi             1987
Cristiano Ronaldo    1985
Neymar Jr            1992
J. Oblak             1993
E. Hazard            1991
                     ... 
Shao Shuai           1997
Xiao Mingjie         1997
Zhang Wei            2000
Wang Haijian         2000
Pan Ximing           1993
Name: dob, Length: 18278, dtype: int64

In [627]:
# apply lambda function to a dataframe to calculate bmi
df.apply(lambda x:x['weight_kg']/(x['height_cm']/100)**2, axis=1)

short_name
L. Messi             24.913495
Cristiano Ronaldo    23.735308
Neymar Jr            22.204082
J. Oblak             24.615211
E. Hazard            24.163265
                       ...    
Shao Shuai           22.835010
Xiao Mingjie         21.066743
Zhang Wei            21.678807
Wang Haijian         21.621622
Pan Ximing           23.547881
Length: 18278, dtype: float64

# 9 Copy a Dataframe: copy() 

In [628]:
df = pd.read_csv('csv files/players_20.csv')
df.set_index('short_name', inplace=True)
df = df[["long_name", "age", "dob", "height_cm", "weight_kg", "nationality","club"]]

## 9.1 deep = True 

In [639]:
# deep=True by default 
# (modifications made to original dataframe will not be reflected in the coy and vice versa)
df_copy = df.copy()
df.loc["L. Messi", "height_cm"] = 190

In [642]:
df.head()
df_copy.head()

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,200,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid


## 9.2 deep = False 

In [632]:
# any modifications made to the original dataframe will be reflected on the copy
df_copy = df.copy(deep=False)

In [633]:
df.loc["Cristiano Ronaldo", "height_cm"] = 200

In [636]:
df.head()
df_copy.head()

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,200,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid


## 9.3 Simple assignment (equivalent to deep=False)

In [648]:
# make a copy
df_new_copy = df

In [644]:
# update value in original datframe
df.loc["Neymar Jr", "height_cm"] = 190

In [649]:
df.head()
df_new_copy.head()

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,190,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,200,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,190,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid
