In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the players table from the CSV file (path is relative to the working directory)
df_players = pd.read_csv(filepath_or_buffer='dataset/players_20.csv')

In [None]:
# Index the rows by short_name so players can be addressed by label with .loc
df_players = df_players.set_index('short_name')

# Select the columns used in the rest of the notebook.
# .copy() makes the subset an independent frame, so the .loc/.iloc
# assignments in later cells do not emit SettingWithCopyWarning.
df_players = df_players[['long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality']].copy()

In [None]:
# Display the frame indexed by short_name and restricted to the selected columns
df_players

# **Set new value to a cell**

In [None]:
# Update the height of L. Messi: .loc[row_label, column_label] addresses one cell.
# The row label comes from the short_name index set earlier.
df_players.loc['L. Messi', 'height_cm'] = 175

In [None]:
# Display the frame to check the height_cm value written for 'L. Messi'
df_players

In [None]:
# Set all values in a column: the ':' row selector targets every row.
# NOTE: this replaces the heights loaded from the CSV; every later cell that
# reads height_cm sees 190 unless the value is overwritten again.
df_players.loc[:, 'height_cm'] = 190

In [None]:
# Show the height_cm column as a Series
df_players.loc[:, 'height_cm']

In [None]:
# Inspect the last row (position -1) before new values are written to it
df_players.iloc[-1]

In [None]:
# Set null values to last row
# (numpy is imported together with pandas in the import cell at the top)
df_players.iloc[-1, :] = np.nan

In [None]:
# Show the last row again to check the values written to it
df_players.iloc[-1]

In [None]:
# Set values in multiple rows and columns at once.
# Every row whose index label appears in the first list receives the value
# in each column named in the second list.
df_players.loc[['L. Messi', 'Cristiano Ronaldo'], ['height_cm']] = 175

In [None]:
# Display the frame to check the height_cm written for the two selected players
df_players

In [None]:
# Set value for rows that meet a condition.
# The assignment runs on a copy: an earlier cell set every height_cm to 190, so
# the mask below matches almost every row, and zeroing df_players itself would
# wipe the age/height values that the query() and apply() chapters rely on
# (calc_imc raises ValueError for a height of 0).
columns = ['long_name', 'age', 'height_cm']
df_players_zeroed = df_players.copy()
df_players_zeroed.loc[df_players_zeroed['height_cm'] > 180, columns] = 0
df_players_zeroed

In [None]:
# Display df_players to check its current values
df_players

# **Delete columns or rows**

In [None]:
# Remove a single row; drop() returns a new frame and leaves df_players as is

# Variant 1: a label plus an explicit axis
df_players.drop(labels='L. Messi', axis='index')

# Variant 2: the index= keyword
df_players.drop(index=['Cristiano Ronaldo'])

In [None]:
# Delete two or more rows and update the original dataframe.
# Rebinding the result replaces inplace=True; errors='ignore' keeps the cell
# re-runnable once the two labels are already gone.
df_players = df_players.drop(index=['L. Messi', 'Cristiano Ronaldo'], errors='ignore')

In [None]:
# Display the frame after the two rows were removed
df_players

In [None]:
# Remove a single column; both calls return a new frame

# Variant 1: a label plus an explicit axis
df_players.drop(labels='long_name', axis='columns')

# Variant 2: the columns= keyword
df_players.drop(columns=['dob'])

In [None]:
# Delete columns by position: positions 0 and 1 select the first two columns.
# drop() returns a new frame; df_players itself is not modified.
df_players.drop(columns=df_players.columns[[0, 1]])

In [None]:
# Delete two or more columns.
# The result is only displayed, not written back with inplace=True: later
# cells still read df_players['dob'] and df_players['long_name'].
df_players.drop(columns=['long_name', 'dob'])

In [None]:
# Display df_players to check which columns it currently holds
df_players

## **Chapter 2: method sample() args: n, frac, replace, weights, random_state, axis**

In [None]:
# Draw ten random entries from the "nationality" column (seeded, so repeatable)
df_players['nationality'].sample(n=10, random_state=99)

In [None]:
# Get 20% of random samples from the dataframe
# random_state pins the draw so a re-run returns the same rows
df_players.sample(frac=0.2, random_state=99)

In [None]:
# Increase the sample -> frac > 1 (replace must be True, because more rows are
# drawn than the frame contains)
# random_state pins the draw so a re-run returns the same rows
df_players.sample(frac=2, replace=True, random_state=99)

## **Chapter 3: method query() args: expr, inplace, etc.**

In [None]:
# Select players older than 34
df_players.query("age >  34")
# exercise: the same using boolean indexing -- the mask has to index the
# frame, otherwise only the True/False Series is shown
df_players[df_players['age'] > 34]

In [None]:
# Display df_players; query() (without inplace) and boolean indexing return
# new objects, so the frame itself is not changed by the cell above
df_players

In [None]:
# Players older than 34 whose nationality is Italy
df_players.query('age > 34 and nationality == "Italy"')
# Same filter with boolean indexing, built from two named masks
older_than_34 = df_players['age'].gt(34)
from_italy = df_players['nationality'].eq('Italy')
df_players[older_than_34 & from_italy]

In [None]:
# Negate the first example: keep the players that are NOT older than 34
#df_players.query('not age > 34')['age']

# boolean indexing: row mask and column are passed to .loc together
not_older_than_34 = ~(df_players['age'] > 34)
df_players.loc[not_older_than_34, 'age']

In [None]:
# Players taller than 1.8 m (height_cm converted to metres) and older than 30
#df_players.query('height_cm/100 > 1.8 and age > 30')[['height_cm', 'age']]
# boolean indexing
taller_than_1_8_m = df_players['height_cm'] / 100 > 1.8
older_than_30 = df_players['age'] > 30
df_players.loc[taller_than_1_8_m & older_than_30, ['height_cm', 'age']]

In [None]:
# Select players born after 1990
# check type of data
#df_players['dob'].dtypes
# read_csv was called without parse_dates, so "dob" is not a datetime column
# yet: parse it into timezone-aware (UTC) datetimes so the .dt accessor works.
# pd.to_datetime is idempotent, so the cell can be re-run safely.
df_players['dob'] = pd.to_datetime(df_players['dob'], utc=True)
#df_players['dob'].dt.year
# query() equivalent
#df_players.query('dob.dt.year > 1990')['dob'].dt.year

# boolean indexing: .loc takes the row mask and the column in one step
# (avoids the chained df[mask]['dob'] lookup)
df_players.loc[df_players['dob'].dt.year > 1990, 'dob'].dt.year


##  **Chapter 4: method .apply() args: func, axis, raw, result_type, etc.**


In [None]:
# Pass a NumPy function to apply(): the square root of every age
df_players['age'].apply(func=np.sqrt)

In [None]:
# Row-wise IMC (body-mass index) helper.
# DataFrame.apply(axis=1) hands each row to the function as a Series, so the
# values are looked up by column label instead of arriving as separate arguments.
def calc_imc(row) -> float:
    """Return the IMC (body-mass index) for a single player row.

    Args:
        row: Row-like object (e.g. a pandas Series) that can be indexed with
            'weight_kg' and 'height_cm'.

    Returns:
        The weight in kilograms divided by the squared height in metres.

    Raises:
        ValueError: If the height is zero or negative.
    """
    mass_kg = row['weight_kg']
    # height_cm is stored in centimetres; the formula needs metres
    height_m = row['height_cm'] / 100
    if height_m <= 0:
        raise ValueError("Height must be greater than 0.")
    imc = mass_kg / (height_m ** 2)
    return imc

In [None]:
# Row-wise apply: calc_imc is called once per row and receives it as a Series
df_players.apply(func=calc_imc, axis='columns')

## **IMPORTANT: Lambda functions are more concise (not faster) than defining a full function with def; they suit simple operations that are straightforward and used only once.**

In [None]:
# Lambda function that doubles each age
df_players['age'].apply(lambda age: 2 * age)

In [None]:
# Another example: a two-argument lambda bound to a name so it can be reused
sum_values = (lambda a, b: a + b)

In [None]:
# Call the lambda like any other function; 4 + 5 evaluates to 9
sum_values(4, 5)

In [None]:
# "height_cm" expressed in metres
# lambda version:
#df_players['height_cm'].apply(lambda x: x/100)

# Vectorised alternative: divide the whole column at once
df_players['height_cm'].div(100)

In [None]:
# Lambda function to convert "long_name" to uppercase.
# Non-string cells (e.g. the NaN written to the last row earlier) are passed
# through unchanged instead of raising AttributeError on .upper()
df_players['long_name'].apply(lambda x: x.upper() if isinstance(x, str) else x)

# Alternative
#df_players['long_name'].str.upper()

In [None]:
# Convert "dob" to timezone-aware (UTC) datetimes so the year can be extracted.
# pd.to_datetime is idempotent, so the cell is safe even if "dob" was already converted.
df_players['dob'] = pd.to_datetime(df_players['dob'], utc=True)

In [None]:
# Use a lambda function to get the year from the "dob" column
df_players['dob'].apply(lambda born: born.year)

# Alternative: the .dt accessor
#df_players['dob'].dt.year

In [None]:
# IMC computed inline with a row-wise lambda instead of a named function
df_players.apply(
    lambda player: player['weight_kg'] / ((player['height_cm'] / 100) ** 2),
    axis=1,
)

## **Important method copy args: deep**

In [None]:
# deep=True (default) -> the copy owns its data, so changes made to the new
# dataframe do not affect the original dataframe (and vice versa)
df_players_copy = df_players.copy(deep=True)

In [32]:
# update height_cm in the copy dataframe
# 'L. Messi' was dropped from df_players earlier, so assigning to that label
# would append a new, mostly-NaN row; the first remaining row is updated instead.
df_players_copy.iloc[0, df_players_copy.columns.get_loc('height_cm')] = 180

In [None]:
# See the copy: display df_players_copy so it can be compared with df_players
df_players_copy