In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning and
# FutureWarning; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## 1. *df.replace()* function

### 1.1 Examples

In [24]:
# Load the world happiness CSV from the mounted Drive folder and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/world_happiness_dataset.csv')
df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [25]:
# Keep only the country name and the happiness score for the replace() examples.
selected_columns = ['Country or region', 'Score']
df_world = df[selected_columns]
df_world.head()

Unnamed: 0,Country or region,Score
0,Finland,7.769
1,Denmark,7.6
2,Norway,7.554
3,Iceland,7.494
4,Netherlands,7.488


In [26]:
# Replace one exact value with another everywhere in the frame.
df_world = df_world.replace(to_replace='Finland', value='Vinland')
df_world.head()

Unnamed: 0,Country or region,Score
0,Vinland,7.769
1,Denmark,7.6
2,Norway,7.554
3,Iceland,7.494
4,Netherlands,7.488


In [27]:
# Replace several exact values with the same single replacement value.
nordic_names = ['Vinland', 'Denmark', 'Norway']
df_world = df_world.replace(to_replace=nordic_names, value='Scandinavia')
df_world.head()

Unnamed: 0,Country or region,Score
0,Scandinavia,7.769
1,Scandinavia,7.6
2,Scandinavia,7.554
3,Iceland,7.494
4,Netherlands,7.488


In [28]:
# Build a boolean "score above 7.5" series, then recode it with replace().
# Note the mapping: True (score above 7.5) becomes 0 and False becomes 1.
above_threshold = df_world['Score'] > 7.5
df_world['Score 1'] = above_threshold.replace([True, False], [0, 1])
df_world.head()

Unnamed: 0,Country or region,Score,Score 1
0,Scandinavia,7.769,0
1,Scandinavia,7.6,0
2,Scandinavia,7.554,0
3,Iceland,7.494,1
4,Netherlands,7.488,1


### 1.2 Cases where the replace function falls short
- **When multiple values need to be replaced**. In this case, the *replace* function leads to tedious and inefficient code, because every value has to be listed explicitly.
- **When a continuous range of values needs to be replaced**. *replace* matches specific values, so a range such as "every score above 7.5" cannot be expressed directly.
- **When values fitting a certain condition need to be replaced**. Since the specific values are not known in advance in this case, the *replace* function is of little use.

## 2. *np.where()* function

In [30]:
# Re-read the CSV so this section starts from a freshly loaded frame, then preview it.
df = pd.read_csv('/content/drive/MyDrive/world_happiness_dataset.csv')
df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [31]:
# Take an independent copy of the two columns used in this section. The next cell
# adds a 'Score 1' column; doing that on a bare slice of `df` is the pattern that
# raises pandas' SettingWithCopyWarning, so copy explicitly.
df_world = df[['Country or region','Score']].copy()
df_world.head()

Unnamed: 0,Country or region,Score
0,Finland,7.769
1,Denmark,7.6
2,Norway,7.554
3,Iceland,7.494
4,Netherlands,7.488


In [32]:
# Vectorised if/else: 1 where the score is strictly above 7.5, otherwise 0.
is_high_score = df_world['Score'].gt(7.5)
df_world['Score 1'] = np.where(is_high_score, 1, 0)
df_world.head()

Unnamed: 0,Country or region,Score,Score 1
0,Finland,7.769,1
1,Denmark,7.6,1
2,Norway,7.554,1
3,Iceland,7.494,0
4,Netherlands,7.488,0


## 3. *Series.between()* function

In [33]:
# Re-read the CSV so this section starts from a freshly loaded frame, then preview it.
df = pd.read_csv('/content/drive/MyDrive/world_happiness_dataset.csv')
df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [34]:
# Take an independent copy of the two columns used in this section. The next cell
# adds a 'Score 1' column; doing that on a bare slice of `df` is the pattern that
# raises pandas' SettingWithCopyWarning, so copy explicitly.
df_world = df[['Country or region','Score']].copy()
df_world.head()

Unnamed: 0,Country or region,Score
0,Finland,7.769
1,Denmark,7.6
2,Norway,7.554
3,Iceland,7.494
4,Netherlands,7.488


In [35]:
# Flag scores that fall inside the 7.5 to 7.6 range: 1 inside the range, 0 outside.
in_range = df_world['Score'].between(7.5, 7.6)
df_world['Score 1'] = np.where(in_range, 1, 0)
df_world.head()

Unnamed: 0,Country or region,Score,Score 1
0,Finland,7.769,0
1,Denmark,7.6,1
2,Norway,7.554,1
3,Iceland,7.494,0
4,Netherlands,7.488,0


## 4. *df.mask()* function

In [36]:
# Re-read the CSV so this section starts from a freshly loaded frame, then preview it.
df = pd.read_csv('/content/drive/MyDrive/world_happiness_dataset.csv')
df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [37]:
# Take an independent copy of the two columns used in this section. The next cell
# adds a 'Score 1' column; doing that on a bare slice of `df` is the pattern that
# raises pandas' SettingWithCopyWarning, so copy explicitly.
df_world = df[['Country or region','Score']].copy()
df_world.head()

Unnamed: 0,Country or region,Score
0,Finland,7.769
1,Denmark,7.6
2,Norway,7.554
3,Iceland,7.494
4,Netherlands,7.488


In [38]:
# First mask step: set every score that does not exceed 7.5 to 0 and keep the rest.
# The comparison is <= (not <) so that a score of exactly 7.5 is zeroed as well;
# with a strict < it would be left as 7.5 here and also skipped by the following
# "> 7.5" step, so the column would not end up as a clean 0/1 flag.
df_world['Score 1'] = df_world['Score'].mask(df_world['Score'] <= 7.5,0)
df_world.head()

Unnamed: 0,Country or region,Score,Score 1
0,Finland,7.769,7.769
1,Denmark,7.6,7.6
2,Norway,7.554,7.554
3,Iceland,7.494,0.0
4,Netherlands,7.488,0.0


In [39]:
# Second mask step: any remaining value strictly above 7.5 is replaced by 1.
score_flag = df_world['Score 1']
df_world['Score 1'] = score_flag.mask(score_flag > 7.5, 1)
df_world.head()

Unnamed: 0,Country or region,Score,Score 1
0,Finland,7.769,1.0
1,Denmark,7.6,1.0
2,Norway,7.554,1.0
3,Iceland,7.494,0.0
4,Netherlands,7.488,0.0


## 5. Replace, Mask, Where and Between functions - More examples

https://stackoverflow.com/questions/46168450/replace-a-specific-range-of-values-in-a-pandas-dataframe

In [15]:
# Small demo frame: two integer columns (A: 0-4, B: 5-9) and one string column (C: a-e).
df = pd.DataFrame({
    'A': list(range(5)),
    'B': list(range(5, 10)),
    'C': list('abcde'),
})

df

Unnamed: 0,A,B,C
0,0,5,a
1,1,6,b
2,2,7,c
3,3,8,d
4,4,9,e


In [16]:
# Dict form of replace(): each key is a value to find, each dict value its replacement,
# applied across all columns.
df.replace(to_replace={0: 10, 1: 100})

Unnamed: 0,A,B,C
0,10,5,a
1,100,6,b
2,2,7,c
3,3,8,d
4,4,9,e


In [17]:
# Per-column lookup: replace 0 in column A and 5 in column B, both with 100.
df.replace(to_replace={'A': 0, 'B': 5}, value=100)

Unnamed: 0,A,B,C
0,100,100,a
1,1,6,b
2,2,7,c
3,3,8,d
4,4,9,e


In [18]:
# Nested dict: the outer key selects the column, the inner dict maps old -> new values.
column_a_mapping = {0: 100, 4: 400}
df.replace({'A': column_a_mapping})

Unnamed: 0,A,B,C
0,100,5,a
1,1,6,b
2,2,7,c
3,3,8,d
4,400,9,e


In [19]:
# Second demo frame: A holds 1-9 and B holds 11-19.
new_dict = {
    'A': list(range(1, 10)),
    'B': list(range(11, 20)),
}
df_dict = pd.DataFrame(data=new_dict)
df_dict

Unnamed: 0,A,B
0,1,11
1,2,12
2,3,13
3,4,14
4,5,15
5,6,16
6,7,17
7,8,18
8,9,19


In [20]:
# mask() replaces the entries where the condition is True: values above 4 become 100.
col_a = df_dict['A']
col_a.mask(col_a > 4, other=100)

0      1
1      2
2      3
3      4
4    100
5    100
6    100
7    100
8    100
Name: A, dtype: int64

In [21]:
# np.where equivalent: 100 where A is above 4, otherwise the original value.
col_a = df_dict['A']
np.where(col_a.gt(4), 100, col_a)

array([  1,   2,   3,   4, 100, 100, 100, 100, 100])

In [22]:
# Range condition: 100 where A lies between 3 and 6, otherwise the original value.
col_a = df_dict['A']
np.where(col_a.between(3, 6), 100, col_a)

array([  1,   2, 100, 100, 100, 100,   7,   8,   9])