<a href="https://colab.research.google.com/github/sajid-munawar/Data_Cleaning_with_python/blob/main/Standardisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning with Python - Standardisation

## 1. Standardisation

#### <font color="blue">Pre-requisites</font>

In [1]:
# Pre-requisites
# ---
# Importing pandas and numpy library
# ---
# OUR CODE GOES BELOW
# 
import pandas as pd

import numpy as np

#### <font color="blue">Examples</font>

##### <font color="blue">Example 1</font>

In [2]:
# Example 1
# --- 
# Renaming column names
# ---
# Dataset url = http://bit.ly/SampleDataset
# ---
# OUR CODE GOES BELOW
# 

# Reading our dataset from the url 
# ---
# 
df = pd.read_csv('http://bit.ly/SampleDataset')
df.head()
 

Unnamed: 0,NAME,CITY,COUNTRY,HEIGHT,WEIGHT,ACCOUNT A,ACCOUNT B,TOTAL ACCOUNT
0,Adi Dako,LISBON,PORTUGAL,56,132.0,2390.0,4340,6730
1,John Paul,LONDON,UNITED KINGDOM,62,165.0,4500.0,34334,38834
2,Cindy Jules,Stockholm,Sweden,48,117.0,,5504,8949
3,Arthur Kegels,BRUSSELS,BELGIUM,59,121.0,4344.0,8999,300
4,Freya Bismark,Berlin,GERMANYY,53,126.0,7000.0,19000,26000


In [3]:
df.columns

Index(['NAME', 'CITY', 'COUNTRY', 'HEIGHT', 'WEIGHT', 'ACCOUNT A', 'ACCOUNT B',
       'TOTAL ACCOUNT'],
      dtype='object')

In [4]:
df.columns=df.columns.str.strip().str.lower().str.replace(' ','_')

In [6]:
df.head()

Unnamed: 0,name,city,country,height,weight,account_a,account_b,total_account
0,Adi Dako,LISBON,PORTUGAL,56,132.0,2390.0,4340,6730
1,John Paul,LONDON,UNITED KINGDOM,62,165.0,4500.0,34334,38834
2,Cindy Jules,Stockholm,Sweden,48,117.0,,5504,8949
3,Arthur Kegels,BRUSSELS,BELGIUM,59,121.0,4344.0,8999,300
4,Freya Bismark,Berlin,GERMANYY,53,126.0,7000.0,19000,26000


In [None]:
# Example 1a
# ---
# Renaming the column names
# ---
#
df.columns = ['name', 'city', 'country', 'height', 'weight', 'account_a', 'account_b', 'total_account']
df.head()

In [None]:
# Example 1b
# ---
# Renaming our columns, if we have many column names.
# ---
# 
df1 = pd.read_csv('http://bit.ly/SampleDataset')
df1.head()

In [None]:
# We use the str.strip(), str.lower(), str.replace() functions
# ---
df1.columns = df1.columns.str.strip().str.lower().str.replace(' ', '_')
df1.head()

##### <font color="blue">Example 2</font>

In [7]:
# Example 2
# ---
# String conversion: Convert city column to lower case
# ---
# Dataset url = http://bit.ly/SampleDataset
# ---
# OUR CODE GOES BELOW
# 

# Converting the city column to lowercase 
# ---
# 
df['city'] = df['city'].str.lower()
df.head()

Unnamed: 0,name,city,country,height,weight,account_a,account_b,total_account
0,Adi Dako,lisbon,PORTUGAL,56,132.0,2390.0,4340,6730
1,John Paul,london,UNITED KINGDOM,62,165.0,4500.0,34334,38834
2,Cindy Jules,stockholm,Sweden,48,117.0,,5504,8949
3,Arthur Kegels,brussels,BELGIUM,59,121.0,4344.0,8999,300
4,Freya Bismark,berlin,GERMANYY,53,126.0,7000.0,19000,26000


##### <font color="blue">Example 3</font>

In [8]:
# Example 3
# ---
# Metric conversion: Height inches to cm
# Hint: 1 inch = 2.54 cm
# ---
# Dataset url = http://bit.ly/SampleDataset
# ---
# 

df['height'] = df['height'] * 2.54
df.head()

Unnamed: 0,name,city,country,height,weight,account_a,account_b,total_account
0,Adi Dako,lisbon,PORTUGAL,142.24,132.0,2390.0,4340,6730
1,John Paul,london,UNITED KINGDOM,157.48,165.0,4500.0,34334,38834
2,Cindy Jules,stockholm,Sweden,121.92,117.0,,5504,8949
3,Arthur Kegels,brussels,BELGIUM,149.86,121.0,4344.0,8999,300
4,Freya Bismark,berlin,GERMANYY,134.62,126.0,7000.0,19000,26000


In [9]:
# Determining the datatypes
# ---
# 
df.dtypes

name              object
city              object
country           object
height           float64
weight           float64
account_a        float64
account_b          int64
total_account      int64
dtype: object

In [10]:
# Performing our conversion
# ---
# 
df['height'] = df['height'].apply(np.int64)

# Confirming our conversion
# ---
# 
df.dtypes

name              object
city              object
country           object
height             int64
weight           float64
account_a        float64
account_b          int64
total_account      int64
dtype: object

#### <font color="green">Challenges</font> 

##### <font color="green">Challenge 1</font>

In [12]:
# Challenge 1
# ---
# Question: Rename the columns in the following dataset.
# ---
# Dataset url = http://bit.ly/GVProjectsFunding
# ---
# OUR CODE GOES BELOW
df=pd.read_csv('http://bit.ly/GVProjectsFunding')
df.head()

Unnamed: 0,Total_-_GOK_Budget_Est_KES,Total_-_Loan_Budget_Est_KES,Total_-_Grant_Budget_Est_KES,Total_Budget_Supported__by_Donors_KES,Total_Project_Cost_KES,Funding_Source,OBJECTID
0,111285239,609690025,2100000000,2709690025,25162065397,Government of Sweden,0
1,2009277440,7443790000,1436627037,8880417037,12567296534,United Nations Development Programme (UNDP),1
2,50034000,0,1705540000,1705540000,4015101726,World Food Programme,2
3,24267555932,144199000000,1700000000,145899000000,445033000000,African Development Bank/ Fund,3
4,1160000000,2085000000,600000000,2685000000,174293000000,European Investment Bank,4


In [14]:
df.columns=df.columns.str.strip().str.lower().str.replace('-','')
df.head()

Unnamed: 0,total__gok_budget_est_kes,total__loan_budget_est_kes,total__grant_budget_est_kes,total_budget_supported__by_donors_kes,total_project_cost_kes,funding_source,objectid
0,111285239,609690025,2100000000,2709690025,25162065397,Government of Sweden,0
1,2009277440,7443790000,1436627037,8880417037,12567296534,United Nations Development Programme (UNDP),1
2,50034000,0,1705540000,1705540000,4015101726,World Food Programme,2
3,24267555932,144199000000,1700000000,145899000000,445033000000,African Development Bank/ Fund,3
4,1160000000,2085000000,600000000,2685000000,174293000000,European Investment Bank,4


##### <font color="green">Challenge 2</font>

In [21]:
# Challenge 2
# ---
# Question: Convert the given weight in the dataset from pounds to kgs. 
# Hint: 1 pound = 0.453592 kgs
# ---
# Dataset url = http://bit.ly/SampleDataset
# ---
# OUR CODE GOES BELOW
df=pd.read_csv('http://bit.ly/SampleDataset')
df.head()



Unnamed: 0,NAME,CITY,COUNTRY,HEIGHT,WEIGHT,ACCOUNT A,ACCOUNT B,TOTAL ACCOUNT
0,Adi Dako,LISBON,PORTUGAL,56,132.0,2390.0,4340,6730
1,John Paul,LONDON,UNITED KINGDOM,62,165.0,4500.0,34334,38834
2,Cindy Jules,Stockholm,Sweden,48,117.0,,5504,8949
3,Arthur Kegels,BRUSSELS,BELGIUM,59,121.0,4344.0,8999,300
4,Freya Bismark,Berlin,GERMANYY,53,126.0,7000.0,19000,26000


In [22]:
df['WEIGHT']=df['WEIGHT']*0.453592
df.head()

Unnamed: 0,NAME,CITY,COUNTRY,HEIGHT,WEIGHT,ACCOUNT A,ACCOUNT B,TOTAL ACCOUNT
0,Adi Dako,LISBON,PORTUGAL,56,59.874144,2390.0,4340,6730
1,John Paul,LONDON,UNITED KINGDOM,62,74.84268,4500.0,34334,38834
2,Cindy Jules,Stockholm,Sweden,48,53.070264,,5504,8949
3,Arthur Kegels,BRUSSELS,BELGIUM,59,54.884632,4344.0,8999,300
4,Freya Bismark,Berlin,GERMANYY,53,57.152592,7000.0,19000,26000


##### <font color="green">Challenge 3</font>

In [23]:
# Challenge 3
# ---
# Question: Convert the variables account_a and account_b to the datatype float
# ---
# Dataset url = http://bit.ly/SampleDataset
# ---
# OUR CODE GOES BELOW
df=pd.read_csv('http://bit.ly/SampleDataset')
df.head()


Unnamed: 0,NAME,CITY,COUNTRY,HEIGHT,WEIGHT,ACCOUNT A,ACCOUNT B,TOTAL ACCOUNT
0,Adi Dako,LISBON,PORTUGAL,56,132.0,2390.0,4340,6730
1,John Paul,LONDON,UNITED KINGDOM,62,165.0,4500.0,34334,38834
2,Cindy Jules,Stockholm,Sweden,48,117.0,,5504,8949
3,Arthur Kegels,BRUSSELS,BELGIUM,59,121.0,4344.0,8999,300
4,Freya Bismark,Berlin,GERMANYY,53,126.0,7000.0,19000,26000


In [24]:
df.dtypes

NAME              object
CITY              object
COUNTRY           object
HEIGHT             int64
WEIGHT           float64
ACCOUNT A        float64
ACCOUNT B          int64
TOTAL ACCOUNT      int64
dtype: object

In [27]:
df['ACCOUNT A']=df['ACCOUNT A'].apply(np.float64)
df['ACCOUNT B']=df['ACCOUNT B'].apply(np.float64)

In [28]:
df.dtypes

NAME              object
CITY              object
COUNTRY           object
HEIGHT             int64
WEIGHT           float64
ACCOUNT A        float64
ACCOUNT B        float64
TOTAL ACCOUNT      int64
dtype: object