# Data Cleaning

In [1]:
import numpy as np
import pandas as pd

In [2]:
# 1. Missing values
#     Dropping
#     Replacing
# 2. Data formatting
# 3. Data normalization
# 4. Dealing with categorical values

In [3]:
# Toy records: every column contains exactly one missing entry (np.nan).
data = dict(
    Jina=['Rahma', np.nan, 'Juma'],
    Umri=[np.nan, 23, 26],
    Jinsia=['Female', np.nan, 'Male'],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Jina,Umri,Jinsia
0,Rahma,,Female
1,,23.0,
2,Juma,26.0,Male


In [4]:
# dropping

In [None]:
# Display the rows left after discarding those whose 'Umri' is missing.
# The result is not assigned, so df itself keeps all of its rows.
df.dropna(axis='index', subset=['Umri'])

In [7]:
# replacing

In [9]:
# Impute the missing 'Umri' entry with the mean of the observed values.
# Series.mean() skips NaN by default, so only the known values contribute.
mean = df['Umri'].mean()
# fillna() is the dedicated pandas call for filling missing values; it touches
# only the NaN positions and leaves every other entry as it was.
df['Umri'] = df['Umri'].fillna(mean)
df

Unnamed: 0,Jina,Umri,Jinsia
0,Rahma,24.5,Female
1,,23.0,
2,Juma,26.0,Male


In [10]:
# Data formatting

In [11]:
# The same country written three different ways, to be normalized next.
country = dict(Country=['Tanzania', 'tz', 'Tz'])
df = pd.DataFrame(data=country)
df

Unnamed: 0,Country
0,Tanzania
1,tz
2,Tz


In [13]:
# Map each abbreviated spelling onto the canonical country name;
# values that are not keys of `corrections` are left unchanged by replace().
corrections = dict(tz='Tanzania', Tz='Tanzania')
df['Country'] = df['Country'].replace(to_replace=corrections)
df

Unnamed: 0,Country
0,Tanzania
1,Tanzania
2,Tanzania


In [14]:
# Inspect the dtype of each column in the frame.
df.dtypes

Country    object
dtype: object

In [15]:
# Dealing with Categorical variables

In [17]:
# A single categorical column whose values are number words.
idadi = dict(idadi=['moja', 'mbili', 'tatu', 'moja', 'tatu'])
df = pd.DataFrame(data=idadi)
df

Unnamed: 0,idadi
0,moja
1,mbili
2,tatu
3,moja
4,tatu


In [18]:
# Frequency of each distinct value in the 'idadi' column.
df['idadi'].value_counts()

tatu     2
moja     2
mbili    1
Name: idadi, dtype: int64

In [19]:
# Method 1: Find and Replace

In [20]:
# Encode each number word as its integer value.
corrections = {'moja': 1, 'mbili': 2, 'tatu': 3}
# map() builds the integer column directly from the lookup table, rather than
# relying on replace() to downcast an object column to integers afterwards
# (recent pandas versions warn that this downcasting is deprecated).
# NOTE: unlike replace(), a word that is missing from `corrections` would
# become NaN here; every value in this column has an entry.
df['idadi'] = df['idadi'].map(corrections)
df

Unnamed: 0,idadi
0,1
1,2
2,3
3,1
4,3


In [21]:
# Method 2: Label Encoding

In [22]:
# Nominal column used to demonstrate label encoding.
country = dict(Country=['Tanzania', 'Kenya', 'Uganda', 'Tanzania'])
df = pd.DataFrame(data=country)
df

Unnamed: 0,Country
0,Tanzania
1,Kenya
2,Uganda
3,Tanzania


In [23]:
# Count how often each distinct row occurs; with a single column this is
# the frequency of each country.
df.value_counts()

Country 
Tanzania    2
Uganda      1
Kenya       1
dtype: int64

In [24]:
# Kenya:0
# Tanzania:1
# Uganda:2

In [25]:
# Column dtypes before converting 'Country' to a categorical.
df.dtypes

Country    object
dtype: object

In [26]:
# Convert the text column to pandas' categorical dtype so that each distinct
# value is backed by an integer code.
df['Country'] = df['Country'].astype(dtype='category')

In [27]:
# Confirm that 'Country' now has the category dtype.
df.dtypes

Country    category
dtype: object

In [28]:
# Preview the integer code assigned to each row's category.
df['Country'].cat.codes

0    1
1    0
2    2
3    1
dtype: int8

In [30]:
# Overwrite the categorical column with its integer codes. After this the
# column is no longer categorical, so `.cat` is unavailable on it and the
# original labels are gone from df.
df['Country'] = df['Country'].cat.codes

In [31]:
# Display the label-encoded frame.
df

Unnamed: 0,Country
0,1
1,0
2,2
3,1


In [32]:
# Data Normalization
# Feature Scaling
# Min-Max
# Z-score

In [33]:
# Numeric column with a wide spread of values, used by the scaling examples.
df = pd.DataFrame(data=dict(number=[290, 10, 30, 500]))
df

Unnamed: 0,number
0,290
1,10
2,30
3,500


In [34]:
# Largest value in the column; the feature-scaling step divides by it.
df['number'].max()

500

In [35]:
# Feature scaling: x / max(x), so the largest value becomes 1.
df['number'] = df['number'].div(df['number'].max())
df

Unnamed: 0,number
0,0.58
1,0.02
2,0.06
3,1.0


In [36]:
# Min-Max: (x - min) / (max - min), which maps the column onto [0, 1].
# Note that it operates on the column's current values, i.e. whatever the
# previous cell left in df['number'].
df['number'] = (
    df['number']
    .sub(df['number'].min())
    .div(df['number'].max() - df['number'].min())
)
df

Unnamed: 0,number
0,0.571429
1,0.0
2,0.040816
3,1.0


In [38]:
# Z-score, step 1: mean of the column's current values.
# Note: this rebinds the name `mean` used earlier for the 'Umri' imputation.
mean = df['number'].mean()
mean

0.4030612244897959

In [39]:
# Z-score, step 2: standard deviation of the column's current values
# (pandas' Series.std() defaults to the sample estimate, ddof=1).
std = df['number'].std()
std

0.4755218824293637

In [40]:
# Z-score: (x - mean) / std.
# Both statistics are taken from the column's current values inside this cell
# rather than from the `mean`/`std` variables of earlier cells. Those variables
# go stale once the column is overwritten, so re-running the cell with them
# would shift and scale already standardized data a second time.
# Series.std() uses the sample standard deviation (ddof=1) by default.
df['number'] = (df['number'] - df['number'].mean()) / df['number'].std()
df

Unnamed: 0,number
0,0.354069
1,-0.847619
2,-0.761784
3,1.255334
