# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('hepatitis.csv')

In [3]:
data.shape

(155, 11)

In [4]:
data.head(n=5)

Unnamed: 0,Class,Age,Sex,Steroid,Anaroxia,Spleen Pabable,Spiders,Bilirubin,Alk Phospat,Albumin,Protime
0,die,30,female,no,yes,yes,yes,1.0,85.0,4.0,
1,die,50,male,no,yes,yes,yes,0.9,135.0,3.5,
2,die,78,male,yes,yes,yes,yes,0.7,96.0,4.0,
3,die,31,male,,yes,yes,yes,0.7,46.0,4.0,80.0
4,die,34,male,yes,yes,yes,yes,1.0,,4.0,


In [5]:
data.tail(n=5)

Unnamed: 0,Class,Age,Sex,Steroid,Anaroxia,Spleen Pabable,Spiders,Bilirubin,Alk Phospat,Albumin,Protime
150,life,46,male,yes,no,yes,no,7.6,,3.3,50.0
151,die,44,male,yes,yes,yes,yes,0.9,126.0,4.3,
152,die,61,male,no,yes,yes,no,0.8,75.0,4.1,
153,die,53,female,no,yes,no,no,1.5,81.0,4.1,48.0
154,life,43,male,yes,yes,no,no,1.2,100.0,3.1,42.0


In [6]:
data.describe() #descriptive statistics

Unnamed: 0,Age,Bilirubin,Alk Phospat,Albumin,Protime
count,155.0,149.0,126.0,139.0,88.0
mean,41.2,1.427517,105.325397,3.817266,61.852273
std,12.565878,1.212149,51.508109,0.651523,22.875244
min,7.0,0.3,26.0,2.1,0.0
25%,32.0,0.7,74.25,3.4,46.0
50%,39.0,1.0,85.0,4.0,61.0
75%,50.0,1.5,132.25,4.2,76.25
max,78.0,8.0,295.0,6.4,100.0


In [7]:
data['Sex'].value_counts()

male      139
female     16
Name: Sex, dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 11 columns):
Class             155 non-null object
Age               155 non-null int64
Sex               155 non-null object
Steroid           154 non-null object
Anaroxia          154 non-null object
Spleen Pabable    150 non-null object
Spiders           150 non-null object
Bilirubin         149 non-null float64
Alk Phospat       126 non-null float64
Albumin           139 non-null float64
Protime           88 non-null float64
dtypes: float64(4), int64(1), object(6)
memory usage: 13.4+ KB


## Imputation Missing Value

In [9]:
np.sum(data.isnull()) #number of NA for each variable

Class              0
Age                0
Sex                0
Steroid            1
Anaroxia           1
Spleen Pabable     5
Spiders            5
Bilirubin          6
Alk Phospat       29
Albumin           16
Protime           67
dtype: int64

In [11]:
data.isnull().sum()

Class              0
Age                0
Sex                0
Steroid            1
Anaroxia           1
Spleen Pabable     5
Spiders            5
Bilirubin          6
Alk Phospat       29
Albumin           16
Protime           67
dtype: int64

In [12]:
data.isnull().sum().sum()

130

In [10]:
#imputasi missing value data numerik
data['Bilirubin'] = data['Bilirubin'].fillna((data['Bilirubin'].mean())) 
data['Albumin'] = data['Albumin'].fillna((data['Albumin'].median()))
data.isnull().any()

Class             False
Age               False
Sex               False
Steroid            True
Anaroxia           True
Spleen Pabable     True
Spiders            True
Bilirubin         False
Alk Phospat        True
Albumin           False
Protime            True
dtype: bool

In [11]:
# imputasi untuk data numerik yang lain
data['Protime']=data['Protime'].fillna(99)
data['Alk Phospat']=data['Alk Phospat'].fillna(111)
data.isnull().any()

Class             False
Age               False
Sex               False
Steroid            True
Anaroxia           True
Spleen Pabable     True
Spiders            True
Bilirubin         False
Alk Phospat       False
Albumin           False
Protime           False
dtype: bool

In [12]:
#imputasi data kategorik dengan modus
data['Spiders'] = data.fillna(data['Spiders'].value_counts().index[0])
data.isnull().any()

Class             False
Age               False
Sex               False
Steroid            True
Anaroxia           True
Spleen Pabable     True
Spiders           False
Bilirubin         False
Alk Phospat       False
Albumin           False
Protime           False
dtype: bool

In [13]:
#imputasi data kategorik dengan modus sesuai kolom masing2
data = data.apply(lambda x:x.fillna(x.value_counts().index[0]))
data.isnull().any()

Class             False
Age               False
Sex               False
Steroid           False
Anaroxia          False
Spleen Pabable    False
Spiders           False
Bilirubin         False
Alk Phospat       False
Albumin           False
Protime           False
dtype: bool

## Encode Labels

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# Encode untuk Sex, Steroid, Anaroxia, Spleen Pabable, Spiders tanpa membuat kolom baru
data['Type'] = LabelEncoder().fit_transform(data['Class'])
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])
data['Steroid'] = LabelEncoder().fit_transform(data['Steroid'])
data['Anaroxia'] = LabelEncoder().fit_transform(data['Anaroxia'])
data['Spleen Pabable'] = LabelEncoder().fit_transform(data['Spleen Pabable'])
data['Spiders'] = LabelEncoder().fit_transform(data['Spiders'])
data.head()

Unnamed: 0,Class,Age,Sex,Steroid,Anaroxia,Spleen Pabable,Spiders,Bilirubin,Alk Phospat,Albumin,Protime,Type
0,die,30,0,0,1,1,0,1.0,85.0,4.0,99.0,0
1,die,50,1,0,1,1,0,0.9,135.0,3.5,99.0,0
2,die,78,1,1,1,1,0,0.7,96.0,4.0,99.0,0
3,die,31,1,1,1,1,0,0.7,46.0,4.0,80.0,0
4,die,34,1,1,1,1,0,1.0,111.0,4.0,99.0,0


## Standardize

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
data['Age'] = StandardScaler().fit(data[['Age']]).transform(data[['Age']])
data['Bilirubin'] = StandardScaler().fit(data[['Bilirubin']]).transform(data[['Bilirubin']])
data['Alk Phospat'] = StandardScaler().fit(data[['Alk Phospat']]).transform(data[['Alk Phospat']])
data['Albumin'] = StandardScaler().fit(data[['Albumin']]).transform(data[['Albumin']])
data['Protime'] = StandardScaler().fit(data[['Protime']]).transform(data[['Protime']])

# Normalize

In [18]:
from sklearn.preprocessing import Normalizer 


In [19]:
data['Age'] = Normalizer().fit(data[['Age']]).transform(data[['Age']])
data['Bilirubin'] = Normalizer().fit(data[['Bilirubin']]).transform(data[['Bilirubin']])
data['Alk Phospat'] = Normalizer().fit(data[['Alk Phospat']]).transform(data[['Alk Phospat']])
data['Albumin'] = Normalizer().fit(data[['Albumin']]).transform(data[['Albumin']])
data['Protime'] = Normalizer().fit(data[['Protime']]).transform(data[['Protime']])

In [20]:
data.head()

Unnamed: 0,Class,Age,Sex,Steroid,Anaroxia,Spleen Pabable,Spiders,Bilirubin,Alk Phospat,Albumin,Protime,Type
0,die,-1.0,0,0,1,1,0,-1.0,-1.0,1.0,1.0,0
1,die,1.0,1,0,1,1,0,-1.0,1.0,-1.0,1.0,0
2,die,1.0,1,1,1,1,0,-1.0,-1.0,1.0,1.0,0
3,die,-1.0,1,1,1,1,0,-1.0,-1.0,1.0,1.0,0
4,die,-1.0,1,1,1,1,0,-1.0,1.0,1.0,1.0,0
