# **LATIHAN PREPROCESSING DATA DENGAN MISSING VALUES**

**Import Library**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the job-placement CSV into a DataFrame. The path is relative, so the
# file must be in the notebook's working directory.
dataset = pd.read_csv('Job_Placement_Data.csv')

In [3]:
# Preview the first five rows to check the column names and value types.
print(dataset.head())

  gender  ssc_percentage ssc_board  hsc_percentage hsc_board hsc_subject  \
0      M           67.00    Others           91.00    Others    Commerce   
1      M           79.33   Central           78.33    Others     Science   
2      M           65.00   Central             NaN   Central        Arts   
3      M           56.00   Central           52.00   Central     Science   
4      M           85.80   Central           73.60   Central    Commerce   

   degree_percentage undergrad_degree work_experience  emp_test_percentage  \
0              58.00         Sci&Tech              No                 55.0   
1              77.48         Sci&Tech             Yes                 86.5   
2              64.00        Comm&Mgmt              No                 75.0   
3              52.00         Sci&Tech              No                 66.0   
4              73.30        Comm&Mgmt              No                 96.8   

  specialisation  mba_percent      status  
0         Mkt&HR        58.80 

In [4]:
# Separate the features (X) from the target (y): every column except the last
# one is a feature, and the last column is the target.
features_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = features_frame.to_numpy()
y = target_series.to_numpy()

In [5]:
# Preview the raw feature matrix (all columns except the last one).
print(X)

[['M' 67.0 'Others' ... 55.0 'Mkt&HR' 58.8]
 ['M' 79.33 'Central' ... 86.5 'Mkt&Fin' 66.28]
 ['M' 65.0 'Central' ... 75.0 'Mkt&Fin' 57.8]
 ...
 ['F' nan 'Others' ... 59.32 'Mkt&HR' 69.71]
 ['F' 67.0 'Central' ... 88.0 'Mkt&HR' 71.96]
 ['M' 69.0 'Others' ... 73.0 'Mkt&HR' 55.8]]


In [6]:
# Preview the raw target vector (the last column of the dataset).
print(y)

['Placed' 'Placed' 'Placed' 'Not Placed' 'Placed' 'Not Placed'
 'Not Placed' 'Placed' 'Placed' 'Not Placed' 'Placed' 'Placed'
 'Not Placed' 'Placed' 'Not Placed' 'Placed' 'Placed' 'Not Placed'
 'Not Placed' 'Placed' 'Placed' 'Placed' 'Placed' 'Placed' 'Placed'
 'Not Placed' 'Placed' 'Placed' 'Placed' 'Not Placed' 'Placed'
 'Not Placed' 'Placed' 'Placed' 'Not Placed' 'Placed' 'Not Placed'
 'Placed' 'Placed' 'Placed' 'Placed' 'Not Placed' 'Not Placed' 'Placed'
 'Placed' 'Not Placed' 'Not Placed' 'Placed' 'Placed' 'Not Placed'
 'Placed' 'Not Placed' 'Not Placed' 'Placed' 'Placed' 'Placed' 'Placed'
 'Placed' 'Placed' 'Placed' 'Placed' 'Placed' 'Placed' 'Not Placed'
 'Placed' 'Not Placed' 'Placed' 'Placed' 'Not Placed' 'Placed' 'Placed'
 'Placed' 'Placed' 'Placed' 'Placed' 'Not Placed' 'Placed' 'Placed'
 'Placed' 'Not Placed' 'Placed' 'Placed' 'Not Placed' 'Placed' 'Placed'
 'Placed' 'Placed' 'Not Placed' 'Placed' 'Placed' 'Placed' 'Not Placed'
 'Placed' 'Not Placed' 'Placed' 'Placed' 'Plac

In [7]:
# DataFrame.info() writes its summary (row count, per-column non-null counts
# and dtypes, memory usage) straight to stdout and returns None. It is called
# directly because wrapping it in print() adds a stray "None" line to the output.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               200 non-null    object 
 1   ssc_percentage       192 non-null    float64
 2   ssc_board            200 non-null    object 
 3   hsc_percentage       192 non-null    float64
 4   hsc_board            200 non-null    object 
 5   hsc_subject          200 non-null    object 
 6   degree_percentage    192 non-null    float64
 7   undergrad_degree     200 non-null    object 
 8   work_experience      200 non-null    object 
 9   emp_test_percentage  192 non-null    float64
 10  specialisation       200 non-null    object 
 11  mba_percent          192 non-null    float64
 12  status               200 non-null    object 
dtypes: float64(5), object(8)
memory usage: 20.4+ KB
None


# Menghilangkan Missing Value

In [8]:
# Count the missing entries in each column: build the boolean null mask first,
# then sum it column by column.
null_mask = dataset.isnull()
print(null_mask.sum())

gender                 0
ssc_percentage         8
ssc_board              0
hsc_percentage         8
hsc_board              0
hsc_subject            0
degree_percentage      8
undergrad_degree       0
work_experience        0
emp_test_percentage    8
specialisation         0
mba_percent            8
status                 0
dtype: int64


In [9]:
from sklearn.impute import SimpleImputer

# Positions in X of the columns that hold the percentage features, which are
# the only columns reported with missing values (8 NaN each).
kolom = (1, 3, 6, 9, 11)

# Take the numeric sub-matrix once; fancy indexing returns a copy, so the
# imputed values have to be written back into X explicitly.
numeric_block = X[:, kolom]

# Replace every NaN with the mean of its own column.
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows contribute to the imputed
# means. Fit on the training rows only if strict leakage avoidance is needed.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(numeric_block)
X[:, kolom] = imputer.transform(numeric_block)

In [10]:
# Show the result: the feature matrix after the imputation step.
print(X)

[['M' 67.0 'Others' ... 55.0 'Mkt&HR' 58.8]
 ['M' 79.33 'Central' ... 86.5 'Mkt&Fin' 66.28]
 ['M' 65.0 'Central' ... 75.0 'Mkt&Fin' 57.8]
 ...
 ['F' 67.46333333333332 'Others' ... 59.32 'Mkt&HR' 69.71]
 ['F' 67.0 'Central' ... 88.0 'Mkt&HR' 71.96]
 ['M' 69.0 'Others' ... 73.0 'Mkt&HR' 55.8]]


# Encoding Data Kategori (Atribut)

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Positions in X of the string-valued columns: gender, ssc_board, hsc_board,
# hsc_subject, undergrad_degree, work_experience and specialisation.
categorical_columns = [0, 2, 4, 5, 7, 8, 10]

# One-hot encode the string columns. remainder='passthrough' keeps the other
# (numeric) columns, which are appended after the encoded ones.
# sparse_threshold=0 forces a dense result: with the default threshold the
# transformer may return a SciPy sparse matrix when the stacked output is
# sparse enough, and np.array() would then wrap that matrix in a 0-d object
# array instead of producing a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [12]:
# Display X after one-hot encoding: the 0/1 indicator columns come first,
# followed by the passthrough numeric columns.
print(X)

[[0.0 1.0 0.0 ... 58.0 55.0 58.8]
 [0.0 1.0 1.0 ... 77.48 86.5 66.28]
 [0.0 1.0 1.0 ... 64.0 75.0 57.8]
 ...
 [1.0 0.0 0.0 ... 91.0 59.32 69.71]
 [1.0 0.0 1.0 ... 65.0 88.0 71.96]
 [0.0 1.0 0.0 ... 57.0 73.0 55.8]]


# Encoding Data Kategori (Label)

In [13]:
from sklearn.preprocessing import LabelEncoder

# Encode the "status" target as integers with LabelEncoder. In the printed
# result 'Not Placed' becomes 0 and 'Placed' becomes 1.
# NOTE: y is overwritten, so re-running this cell re-encodes the integers
# rather than the original strings.
le = LabelEncoder()
encoded_target = le.fit_transform(y)
y = encoded_target

In [14]:
# Display the integer-encoded target vector.
print(y)

[1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 0 1 0
 1 1 1 1 0 0 1 1 0 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1
 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 0 1 1 0 1
 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 1 0 1 0 1 1 1 0 1 0 0 1 0
 1 0 1 0 0 0 1 1 1 0 1 1 1 0 1]


# Membagi dataset ke dalam training set dan test set

In [15]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for testing:
#   test_size=0.2   -> 20% of the rows go to the test set
#   random_state=0  -> the same split is produced on every run
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Display the training features produced by the split.
print(X_train)

[[1.0 0.0 1.0 ... 72.0 94.0 67.13]
 [0.0 1.0 0.0 ... 66.0 68.92 58.46]
 [0.0 1.0 0.0 ... 66.0 94.0 57.55]
 ...
 [0.0 1.0 0.0 ... 73.0 80.0 67.05]
 [0.0 1.0 1.0 ... 57.0 78.0 54.55]
 [0.0 1.0 0.0 ... 56.0 84.0 52.64]]


In [17]:
# Display the test features produced by the split.
print(X_test)

[[1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 63.0
  66.0 66.57822916666667 68.0 64.08]
 [1.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 63.4
  67.2 60.0 58.06 69.28]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 82.0
  66.34531249999999 83.0 80.0 73.52]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 69.0
  73.0 65.0 70.0 57.31]
 [1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 73.0
  97.0 79.0 89.0 70.81]
 [0.0 1.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 52.0
  65.0 57.0 75.0 59.81]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 55.0
  49.8 67.25 55.0 51.58]
 [0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0
  67.46333333333332 63.0 66.0 85.0 55.14]
 [1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 47.0
  55.0 65.0 62.0 65.04]
 [1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 75.4
  60.5 84.0 98.0 62.4231250000000

In [18]:
# Display the training labels produced by the split.
print(y_train)

[1 1 1 1 0 0 1 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0
 0 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1
 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 1
 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1
 1 1 0 0 1 0 1 1 1 1 1 1]


In [19]:
# Display the test labels produced by the split.
print(y_test)

[0 0 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1
 1 1 0]


# Feature Scaling

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardise only the numeric features, i.e. everything from the 17th column
# (index 16) onward; the columns before that are the one-hot indicators.
numeric_part = slice(16, None)

sc = StandardScaler()
# Fit the scaler on the training rows only, then reuse its statistics for the
# test rows.
# NOTE: both arrays are overwritten in place, so running this cell a second
# time would scale values that are already scaled.
X_train[:, numeric_part] = sc.fit_transform(X_train[:, numeric_part])
X_test[:, numeric_part] = sc.transform(X_test[:, numeric_part])

In [21]:
# Display the training features after scaling of the numeric columns.
print(X_train)

[[1.0 0.0 1.0 ... 0.8088117784876246 1.7435019236066194
  0.8919494861898679]
 [0.0 1.0 0.0 ... -0.018678326450726764 -0.21358437770494
  -0.6777831617387449]
 [0.0 1.0 0.0 ... -0.018678326450726764 1.7435019236066194
  -0.8425417210945747]
 ...
 [0.0 1.0 0.0 ... 0.9467267959773499 0.6510295066065462
  0.8774652172355096]
 [0.0 1.0 1.0 ... -1.259913483858254 0.4949620184636787
  -1.3857018068830225]
 [0.0 1.0 0.0 ... -1.3978285013479792 0.9631644828922814
  -1.7315137281683335]]


In [22]:
# Display the test features after scaling of the numeric columns.
print(X_test)

[[1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0
  -0.3863711694499753 -0.020844054443394024 0.06106815918317679
  -0.2853754222506592 0.33973673230494666]
 [1.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0
  -0.34694072007829657 0.08730050648771334 -0.8461684313890783
  -1.061030838320711 1.2812142143382566]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0
  1.4865751757047705 0.010275669470374312 2.3258769708746025
  0.6510295066065462 2.0488804689192617]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0
  0.2050855711252076 0.6099992176547308 -0.15659334394045202
  -0.12930793410779162 -0.8859945279576495]
 [1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0
  0.5993900648419962 2.772890436276873 1.7742169009157014
  1.3533332032494503 1.558225858090365]
 [0.0 1.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0
  -1.470708527171144 -0.11096452188598328 -1.259913483858254
  0.2608607862493773 -0.433