In [43]:
import pandas as pd
import numpy as np

import sklearn

# data splitting train dan test
from sklearn.model_selection import train_test_split
# normalisasi
from sklearn.preprocessing import MinMaxScaler
# standarisasi
from sklearn.preprocessing import StandardScaler
# Imputer untuk data cleaning, 
from sklearn.impute import SimpleImputer
# untuk encoding
from sklearn.preprocessing import OneHotEncoder

In [44]:
df = pd.read_csv('employee_data.csv')
df.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure
0,221.0,engineering,,0.932868,4.0,,low,0.829896,Left,5.0
1,232.0,support,,,,,low,0.834544,Employed,2.0
2,184.0,sales,,0.78883,3.0,,medium,0.834988,Employed,3.0
3,206.0,sales,,0.575688,4.0,,low,0.424764,Employed,2.0
4,249.0,sales,,0.845217,3.0,,low,0.779043,Employed,3.0


### Data Split

Sebelum mensplit data, Tentukan x dan y.

- x merupakan data frame tanpa label.
- y adalah labelnya.

In [45]:
x = df.drop('status', 1)
y = df.status

  """Entry point for launching an IPython kernel.


Data Splitting test nya 30% dari total dataset

In [46]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [47]:
# menghitung panjang/jumlah data pada x_test
print(len(x_test))
print(len(x_train))

4275
9974


### Normalisasi

In [48]:
min_max_scaler = MinMaxScaler()
df_copy = df.filter(
  [
  'avg_monthly_hrs', 
  'last_evaluation', 
  'n_projects', 
  'recently_promoted', 
  'satisfaction', 
  'tenure'
  ]
)
scaled_data = min_max_scaler.fit_transform(df_copy)
scaled_data

array([[0.65900383, 0.90182821, 0.5       ,        nan, 0.82279776,
        0.375     ],
       [0.70114943,        nan,        nan,        nan, 0.8276393 ,
        0.        ],
       [0.51724138, 0.69119286, 0.33333333,        nan, 0.82810164,
        0.125     ],
       ...,
       [0.70114943, 0.77716957,        nan,        nan, 0.89469927,
        0.375     ],
       [0.31034483, 0.64491448, 0.5       ,        nan, 0.62633584,
        0.125     ],
       [0.42145594, 0.38396854,        nan,        nan, 0.80087379,
        0.125     ]])

### Standarisasi

In [49]:
standard_scaler = StandardScaler()
df_copy = df.filter(
  [
  'avg_monthly_hrs', 
  'last_evaluation', 
  'n_projects', 
  'recently_promoted', 
  'satisfaction', 
  'tenure'
  ]
)
scaled_data_standard = standard_scaler.fit_transform(df_copy)
scaled_data_standard

array([[ 0.4154474 ,  1.2388591 ,  0.18015888,         nan,  0.83280317,
         1.02877285],
       [ 0.63118137,         nan,         nan,         nan,  0.85135706,
        -1.02468463],
       [-0.31020322,  0.40642745, -0.61828277,         nan,  0.85312884,
        -0.3401988 ],
       ...,
       [ 0.63118137,  0.7462077 ,         nan,         nan,  1.108346  ,
         1.02877285],
       [-1.36926088,  0.22353512,  0.18015888,         nan,  0.07991679,
        -0.3401988 ],
       [-0.80050769, -0.80772416,         nan,         nan,  0.74878561,
        -0.3401988 ]])

### Data Cleaning

Berikut ini adalah jumlah null di dalam dataset dari setiap attribut

In [50]:
df.isna().sum()

avg_monthly_hrs         79
department             708
filed_complaint      12186
last_evaluation       1535
n_projects              67
recently_promoted    13949
salary                   0
satisfaction           182
status                   0
tenure                 185
dtype: int64

- Gunakan strategy Median untuk nilai Integer.
- Gunakan strategy Mean untuk nilai Desimal.
- Gunakan strategy Modus untuk nilai tulisan 

In [51]:
medianImputer = SimpleImputer(strategy='median')
meanImputer = SimpleImputer(strategy='mean')
modusImputer = SimpleImputer(strategy='most_frequent')
df["avg_monthly_hrs"] = medianImputer.fit_transform(df[["avg_monthly_hrs"]])
df["department"] = modusImputer.fit_transform(df[["department"]])
df["filed_complaint"] = medianImputer.fit_transform(df[["filed_complaint"]])
df["last_evaluation"] = meanImputer.fit_transform(df[["last_evaluation"]])
df["n_projects"] = medianImputer.fit_transform(df[["n_projects"]])
df["recently_promoted"] = medianImputer.fit_transform(df[["recently_promoted"]])
df["satisfaction"] = meanImputer.fit_transform(df[["satisfaction"]])
df["tenure"] = medianImputer.fit_transform(df[["tenure"]])

berikut ini setelah data di cleaning

In [52]:
df.isna().sum()

avg_monthly_hrs      0
department           0
filed_complaint      0
last_evaluation      0
n_projects           0
recently_promoted    0
salary               0
satisfaction         0
status               0
tenure               0
dtype: int64

Bersihkan data duplikat


In [53]:
df[df.duplicated()].sum()

avg_monthly_hrs                                                 6449.0
department           supportadminmarketingsalessalessalessalestempt...
filed_complaint                                                   48.0
last_evaluation                                               34.63956
n_projects                                                       109.0
recently_promoted                                                 48.0
salary               mediumlowmediumlowlowhighmediumlowlowlowmedium...
satisfaction                                                 29.320551
status               EmployedEmployedEmployedLeftLeftEmployedEmploy...
tenure                                                           141.0
dtype: object

In [54]:
df.drop_duplicates(inplace=True)

berikut setelah data duplikat dibersihkan

In [55]:
df[df.duplicated()].sum()

avg_monthly_hrs      0.0
department           0.0
filed_complaint      0.0
last_evaluation      0.0
n_projects           0.0
recently_promoted    0.0
salary               0.0
satisfaction         0.0
status               0.0
tenure               0.0
dtype: float64

mengubah tipe data dari attribut

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14201 entries, 0 to 14248
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   avg_monthly_hrs    14201 non-null  float64
 1   department         14201 non-null  object 
 2   filed_complaint    14201 non-null  float64
 3   last_evaluation    14201 non-null  float64
 4   n_projects         14201 non-null  float64
 5   recently_promoted  14201 non-null  float64
 6   salary             14201 non-null  object 
 7   satisfaction       14201 non-null  float64
 8   status             14201 non-null  object 
 9   tenure             14201 non-null  float64
dtypes: float64(7), object(3)
memory usage: 1.2+ MB


In [57]:
df['avg_monthly_hrs'] = df['avg_monthly_hrs'].astype('int64')
df['tenure'] = df['tenure'].astype('int64')

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14201 entries, 0 to 14248
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   avg_monthly_hrs    14201 non-null  int64  
 1   department         14201 non-null  object 
 2   filed_complaint    14201 non-null  float64
 3   last_evaluation    14201 non-null  float64
 4   n_projects         14201 non-null  float64
 5   recently_promoted  14201 non-null  float64
 6   salary             14201 non-null  object 
 7   satisfaction       14201 non-null  float64
 8   status             14201 non-null  object 
 9   tenure             14201 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 1.2+ MB


### Encoding

mengubah kategori dari tulisan menjadi angka

In [59]:
one_hot_encoder = OneHotEncoder(sparse=False)

departmentEnc = one_hot_encoder.fit_transform(df[["department"]])
salaryEnc = one_hot_encoder.fit_transform(df[["salary"]])
statusEnc = one_hot_encoder.fit_transform(df[["status"]])

encoded_df = pd.DataFrame([[departmentEnc, salaryEnc, statusEnc]])
df.join(encoded_df)
df.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure
0,221,engineering,1.0,0.932868,4.0,1.0,low,0.829896,Left,5
1,232,support,1.0,0.718505,4.0,1.0,low,0.834544,Employed,2
2,184,sales,1.0,0.78883,3.0,1.0,medium,0.834988,Employed,3
3,206,sales,1.0,0.575688,4.0,1.0,low,0.424764,Employed,2
4,249,sales,1.0,0.845217,3.0,1.0,low,0.779043,Employed,3
