### SKLEARN PIPELINE

In [44]:
"""
A Scikit-learn Pipeline is a tool for chaining data-processing steps and a
machine-learning model, in order, inside a single object.

SYNTAX:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('step1', transformer1),
    ('step2', transformer2),
    ('model', estimator)
])

"""

import pandas as pd

# Small toy dataset. None marks a missing entry in the age, income and
# gender columns; the purchased column is complete.
data = dict(
    age=[25, 30, 35, None, 40, 50, None, 60],
    income=[40000, 50000, None, 60000, 70000, None, 80000, 90000],
    gender=['male', 'female', 'female', 'male', None, 'female', 'male', 'female'],
    purchased=[0, 1, 0, 1, 0, 1, 0, 1],
)

# One column per dict key, one row per list position.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,age,income,gender,purchased
0,25.0,40000.0,male,0
1,30.0,50000.0,female,1
2,35.0,,female,0
3,,60000.0,male,1
4,40.0,70000.0,,0
5,50.0,,female,1
6,,80000.0,male,0
7,60.0,90000.0,female,1


#### 1. PREPROCESSING WITHOUT PIPELINE (MANUAL)

In [45]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import numpy as np

# Separate the feature matrix (X) from the target vector (y).
feature_columns = ['age', 'income', 'gender']
target_column = 'purchased'
X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the raw (not yet imputed) splits: train first, then test.
print('BEFORE IMPUTER: \n')
print(X_train, X_test, sep=' \n\n ')
print('\n')
print('TARGET: \n')
print(y_train, y_test, sep=' \n\n ')

BEFORE IMPUTER: 

    age   income  gender
0  25.0  40000.0    male
7  60.0  90000.0  female
2  35.0      NaN  female
4  40.0  70000.0    None
3   NaN  60000.0    male
6   NaN  80000.0    male 

     age   income  gender
1  30.0  50000.0  female
5  50.0      NaN  female


TARGET: 

0    0
7    1
2    0
4    0
3    1
6    0
Name: purchased, dtype: int64 

 1    1
5    1
Name: purchased, dtype: int64


In [51]:
# 1. Impute the numeric features with the column mean.
# X_train / X_test were produced by indexing into the original frame, so
# assigning columns into them directly is the pattern that triggers pandas'
# SettingWithCopyWarning. Take explicit copies first so the writes below
# land in independent frames. Re-running this cell stays safe: the copies
# are simply re-taken and the already-filled values are left unchanged.
X_train = X_train.copy()
X_test = X_test.copy()

numeric_features = ['age', 'income']
imputer = SimpleImputer(strategy='mean')
# Fit on the training split only, then apply the same fitted imputer to the
# test split so no test-set statistics are used.
X_train[numeric_features] = imputer.fit_transform(X_train[numeric_features])
X_test[numeric_features] = imputer.transform(X_test[numeric_features])
print('AFTER IMPUTER: \n')
print(X_train, '\n\n', X_test)
print('\n')

# 2. One-hot encode the categorical feature.
# NOTE: the missing gender value is not imputed here; in the recorded output
# it shows up as a third one-hot column of its own.
categorical_features = ['gender']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = encoder.fit_transform(X_train[categorical_features])
X_test_cat = encoder.transform(X_test[categorical_features])
print('CATEGORICAL AFTER IMPUTER: \n')
print(X_train_cat, '\n\n', X_test_cat)
print('\n')

# Merge the numeric and one-hot columns back into a single array
# (numeric columns first, then the encoded gender columns).
X_train_processed = np.hstack((X_train[numeric_features].values, X_train_cat))
X_test_processed = np.hstack((X_test[numeric_features].values, X_test_cat))
print('DATA TRAIN AFTER MERGING X : \n')
print(X_train_processed, '\n\n', X_test_processed)
print('\n')

# 3. Scale the data.
# NOTE: the scaler is applied to the merged array, so the one-hot columns are
# standardized together with the numeric ones.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)
print('AFTER SCALING: \n')
print(X_train_scaled, '\n\n', X_test_scaled)
print('\n')

AFTER IMPUTER: 

    age   income  gender
0  25.0  40000.0    male
7  60.0  90000.0  female
2  35.0  68000.0  female
4  40.0  70000.0    None
3  40.0  60000.0    male
6  40.0  80000.0    male 

     age   income  gender
1  30.0  50000.0  female
5  50.0  68000.0  female


CATEGORICAL AFTER IMPUTER: 

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]] 

 [[1. 0. 0.]
 [1. 0. 0.]]


DATA TRAIN AFTER MERGING X : 

[[2.5e+01 4.0e+04 0.0e+00 1.0e+00 0.0e+00]
 [6.0e+01 9.0e+04 1.0e+00 0.0e+00 0.0e+00]
 [3.5e+01 6.8e+04 1.0e+00 0.0e+00 0.0e+00]
 [4.0e+01 7.0e+04 0.0e+00 0.0e+00 1.0e+00]
 [4.0e+01 6.0e+04 0.0e+00 1.0e+00 0.0e+00]
 [4.0e+01 8.0e+04 0.0e+00 1.0e+00 0.0e+00]] 

 [[3.0e+01 5.0e+04 1.0e+00 0.0e+00 0.0e+00]
 [5.0e+01 6.8e+04 1.0e+00 0.0e+00 0.0e+00]]


AFTER SCALING: 

[[-1.44115338 -1.78280071 -0.70710678  1.         -0.4472136 ]
 [ 1.92153785  1.40077199  1.41421356 -1.         -0.4472136 ]
 [-0.48038446  0.          1.41421356 -1.         -0.4472136 ]
 [ 0.   

#### 1. MODEL

In [52]:
# 4. Train a logistic-regression classifier on the scaled training features.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train_scaled, y_train)

# 5. Evaluate the model on the scaled test split.
accuracy = model.score(X_test_scaled, y_test)
print(f"Model Accuracy: {accuracy!s}")

# Predict labels for the test data.
predictions = model.predict(X_test_scaled)
print(f"Predictions: {predictions!s}")

Model Accuracy: 0.5
Predictions: [0 1]
