# Import Library

In [None]:
import pandas as pd

import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import classification_report


In [None]:
import warnings
warnings.filterwarnings('ignore')

# Dataset

In [None]:
# Import Dataset from Kaggle with API
# NOTE: this cell only runs on Google Colab (it imports google.colab and uses `!` shell commands).

# Install Kaggle library (-q keeps pip's output quiet)
!pip -q install kaggle

# Upload the Kaggle API JSON file (kaggle.json) through Colab's upload helper
from google.colab import files
files.upload()

# Move the uploaded JSON file to the required directory and set permissions
# (mode 600 = readable/writable by the owner only)
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset from Kaggle (saved as stunting-dataset.zip in the working directory)
!kaggle datasets download -d muhtarom/stunting-dataset

# Unzip the downloaded dataset (-q suppresses the file listing)
!unzip -q 'stunting-dataset.zip'

Saving kaggle.json to kaggle.json
Downloading stunting-dataset.zip to /content
  0% 0.00/44.4k [00:00<?, ?B/s]
100% 44.4k/44.4k [00:00<00:00, 60.5MB/s]


In [None]:
# Load the dataset CSV (presumably the file extracted from the Kaggle archive above)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/content/Stunting_Dataset.csv')
df.head(n=5)

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
0,Male,17,3.0,49,10.0,72.2,No,No
1,Female,11,2.9,49,2.9,65.0,No,Yes
2,Male,16,2.9,49,8.5,72.2,No,Yes
3,Male,31,2.8,49,6.4,63.0,No,Yes
4,Male,15,3.1,49,10.5,49.0,No,Yes


In [None]:
# Summarise column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Gender         10000 non-null  object 
 1   Age            10000 non-null  int64  
 2   Birth Weight   10000 non-null  float64
 3   Birth Length   10000 non-null  int64  
 4   Body Weight    10000 non-null  float64
 5   Body Length    10000 non-null  float64
 6   Breastfeeding  10000 non-null  object 
 7   Stunting       10000 non-null  object 
dtypes: float64(3), int64(2), object(3)
memory usage: 625.1+ KB


Terdapat 7 feature:
- Gender
- Age
- Birth Weight (Kilogram)
- Birth Length (Centimeter)
- Body Weight
- Body Length
- Breastfeeding

dan satu label, yakni Stunting:

- Yes = bayi positif stunting
- No  = bayi negatif stunting

# Data Preparation

In [None]:
# Count the rows per Stunting class as a two-column frame: class label, row count.
stunting_counts = (
    df["Stunting"]
    .value_counts()
    .rename_axis('Stunting')
    .reset_index(name='Count')
)

# Pie chart of the class proportions in the dataset as loaded.
fig = px.pie(
    stunting_counts,
    names='Stunting',
    values='Count',
    title='How Much Stunting from dataset',
)
fig.show()

In [None]:
# Convert the categorical columns to numeric codes.
#
# Series.map turns every value that is missing from the mapping into NaN. Two guards
# therefore protect the frame:
#   * a column that is already numeric is skipped, so re-running this cell does not
#     wipe the previously encoded values to NaN;
#   * a value outside the mapping raises instead of silently becoming NaN.
CATEGORY_CODES = {
    'Gender': {"Male": 1, "Female": 0},
    'Breastfeeding': {"Yes": 1, "No": 0},
    'Stunting': {"Yes": 1, "No": 0},
}

for column, codes in CATEGORY_CODES.items():
    if pd.api.types.is_numeric_dtype(df[column]):
        # Already encoded by an earlier run of this cell.
        continue

    encoded = df[column].map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, column].astype(str).unique())
        raise ValueError(f"Unmapped values in column {column!r}: {unexpected}")

    df[column] = encoded

In [None]:
# Preview the frame again to confirm Gender, Breastfeeding and Stunting now hold 0/1 codes.
df.head()

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
0,1,17,3.0,49,10.0,72.2,0,0
1,0,11,2.9,49,2.9,65.0,0,1
2,1,16,2.9,49,8.5,72.2,0,1
3,1,31,2.8,49,6.4,63.0,0,1
4,1,15,3.1,49,10.5,49.0,0,1


In [None]:
# Separate the target vector from the feature matrix.
# Besides the target itself, the Gender column is also left out of the features.
Y = df['Stunting']
X = df.drop(columns=['Stunting', 'Gender'])

In [None]:
# Oversample with SMOTE (fixed seed for reproducibility); X and Y are overwritten
# with the resampled data.
# NOTE(review): this runs before train_test_split further down, so synthetic samples
# generated from the full dataset end up in the test set and the reported test scores
# are likely optimistic. Consider splitting first and resampling the training split only.
smote = SMOTE(random_state=42)
X, Y = smote.fit_resample(X, Y)

In [None]:
# Recount the rows per class on the resampled target: class label, row count.
stunting_counts = (
    Y.value_counts()
    .rename_axis('Stunting')
    .reset_index(name='Count')
)

# Pie chart of the class proportions after resampling.
fig = px.pie(
    stunting_counts,
    names='Stunting',
    values='Count',
    title='How Much Stunting from dataset',
)
fig.show()

In [None]:
# Dump the resampled features and target, each followed by a blank line.
for resampled in (X, Y):
    print(resampled, '\n')

       Age  Birth Weight  Birth Length  Body Weight  Body Length  \
0       17      3.000000            49         10.0    72.200000   
1       11      2.900000            49          2.9    65.000000   
2       16      2.900000            49          8.5    72.200000   
3       31      2.800000            49          6.4    63.000000   
4       15      3.100000            49         10.5    49.000000   
...    ...           ...           ...          ...          ...   
15905   10      2.871481            50          2.9    75.287027   
15906    6      2.300000            50          7.7    68.300000   
15907   10      2.300000            50          6.4    68.300000   
15908   12      2.300000            50          6.4    68.300000   
15909    6      2.300000            50          6.4    68.300000   

       Breastfeeding  
0                  0  
1                  0  
2                  0  
3                  0  
4                  0  
...              ...  
15905              0  

In [None]:
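# Disabled alternative: standardise the features with StandardScaler
# (the next cell uses MinMaxScaler instead).
#scaler = StandardScaler()
#scaler = scaler.fit(X)
#X = scaler.transform(X)

# X after scaling
#print(X)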

In [None]:
# Fit a min-max scaler on the feature frame and replace X with its scaled version.
# `scaler` is kept because the new-sample prediction cell reuses it.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test split,
# so the test rows influence the scaling parameters.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

# Scaled X
print(X)

[[0.26190476 0.90909091 0.5        0.93421053 0.53089245 0.        ]
 [0.11904762 0.81818182 0.5        0.         0.36613272 0.        ]
 [0.23809524 0.81818182 0.5        0.73684211 0.53089245 0.        ]
 ...
 [0.0952381  0.27272727 1.         0.46052632 0.4416476  0.        ]
 [0.14285714 0.27272727 1.         0.46052632 0.4416476  0.        ]
 [0.         0.27272727 1.         0.46052632 0.4416476  0.        ]]


In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# Modelling

In [None]:
# Base k-NN estimator used as the template for every grid-search candidate
knn = KNeighborsClassifier()

# Hyperparameter grid to search.
# `p` is the power of the Minkowski metric only: minkowski with p=1 is the Manhattan
# distance and with p=2 the Euclidean distance. Crossing `p` with
# ['euclidean', 'manhattan', 'minkowski'] therefore evaluated each of the two distinct
# distances three times, so the grid lists the two distances once and omits `p`.
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'metric': ['euclidean', 'manhattan'],
}

# 5-fold cross-validated grid search scored by accuracy; n_jobs=-1 evaluates the
# candidates on all available cores
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found
best_params = grid_search.best_params_
print("Hyperparameter terbaik:", best_params)

# GridSearchCV (refit=True by default) has already retrained the best configuration on
# the whole training set, so reuse that estimator instead of rebuilding and refitting it
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)

# Evaluate the tuned model on the held-out test set
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Hyperparameter terbaik: {'algorithm': 'ball_tree', 'leaf_size': 20, 'metric': 'manhattan', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
Accuracy: 0.8340666247642992


In [None]:
# Train the final model with the hyperparameters reported by the grid search above
# (hard-coded here, so this cell does not need the search to be re-run).
model_KNN = KNeighborsClassifier(
    n_neighbors=3,
    weights='distance',
    algorithm='ball_tree',
    leaf_size=20,
    metric='manhattan',
    p=1,
)
model_KNN.fit(X_train, y_train)

# Evaluation

In [None]:
# Predictions on the training set and on the test set
y_pred_train = model_KNN.predict(X_train)
y_pred = model_KNN.predict(X_test)

# The three metrics reported for each split, in print order
scorers = (metrics.accuracy_score, metrics.precision_score, metrics.recall_score)
train_acc, train_prec, train_rec = (score(y_train, y_pred_train) for score in scorers)
test_acc, test_prec, test_rec = (score(y_test, y_pred) for score in scorers)

# Train set scores
print('Train Accuracy', train_acc)
print('Train Precission', train_prec)
print('Train Recall', train_rec, '\n')

# Test set scores
print('Test Accuracy', test_acc)
print('Test Precission', test_prec)
print('Test Recall', test_rec, '\n')


Train Accuracy 0.9643530573763132
Train Precission 0.9756641278097715
Train Recall 0.9517941283073578 

Test Accuracy 0.8340666247642992
Test Precission 0.835030549898167
Test Recall 0.8411981945014362 



In [None]:
# Display names for the two classes, in label order (0, 1)
nama_label = ['Tidak Stunting', 'Stunting']

# Heading, an empty line, then the per-class precision/recall/F1 table for the test set
print(
    'Classification Report Model dengan Tuning Hyperparameter :\n',
    classification_report(y_test, y_pred, target_names=nama_label),
    sep='\n',
)

Classification Report Model dengan Tuning Hyperparameter :

                precision    recall  f1-score   support

Tidak Stunting       0.83      0.83      0.83      2336
      Stunting       0.84      0.84      0.84      2437

      accuracy                           0.83      4773
     macro avg       0.83      0.83      0.83      4773
  weighted avg       0.83      0.83      0.83      4773



# Test Input Data Baru

In [None]:
# Predict a new sample

# Raw (unscaled) measurements of the new sample
Gender = float(1)  # not passed to the model: Gender was dropped from the features
Age = float(17)
Birth_Weight = float(3)
Birth_Length = float(49)
Body_Weight = float(10)
Body_Length = float(72.2)
Breastfeeding = float(0)

# Same column order as the feature frame the scaler and the model were fitted on
data_baru = [[Age, Birth_Weight, Birth_Length, Body_Weight, Body_Length, Breastfeeding]]

# Reuse the scaler exactly as it was fitted on the raw feature values.
# It must not be refitted on X_train: X_train is already scaled, so a refit makes the
# transform (almost) the identity and the raw sample would reach the model unscaled.
data_baru = scaler.transform(data_baru)

print(data_baru)

# predict returns a length-1 array; take its element explicitly, because converting a
# 1-d array straight to int is deprecated in recent NumPy releases
hasil_prediksi = int(model_KNN.predict(data_baru)[0])

# Print the predicted label and its meaning
print(hasil_prediksi)
if hasil_prediksi == 1:
  print('Stunting')
else:
  print('Tidak Stunting')

[[17.   3.  49.  10.  72.2  0. ]]
1
Stunting


In [None]:
# Re-display the first rows of the encoded frame; row 0 carries the same feature values
# as the sample used in the prediction cell.
df.head()

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
0,1,17,3.0,49,10.0,72.2,0,0
1,0,11,2.9,49,2.9,65.0,0,1
2,1,16,2.9,49,8.5,72.2,0,1
3,1,31,2.8,49,6.4,63.0,0,1
4,1,15,3.1,49,10.5,49.0,0,1


In [None]:
import joblib

# Persist the fitted k-NN model to disk.
# NOTE(review): only the classifier is saved. The model was trained on scaled features,
# so whoever loads this file also needs the fitted `scaler`, which is not saved here.
joblib.dump(value=model_KNN, filename='model_knn.pkl')

['model_knn.pkl']