## Import library

In [75]:
import numpy as np
import seaborn as sns
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

## Import Dataset

In [76]:
data_awal = pd.read_csv("Korean_demographics.csv")

In [77]:
data_awal.head(10)

Unnamed: 0,Date,Region,Birth,Birth_rate,Death,Death_rate,Divorce,Divorce_rate,Marriage,Marriage_rate,Natural_growth,Natural_growth_rate
0,1/1/2000,Busan,3752.0,11.61,1875.0,5.8,814.0,2.5,2435.0,7.5,1877.0,5.8
1,1/1/2000,Chungcheongbuk-do,1903.0,15.06,924.0,7.3,220.0,1.7,828.0,6.6,979.0,7.7
2,1/1/2000,Chungcheongnam-do,2398.0,14.75,1466.0,9.0,321.0,2.0,1055.0,6.5,932.0,5.7
3,1/1/2000,Daegu,3057.0,14.39,1117.0,5.3,422.0,2.0,1577.0,7.4,1940.0,9.1
4,1/1/2000,Daejeon,1859.0,16.08,565.0,4.9,280.0,2.4,868.0,7.5,1294.0,11.2
5,1/1/2000,Gangwon-do,1966.0,14.91,1067.0,8.1,304.0,2.3,817.0,6.2,899.0,6.8
6,1/1/2000,Gwangju,2159.0,18.77,606.0,5.3,212.0,1.8,932.0,8.1,1553.0,13.5
7,1/1/2000,Gyeonggi-do,13527.0,17.85,3770.0,5.0,1931.0,2.5,5759.0,7.6,9757.0,12.9
8,1/1/2000,Gyeongsangbuk-do,3362.0,14.14,2230.0,9.4,406.0,1.7,1691.0,7.1,1132.0,4.8
9,1/1/2000,Gyeongsangnam-do,3928.0,15.05,2125.0,8.1,577.0,2.2,2092.0,8.0,1803.0,6.9


In [78]:
data_awal.dtypes

Date                    object
Region                  object
Birth                  float64
Birth_rate             float64
Death                  float64
Death_rate             float64
Divorce                float64
Divorce_rate           float64
Marriage               float64
Marriage_rate          float64
Natural_growth         float64
Natural_growth_rate    float64
dtype: object

In [79]:
data_awal.isna().sum()

Date                     0
Region                   0
Birth                  144
Birth_rate             151
Death                  144
Death_rate             151
Divorce                144
Divorce_rate           151
Marriage               144
Marriage_rate          151
Natural_growth         144
Natural_growth_rate    151
dtype: int64

## Data Cleaning

In [80]:
# median untuk bilangan bulat
imputer = SimpleImputer(strategy='median')
data_awal['Birth'] = imputer.fit_transform(data_awal[['Birth']])
data_awal['Death'] = imputer.fit_transform(data_awal[['Death']])
data_awal['Divorce'] = imputer.fit_transform(data_awal[['Divorce']])
data_awal['Marriage'] = imputer.fit_transform(data_awal[['Marriage']])
data_awal['Natural_growth'] = imputer.fit_transform(data_awal[['Natural_growth']])

In [81]:
# mean untuk bilangan desimal
imputer = SimpleImputer(strategy='mean')
data_awal['Birth_rate'] = imputer.fit_transform(data_awal[['Birth_rate']])
data_awal['Death_rate'] = imputer.fit_transform(data_awal[['Death_rate']])
data_awal['Divorce_rate'] = imputer.fit_transform(data_awal[['Divorce_rate']])
data_awal['Marriage_rate'] = imputer.fit_transform(data_awal[['Marriage_rate']])  
data_awal['Natural_growth_rate'] = imputer.fit_transform(data_awal[['Natural_growth_rate']])  

In [82]:
data_awal.isna().sum()

Date                   0
Region                 0
Birth                  0
Birth_rate             0
Death                  0
Death_rate             0
Divorce                0
Divorce_rate           0
Marriage               0
Marriage_rate          0
Natural_growth         0
Natural_growth_rate    0
dtype: int64

In [83]:
data_awal.head()

Unnamed: 0,Date,Region,Birth,Birth_rate,Death,Death_rate,Divorce,Divorce_rate,Marriage,Marriage_rate,Natural_growth,Natural_growth_rate
0,1/1/2000,Busan,3752.0,11.61,1875.0,5.8,814.0,2.5,2435.0,7.5,1877.0,5.8
1,1/1/2000,Chungcheongbuk-do,1903.0,15.06,924.0,7.3,220.0,1.7,828.0,6.6,979.0,7.7
2,1/1/2000,Chungcheongnam-do,2398.0,14.75,1466.0,9.0,321.0,2.0,1055.0,6.5,932.0,5.7
3,1/1/2000,Daegu,3057.0,14.39,1117.0,5.3,422.0,2.0,1577.0,7.4,1940.0,9.1
4,1/1/2000,Daejeon,1859.0,16.08,565.0,4.9,280.0,2.4,868.0,7.5,1294.0,11.2


In [84]:
data_awal[data_awal.duplicated()]

Unnamed: 0,Date,Region,Birth,Birth_rate,Death,Death_rate,Divorce,Divorce_rate,Marriage,Marriage_rate,Natural_growth,Natural_growth_rate


In [85]:
data_awal.duplicated().sum()

0

## Standarisasi data agar hasil yang akurat

In [86]:
data_standarisasi = data_awal.copy()

In [87]:
np.std(data_standarisasi)

  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


Birth                  8335.745822
Birth_rate                2.320959
Death                  4959.390538
Death_rate                1.563972
Divorce                2256.686006
Divorce_rate              0.478854
Marriage               5706.513865
Marriage_rate             1.435935
Natural_growth         4099.964114
Natural_growth_rate       3.211585
dtype: float64

In [88]:
standard_scaler = StandardScaler()

In [89]:
x_standard = standard_scaler.fit_transform(
    data_standarisasi[[
        'Birth','Birth_rate','Death',
        'Death_rate', 'Divorce','Divorce_rate','Marriage','Marriage_rate',
        'Natural_growth', 'Natural_growth_rate']]
)

In [90]:
np.std(x_standard)

1.0

In [91]:
data_std = pd.DataFrame(x_standard)

In [92]:
data_std.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.036706,1.237475,-0.129069,-0.036783,-0.1305,0.310343,-0.051868,1.317293,0.080326,0.909071
1,-0.258522,2.723929,-0.320827,0.922313,-0.393718,-1.360312,-0.333476,0.690524,-0.1387,1.500679
2,-0.199139,2.590364,-0.211539,2.009289,-0.348962,-0.733817,-0.293697,0.620883,-0.150164,0.877934
3,-0.120082,2.435255,-0.281911,-0.356482,-0.304206,-0.733817,-0.202222,1.247652,0.095692,1.936601
4,-0.2638,3.163403,-0.393215,-0.612241,-0.36713,0.101511,-0.326466,1.317293,-0.06187,2.590484


## Data Splitting

In [126]:
X = data_std[[0,1,2,3,4,5,6,7,8,9]]
Y = data_awal['Region']

In [127]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3 , random_state = 1)

## ALgoritma KNN

In [128]:
KNN = KNeighborsClassifier(n_neighbors = 3)

In [129]:
KNN.fit(X_train, Y_train)

In [130]:
KNN.score(X_train, Y_train)

0.7610229276895943

In [131]:
KNN.score(X_test, Y_test)

0.522633744855967

## Algoritma Naive Bayes

In [132]:
gnb = GaussianNB().fit(X_train, Y_train)

In [133]:
gnb_predict = gnb.predict(X_test)

In [134]:
accuracy_score(gnb_predict, Y_test)

0.696159122085048

## Algoritma Decision Tree

In [135]:
tree_model = DecisionTreeClassifier().fit(X_train,Y_train)

In [136]:
tier = [[1,2,3,4,5,6,7,8,9,0]]
tree_model.predict(tier)

array(['Whole country'], dtype=object)

In [137]:
print("Score Training Set Tree = ", tree_model.score(X_train, Y_train))

Score Training Set Tree =  1.0


In [138]:
print("Score Testing Set Tree = ", tree_model.score(X_test, Y_test))

Score Testing Set Tree =  0.8374485596707819


## Visualisasi menggunakan Tree untuk Regression

In [139]:
plt.figure(figsize=(15,10))
tree_model.plot_tree(tree,
               feature_names = X.columns,
               filled=True,
               class_names = Y.unique())
plt.show()

NameError: name 'modelDTR' is not defined

## Perbandingan Score KNN dengan Score Naive Bayes

In [145]:
print("Hasil Score Algoritma KNN : ")
print("Training set KNN = ",KNN.score(X_train, Y_train))
print("Testing set KNN = ",KNN.score(X_test, Y_test))
print("Hasil Score Algoritma Naive Bayes : ")
print("Prediksi Akurasi Naive Bayes = ",accuracy_score(gnb_predict, Y_test))

Hasil Score Algoritma KNN : 
Training set KNN =  0.7610229276895943
Testing set KNN =  0.522633744855967
Hasil Score Algoritma Naive Bayes : 
Prediksi Akurasi Naive Bayes =  0.696159122085048
