In [1]:
import numpy as np
import pandas as pd

# Import the Data

In [2]:
alamat = pd.read_excel("Alamat dan Kelurahan.xlsx")

In [3]:
alamat.head()

Unnamed: 0,jalan,nm_kelurahan
0,RUSUNAWA,MARUNDA
1,~KALIBARU BARAT NO. 22 RT. 005 RW. 005,KALIBARU
2,JL. MALAKA JAYA NO.4 RT. 005 RW. 011,ROROTAN
3,"PD.PASAR JAYA A.L00.ATD.070, RAWABADAK",RAWA BADAK UTARA
4,ASRAMA DKI BLOK B NO. 18 RT. 004 RW. 003,SEMPER BARAT


In [4]:
alamat.describe()

Unnamed: 0,jalan,nm_kelurahan
count,330150,330182
unique,182708,13
top,RUSUNAWA MARUNDA,TUGU UTARA
freq,1103,40129


In [5]:
contoh = 'jl lembayung senja,nomor 27'

In [6]:
import string

# Build the sample address character by character, swapping every ASCII
# punctuation mark for a single space and keeping all other characters as-is.
nopunc = [' ' if char in string.punctuation else char for char in contoh]

In [7]:
nopunc = ''.join(nopunc)

In [8]:
nopunc

'jl lembayung senja nomor 27'

In [9]:
nopunc.split()

['jl', 'lembayung', 'senja', 'nomor', '27']

# Text Preprocessing

In [10]:
# Translation table that maps every ASCII punctuation character to a space.
# Built once so tokenising an address is a single str.translate call instead
# of a Python-level loop over its characters.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def clear_text(alamat):
    """Split an address into tokens, treating punctuation as whitespace.

    Every character in ``string.punctuation`` is replaced by a space and the
    result is split on runs of whitespace. Letter case is left untouched.

    Parameters
    ----------
    alamat : str or None
        The raw address text. ``None`` and float NaN are treated as a missing
        address; any other non-string value is converted with ``str()``.

    Returns
    -------
    list of str
        The tokens in their original order. Empty when the address is
        missing, empty, or made up only of punctuation/whitespace.
    """
    # NaN is the only float that is not equal to itself.
    if alamat is None or (isinstance(alamat, float) and alamat != alamat):
        return []
    if not isinstance(alamat, str):
        alamat = str(alamat)
    return alamat.translate(_PUNCT_TO_SPACE).split()

In [11]:
clear_text(contoh)

['jl', 'lembayung', 'senja', 'nomor', '27']

In [12]:
# Feature column: every address as a str for the tokenizer. Missing addresses
# are filled with '' first, because str() on a NaN yields the literal text
# 'nan', which would otherwise be counted as if it were a real address token.
X = alamat['jalan'].fillna('').apply(str)

In [13]:
X

0                                                RUSUNAWA
1                  ~KALIBARU BARAT NO. 22 RT. 005 RW. 005
2                    JL. MALAKA JAYA NO.4 RT. 005 RW. 011
3                  PD.PASAR JAYA A.L00.ATD.070, RAWABADAK
4                ASRAMA DKI BLOK B NO. 18 RT. 004 RW. 003
                               ...                       
330177                              JALAN MANTANG GANG IV
330178                                 JL F GG G 1 .NO:32
330179    RUSUN SUKAPURA JL. MANUNGGAL JUANG II LT.3 NO.5
330180                                         ASRAMA DKI
330181                                 KP.BULAK JL.KURNIA
Name: jalan, Length: 330182, dtype: object

In [14]:
y = alamat['nm_kelurahan']

# Train Test Split

In [15]:
from sklearn.model_selection import train_test_split



In [16]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split, and every metric computed from it, is the same on each re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Model

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [18]:
from sklearn.pipeline import Pipeline

# Address -> kelurahan classifier, assembled as a single estimator so that
# tokenising, weighting and classification are always applied together.
kelurahan_model = Pipeline(steps=[
    # Bag-of-words: tokenise each address with clear_text and count the tokens.
    ('bow', CountVectorizer(analyzer=clear_text)),
    # Re-weight the raw token counts as TF-IDF scores.
    ('tfidf', TfidfTransformer()),
    # Multinomial Naive Bayes fitted on the TF-IDF features.
    ('classifier', MultinomialNB()),
])

In [19]:
kelurahan_model.fit(X_train,y_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function clear_text at 0x000002D3EDD873A0>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [20]:
pred = kelurahan_model.predict(X_test)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix

In [22]:
# Show the confusion matrix, a gap of blank lines, then the per-class
# precision/recall/F1 report for the held-out rows.
print(
    confusion_matrix(y_test, pred),
    '\n',
    classification_report(y_test, pred),
    sep='\n',
)

[[3515  110    2    9   32    1   67   33   73   73   51    1   20]
 [  51 5845    1    3    0    1    9    2    6    2   11    0    2]
 [   3    6 2572   25    0   16   93    0   10    1    3   34   56]
 [  40    7   11 5937    2    1   53    0  101    2    7   41  228]
 [  81    3    0    1 2447    2    8    8    7    1    9    1   16]
 [   6    3    5    9    0 1452  534    0    1    1    2  423   89]
 [   5    4   38  134    1   53 5487    1   44    2    5  310  221]
 [  11    2    0    4    5    0    7 4057    6    1   19    0   18]
 [  23    7    8   95    2    1   51    4 7389   37  318    8  179]
 [  23    4    0    6    5    2   26    2  146 3857   36    3   13]
 [  17   10    0    8    9    4   29    9   48    9 7008    4   34]
 [   5    4    8    3    0  101  132    1   18    6   18 3442  178]
 [   1    5    6  111    1   20   76    4   90    1    7  120 7532]]


                    precision    recall  f1-score   support

         CILINCING       0.93      0.88      0.90   

# Terapkan pada SE-53

In [23]:
validasi = pd.read_excel('045_SE-53 KPP KOJA.xlsx',index_col=0)

In [24]:
validasi.head()

Unnamed: 0_level_0,ALAMAT,JALAN,KELURAHAN,KECAMATAN,KOTA,PROVINSI,JALAN.1,KELURAHAN.1
NPWP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
575072129045000,"JL CIPEUCANG IV NO 31, KOJA",JALAN CIPEUCANG,KOJA,,,,JL CIPEUCANG IV,
575100359045000,"KESATRIAN NO 10, CILINCING",,,,,,KESATRIAN,
575124631045000,"LORONG BLOK II G NO 19, KOJA",LORONG BLO,KOJA,,,,LORONG,
575125091045000,"KP PEDONGKELAN NO 0, CILINCING",,,,,,KP PEDONGKELAN,
575125513045000,"JL DELI LORONG NO 4, KOJA",LORONG NO,KOJA,,,,JL DELI LORONG,


In [25]:
# Predict a kelurahan for each SE-53 address (converted to str first) and
# store the result as a new column alongside the original data.
validasi['KELURAHAN_FIX'] = kelurahan_model.predict(validasi['ALAMAT'].map(str))

In [26]:
validasi.head()

Unnamed: 0_level_0,ALAMAT,JALAN,KELURAHAN,KECAMATAN,KOTA,PROVINSI,JALAN.1,KELURAHAN.1,KELURAHAN_FIX
NPWP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
575072129045000,"JL CIPEUCANG IV NO 31, KOJA",JALAN CIPEUCANG,KOJA,,,,JL CIPEUCANG IV,,KOJA
575100359045000,"KESATRIAN NO 10, CILINCING",,,,,,KESATRIAN,,CILINCING
575124631045000,"LORONG BLOK II G NO 19, KOJA",LORONG BLO,KOJA,,,,LORONG,,KOJA
575125091045000,"KP PEDONGKELAN NO 0, CILINCING",,,,,,KP PEDONGKELAN,,CILINCING
575125513045000,"JL DELI LORONG NO 4, KOJA",LORONG NO,KOJA,,,,JL DELI LORONG,,KOJA


In [27]:
validasi.to_csv('SE-53 KPP 045.csv',sep=';')

In [28]:
validasi.to_excel('SE-53 KPP 045.xlsx')

In [29]:
kelurahan_model.predict(['RUSUN SUKAPURA JL. MANUNGGAL JUANG II LT.3 NO.5'])

array(['SUKAPURA'], dtype='<U18')

In [30]:
kelurahan_model.predict(['JL MAWAR VI'])

array(['TUGU UTARA'], dtype='<U18')

In [31]:
len(validasi)

8877

In [32]:
import joblib

In [33]:
# Persist the fitted pipeline to disk. The pipeline's CountVectorizer refers
# to the notebook-level clear_text function, and pickling stores a function
# by name rather than by value, so a session that loads this file needs
# clear_text defined under the same name before loading.
joblib.dump(kelurahan_model, 'kelurahan_model.sav')

['kelurahan_model.sav']

In [40]:
# Reload the persisted pipeline from disk. joblib.load unpickles the file,
# so it should only be used on files that come from a trusted source.
load_model = joblib.load('kelurahan_model.sav')

In [37]:
pred2 = load_model.predict(X_test)

In [38]:
print(classification_report(y_test,pred2))

                    precision    recall  f1-score   support

         CILINCING       0.93      0.88      0.90      3987
          KALIBARU       0.97      0.99      0.98      5933
              KOJA       0.97      0.91      0.94      2819
             LAGOA       0.94      0.92      0.93      6430
           MARUNDA       0.98      0.95      0.96      2584
RAWA BADAK SELATAN       0.88      0.58      0.69      2525
  RAWA BADAK UTARA       0.83      0.87      0.85      6305
           ROROTAN       0.98      0.98      0.98      4130
      SEMPER BARAT       0.93      0.91      0.92      8122
      SEMPER TIMUR       0.97      0.94      0.95      4123
          SUKAPURA       0.94      0.97      0.95      7189
      TUGU SELATAN       0.78      0.88      0.83      3916
        TUGU UTARA       0.88      0.94      0.91      7974

          accuracy                           0.92     66037
         macro avg       0.92      0.90      0.91     66037
      weighted avg       0.92      0.9

In [39]:
pred[0]

'SEMPER BARAT'

In [41]:
# SE-53 addresses as a NumPy array. Each value is converted to str, the same
# way the KELURAHAN_FIX column was computed, because the tokenizer iterates
# over characters and would raise on a NaN or numeric cell.
address = validasi['ALAMAT'].apply(str).values

In [44]:
preds = kelurahan_model.predict(address)

In [46]:
pd.DataFrame({'Alamat': address, 'Kelurahan':preds})

Unnamed: 0,Alamat,Kelurahan
0,"JL CIPEUCANG IV NO 31, KOJA",KOJA
1,"KESATRIAN NO 10, CILINCING",CILINCING
2,"LORONG BLOK II G NO 19, KOJA",KOJA
3,"KP PEDONGKELAN NO 0, CILINCING",CILINCING
4,"JL DELI LORONG NO 4, KOJA",KOJA
...,...,...
8872,LORONG O PETAK BLOK --- KAV --- NO 26 RT 006 ...,KOJA
8873,JL PEMBANGUNAN III/19 RT 007 RW 009,RAWA BADAK UTARA
8874,JL F GANG K NO 11 RT 009 RW 002,RAWA BADAK UTARA
8875,PLUMPANG B RT 07 RW 05,RAWA BADAK UTARA
