In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
df = pd.read_csv("Dataset/dataset_fix.csv")
df

Unnamed: 0,Kalimat #,Word,Tag
0,0,tiap,O
1,0,makan,O
2,0,kesini,O
3,0,ngga,O
4,0,pernah,O
...,...,...,...
50531,859,portions,I-FOOD
50532,859,way,I-FOOD
50533,859,too,I-FOOD
50534,859,small,I-FOOD


In [3]:
dic={}
for i, tag in enumerate(df.Tag.unique()):
    dic[tag] = i
dic

{'O': 0,
 'B-FOOD': 1,
 'I-FOOD': 2,
 'B-MISCELLANEOUS': 3,
 'I-MISCELLANEOUS': 4,
 'B-SERVICE': 5,
 'I-SERVICE': 6,
 'B-AMBIENCE': 7,
 'I-AMBIENCE': 8,
 'B-PRICE': 9,
 'I-PRICE': 10}

In [4]:
labels = df["Tag"].apply(lambda x:dic[x])
labels

0        0
1        0
2        0
3        0
4        0
        ..
50531    2
50532    2
50533    2
50534    2
50535    0
Name: Tag, Length: 50536, dtype: int64

In [5]:
list_kalimat = []
for i in range(df["Kalimat #"].min(),df["Kalimat #"].max()+1):
    list_kata = ["<S>"]
    for kata in df[df["Kalimat #"] == i]["Word"]:
        list_kata.append(str(kata))
    list_kata.append("</S>")
    list_kalimat.append(list_kata)

In [6]:
list_kalimat_join = []
for kalimat in list_kalimat:
    list_kalimat_join.append(" ".join(kalimat))

In [7]:
print(list_kalimat[0])

['<S>', 'tiap', 'makan', 'kesini', 'ngga', 'pernah', 'cuma', '1', 'atau', '2', 'porsi', 'pasti', 'nambah', 'terus', 'karena', 'emang', 'pas', 'banget', 'rasanya', 'di', 'lidah', 'black', 'peppernya', 'yang', 'paling', 'enak', 'disini', 'emang', 'selalu', 'waiting', 'list', 'cuma', 'sei', 'sapi', 'lamalera', 'absolutely', 'worth', 'to', 'wait', 'recommended', '</S>']


In [8]:
list_kalimat_join[0]

'<S> tiap makan kesini ngga pernah cuma 1 atau 2 porsi pasti nambah terus karena emang pas banget rasanya di lidah black peppernya yang paling enak disini emang selalu waiting list cuma sei sapi lamalera absolutely worth to wait recommended </S>'

In [9]:
trigram = []
for kalimat in (list_kalimat):
    for i in range(len(kalimat)):
        if i > 0 and i < len(kalimat)-1:
            trigram.append([kalimat[i-1], kalimat[i], kalimat[i+1]])

In [10]:
trigram[50532], df["Tag"][50532]

(['portions', 'way', 'too'], 'I-FOOD')

In [11]:
NUM_WORDS=10000
tokenizer = Tokenizer(num_words=NUM_WORDS,filters='!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n\'',
                      lower=True)
tokenizer.fit_on_texts(trigram)
sequences_train = tokenizer.texts_to_sequences(trigram)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 7554 unique tokens.


In [12]:
sequences_train[0]

[14, 575, 25]

In [13]:
word_index

{'dan': 1,
 'yang': 2,
 'the': 3,
 'enak': 4,
 'di': 5,
 'nya': 6,
 'juga': 7,
 'banget': 8,
 'and': 9,
 'yg': 10,
 'ini': 11,
 'ada': 12,
 'untuk': 13,
 '<s>': 14,
 '</s>': 15,
 'tapi': 16,
 'ga': 17,
 'saya': 18,
 'to': 19,
 'i': 20,
 'is': 21,
 'bandung': 22,
 'dengan': 23,
 'a': 24,
 'makan': 25,
 'buat': 26,
 'sih': 27,
 'ke': 28,
 'karena': 29,
 'rasa': 30,
 'sama': 31,
 'was': 32,
 'tempatnya': 33,
 'dari': 34,
 'tempat': 35,
 'banyak': 36,
 'pas': 37,
 'of': 38,
 'rasanya': 39,
 'tidak': 40,
 'jadi': 41,
 'it': 42,
 'menu': 43,
 'aku': 44,
 'for': 45,
 'bgt': 46,
 'disini': 47,
 'lagi': 48,
 'makanan': 49,
 'agak': 50,
 'bisa': 51,
 'kalo': 52,
 'aja': 53,
 'kesini': 54,
 'with': 55,
 'in': 56,
 'suka': 57,
 'place': 58,
 'but': 59,
 'good': 60,
 'pesen': 61,
 'so': 62,
 'harga': 63,
 'lebih': 64,
 'itu': 65,
 'ya': 66,
 'cukup': 67,
 'terlalu': 68,
 'kalau': 69,
 'lumayan': 70,
 'not': 71,
 'food': 72,
 'kurang': 73,
 'sangat': 74,
 'you': 75,
 'mau': 76,
 'paling': 77,
 'nyam

In [14]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
toEncode = labels.values.reshape(-1, 1)
enc = enc.fit(toEncode)
enc.get_feature_names()

array(['x0_0', 'x0_1', 'x0_2', 'x0_3', 'x0_4', 'x0_5', 'x0_6', 'x0_7',
       'x0_8', 'x0_9', 'x0_10'], dtype=object)

In [15]:
Encoded = enc.transform(toEncode).toarray()
Encoded

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [16]:
df_trigram = pd.DataFrame(columns=["Trigram", "Label", "Trigram Encoded", "Label Encoded"], data=zip(trigram, df["Tag"].values, sequences_train, Encoded))
df_trigram

Unnamed: 0,Trigram,Label,Trigram Encoded,Label Encoded
0,"[<S>, tiap, makan]",O,"[14, 575, 25]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[tiap, makan, kesini]",O,"[575, 25, 54]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[makan, kesini, ngga]",O,"[25, 54, 1055]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[kesini, ngga, pernah]",O,"[54, 1055, 173]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[ngga, pernah, cuma]",O,"[1055, 173, 95]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
50531,"[the, portions, way]",I-FOOD,"[3, 1711, 696]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50532,"[portions, way, too]",I-FOOD,"[1711, 696, 147]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50533,"[way, too, small]",I-FOOD,"[696, 147, 398]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50534,"[too, small, p]",I-FOOD,"[147, 398, 3477]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [17]:
X_train, X_test, y_train, y_test = train_test_split(df_trigram["Trigram Encoded"].values, df_trigram["Label Encoded"].values, test_size=0.2, random_state=1301170066, shuffle=True, stratify=labels.values)

In [18]:
X_train = np.array([np.array(x).astype('float32') for x in X_train])
y_train = np.array([np.array(x).astype('float32') for x in y_train])

X_test = np.array([np.array(x).astype('float32') for x in X_test])
y_test = np.array([np.array(x).astype('float32') for x in y_test])

In [19]:
print('Shape of X train:', X_train.shape)
print('Shape of label train:', y_train.shape)

print('Shape of X test:', X_test.shape)
print('Shape of label test:', y_test.shape)

Shape of X train: (40428, 3)
Shape of label train: (40428, 11)
Shape of X test: (10108, 3)
Shape of label test: (10108, 11)


In [20]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

idwiki_300 = Word2Vec.load("Model/idwiki_word2vec_300.model")
word_vectors = idwiki_300.wv

EMBEDDING_DIM = 300
vocabulary_size = min(len(word_index)+1,NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(word_vectors)

from keras.layers import Embedding
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

In [21]:
X_train.shape

(40428, 3)

In [41]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Dropout, LSTM, AveragePooling1D
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
sequence_length = X_train.shape[1]
filter_sizes = [1,2,3]
num_filters = 100
drop = 0.5

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
# reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

lstm = LSTM(num_filters, dropout=0.2, recurrent_dropout=0.2, return_sequences=True, go_backwards=False)(embedding)
AvPool = AveragePooling1D(pool_size=(2), strides=None)(lstm)
flatten = Flatten()(AvPool)
# dropout = Dropout(drop)(flatten)
output = Dense(units=11, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(flatten)

# this creates a model that includes
model = Model(inputs, output)

In [42]:
adam = Adam(lr=1e-3)

model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])
callbacks = [EarlyStopping(monitor='val_loss')]
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 3)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 3, 300)            2266500   
_________________________________________________________________
lstm_6 (LSTM)                (None, 3, 100)            160400    
_________________________________________________________________
average_pooling1d_1 (Average (None, 1, 100)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 11)                1111      
Total params: 2,428,011
Trainable params: 2,428,011
Non-trainable params: 0
_________________________________________________

In [43]:
model.fit(X_train, y_train, 
          batch_size=100, 
          epochs=100, 
          verbose=1, 
          validation_data=(X_test, y_test),
          callbacks=callbacks
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


<tensorflow.python.keras.callbacks.History at 0x14d0b2f7b88>

In [31]:
y_pred = model.predict(X_test)

In [32]:
y_test_argmax = [np.argmax(i) for i in y_test]
y_pred_argmax = [np.argmax(i) for i in y_pred]

In [33]:
from sklearn.metrics import classification_report, make_scorer, f1_score

print(classification_report(
    y_test_argmax, y_pred_argmax,
    labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
    digits=3))
print("f1 score:", f1_score(y_test_argmax, y_pred_argmax, average='micro'))

              precision    recall  f1-score   support

           1      0.581     0.319     0.412       395
           2      0.641     0.619     0.630      1360
           3      0.579     0.100     0.171       110
           4      0.492     0.393     0.437       399
           5      0.727     0.333     0.457        72
           6      0.665     0.571     0.614       261
           7      0.630     0.372     0.468        78
           8      0.605     0.450     0.516       249
           9      0.875     0.326     0.475        43
          10      0.664     0.559     0.607       127

   micro avg      0.619     0.496     0.551      3094
   macro avg      0.646     0.404     0.479      3094
weighted avg      0.617     0.496     0.539      3094

f1 score: 0.7912544519192719
