In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
df = pd.read_csv("Dataset/dataset_fix.csv")
df

Unnamed: 0,Kalimat #,Word,Tag
0,0,tiap,O
1,0,makan,O
2,0,kesini,O
3,0,ngga,O
4,0,pernah,O
...,...,...,...
50531,859,portions,I-FOOD
50532,859,way,I-FOOD
50533,859,too,I-FOOD
50534,859,small,I-FOOD


In [3]:
df["Tag"].unique()

array(['O', 'B-FOOD', 'I-FOOD', 'B-MISCELLANEOUS', 'I-MISCELLANEOUS',
       'B-SERVICE', 'I-SERVICE', 'B-AMBIENCE', 'I-AMBIENCE', 'B-PRICE',
       'I-PRICE'], dtype=object)

In [4]:
df.groupby("Tag", sort=False).count()

Unnamed: 0_level_0,Kalimat #,Word
Tag,Unnamed: 1_level_1,Unnamed: 2_level_1
O,35068,35065
B-FOOD,1973,1973
I-FOOD,6799,6799
B-MISCELLANEOUS,551,551
I-MISCELLANEOUS,1992,1992
B-SERVICE,358,358
I-SERVICE,1306,1306
B-AMBIENCE,392,392
I-AMBIENCE,1247,1247
B-PRICE,217,217


In [5]:
dic={}
for i, tag in enumerate(df.Tag.unique()):
    dic[tag] = i
dic

{'O': 0,
 'B-FOOD': 1,
 'I-FOOD': 2,
 'B-MISCELLANEOUS': 3,
 'I-MISCELLANEOUS': 4,
 'B-SERVICE': 5,
 'I-SERVICE': 6,
 'B-AMBIENCE': 7,
 'I-AMBIENCE': 8,
 'B-PRICE': 9,
 'I-PRICE': 10}

In [6]:
labels = df["Tag"].apply(lambda x:dic[x])
labels

0        0
1        0
2        0
3        0
4        0
        ..
50531    2
50532    2
50533    2
50534    2
50535    0
Name: Tag, Length: 50536, dtype: int64

In [7]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()
list_kalimat = []

for i in range(df["Kalimat #"].min(),df["Kalimat #"].max()+1):
    list_kata = ["<S>"]
    for kata in df[df["Kalimat #"] == i]["Word"]:
        list_kata.append(stemmer.stem(str(kata)))
    list_kata.append("</S>")
    list_kalimat.append(list_kata)

In [29]:
stemmer.stem("enakkkkkk")

'enakkkkkk'

In [9]:
list_kalimat_join = []
for kalimat in list_kalimat:
    list_kalimat_join.append(" ".join(kalimat))

In [10]:
print(list_kalimat[0])

['<S>', 'tiap', 'makan', 'kesini', 'ngga', 'pernah', 'cuma', '1', 'atau', '2', 'porsi', 'pasti', 'nambah', 'terus', 'karena', 'emang', 'pas', 'banget', 'rasa', 'di', 'lidah', 'black', 'peppernya', 'yang', 'paling', 'enak', 'sini', 'emang', 'selalu', 'waiting', 'list', 'cuma', 'sei', 'sapi', 'lamalera', 'absolutely', 'worth', 'to', 'wait', 'recommended', '</S>']


In [11]:
list_kalimat_join[0]

'<S> tiap makan kesini ngga pernah cuma 1 atau 2 porsi pasti nambah terus karena emang pas banget rasa di lidah black peppernya yang paling enak sini emang selalu waiting list cuma sei sapi lamalera absolutely worth to wait recommended </S>'

In [12]:
trigram = []
for kalimat in (list_kalimat):
    for i in range(len(kalimat)):
        if i > 0 and i < len(kalimat)-1:
            trigram.append([kalimat[i-1], kalimat[i], kalimat[i+1]])

In [13]:
NUM_WORDS=10000
tokenizer = Tokenizer(num_words=NUM_WORDS,filters='!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n\'',
                      lower=True)
tokenizer.fit_on_texts(trigram)
sequences_train = tokenizer.texts_to_sequences(trigram)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6527 unique tokens.


In [14]:
# word_index

In [15]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
toEncode = labels.values.reshape(-1, 1)
enc = enc.fit(toEncode)
enc.get_feature_names()

array(['x0_0', 'x0_1', 'x0_2', 'x0_3', 'x0_4', 'x0_5', 'x0_6', 'x0_7',
       'x0_8', 'x0_9', 'x0_10'], dtype=object)

In [16]:
Encoded = enc.transform(toEncode).toarray()
Encoded

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [17]:
df_trigram = pd.DataFrame(columns=["Trigram", "Label", "Trigram Encoded", "Label Encoded"], data=zip(trigram, df["Tag"].values, sequences_train, Encoded))
df_trigram

Unnamed: 0,Trigram,Label,Trigram Encoded,Label Encoded
0,"[<S>, tiap, makan]",O,"[17, 251, 7]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[tiap, makan, kesini]",O,"[251, 7, 57]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[makan, kesini, ngga]",O,"[7, 57, 1026]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[kesini, ngga, pernah]",O,"[57, 1026, 181]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[ngga, pernah, cuma]",O,"[1026, 181, 100]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
50531,"[the, portions, way]",I-FOOD,"[4, 1610, 710]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50532,"[portions, way, too]",I-FOOD,"[1610, 710, 153]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50533,"[way, too, small]",I-FOOD,"[710, 153, 426]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50534,"[too, small, p]",I-FOOD,"[153, 426, 3125]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [18]:
X_train, X_test, y_train, y_test = train_test_split(df_trigram["Trigram Encoded"].values, df_trigram["Label Encoded"].values, test_size=0.2, random_state=1301170066, shuffle=True, stratify=labels.values)

In [19]:
X_train = np.array([np.array(x).astype('float32') for x in X_train])
y_train = np.array([np.array(x).astype('float32') for x in y_train])

X_test = np.array([np.array(x).astype('float32') for x in X_test])
y_test = np.array([np.array(x).astype('float32') for x in y_test])

In [20]:
print('Shape of X train:', X_train.shape)
print('Shape of label train:', y_train.shape)

print('Shape of X test:', X_test.shape)
print('Shape of label test:', y_test.shape)

Shape of X train: (40428, 3)
Shape of label train: (40428, 11)
Shape of X test: (10108, 3)
Shape of label test: (10108, 11)


In [21]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

idwiki_300 = Word2Vec.load("Model/idwiki_word2vec_300.model")
word_vectors = idwiki_300.wv

EMBEDDING_DIM = 300
vocabulary_size = min(len(word_index)+1,NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(word_vectors)

from keras.layers import Embedding
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

In [22]:
X_train.shape

(40428, 3)

In [23]:
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout, concatenate, LSTM
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
sequence_length = X_train.shape[1]
filter_sizes = [1,2,3]
num_filters = 100
drop = 0.5

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

conv_0 = Conv2D(num_filters, (filter_sizes[0], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_1 = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_2 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)

maxpool_0 = MaxPooling2D((sequence_length - filter_sizes[0] + 1, 1), strides=(1,1))(conv_0)
maxpool_1 = MaxPooling2D((sequence_length - filter_sizes[1] + 1, 1), strides=(1,1))(conv_1)
maxpool_2 = MaxPooling2D((sequence_length - filter_sizes[2] + 1, 1), strides=(1,1))(conv_2)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
flatten = Flatten()(merged_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=11, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

# this creates a model that includes
model = Model(inputs, output)

In [24]:
adam = Adam(lr=1e-3)

model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3, 300)       1958400     input_1[0][0]                    
__________________________________________________________________________________________________
reshape (Reshape)               (None, 3, 300, 1)    0           embedding[0][0]                  
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 3, 1, 100)    30100       reshape[0][0]                    
______________________________________________________________________________________________

In [25]:
callbacks = [EarlyStopping(monitor='val_loss')]
model.fit(X_train, y_train, 
          batch_size=100, 
          epochs=100, 
          verbose=1, 
          validation_data=(X_test, y_test),
          callbacks=callbacks
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


<tensorflow.python.keras.callbacks.History at 0x2ba1ffc0d48>

In [26]:
y_pred = model.predict(X_test)

In [27]:
y_test_argmax = [np.argmax(i) for i in y_test]
y_pred_argmax = [np.argmax(i) for i in y_pred]

In [28]:
from sklearn.metrics import classification_report, make_scorer, f1_score

print(classification_report(
    y_test_argmax, y_pred_argmax, labels=[1,2,3,4,5,6,7,8,9,10], digits=3))
print("f1 score:", f1_score(y_test_argmax, y_pred_argmax, average='micro'))

              precision    recall  f1-score   support

           1      0.557     0.334     0.418       395
           2      0.627     0.520     0.569      1360
           3      0.600     0.027     0.052       110
           4      0.545     0.195     0.288       399
           5      0.660     0.431     0.521        72
           6      0.695     0.410     0.516       261
           7      0.739     0.218     0.337        78
           8      0.593     0.434     0.501       249
           9      0.793     0.535     0.639        43
          10      0.746     0.346     0.473       127

   micro avg      0.623     0.404     0.490      3094
   macro avg      0.656     0.345     0.431      3094
weighted avg      0.621     0.404     0.475      3094

f1 score: 0.7766125840918084
