In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
df = pd.read_csv("Dataset/dataset_fix.csv")
df

Unnamed: 0,Kalimat #,Word,Tag
0,0,tiap,O
1,0,makan,O
2,0,kesini,O
3,0,ngga,O
4,0,pernah,O
...,...,...,...
50531,859,portions,I-FOOD
50532,859,way,I-FOOD
50533,859,too,I-FOOD
50534,859,small,I-FOOD


In [3]:
df["Word"][1024] = "lokasi"
df["Word"][1025] = "di"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [4]:
dic={}
for i, tag in enumerate(df.Tag.unique()):
    dic[tag] = i
dic

{'O': 0,
 'B-FOOD': 1,
 'I-FOOD': 2,
 'B-MISCELLANEOUS': 3,
 'I-MISCELLANEOUS': 4,
 'B-SERVICE': 5,
 'I-SERVICE': 6,
 'B-AMBIENCE': 7,
 'I-AMBIENCE': 8,
 'B-PRICE': 9,
 'I-PRICE': 10}

In [5]:
labels = df["Tag"].apply(lambda x:dic[x])
labels

0        0
1        0
2        0
3        0
4        0
        ..
50531    2
50532    2
50533    2
50534    2
50535    0
Name: Tag, Length: 50536, dtype: int64

In [6]:
list_kalimat = []
for i in range(df["Kalimat #"].min(),df["Kalimat #"].max()+1):
    list_kata = ["<S>"]
    for kata in df[df["Kalimat #"] == i]["Word"]:
        list_kata.append(str(kata))
    list_kata.append("</S>")
    list_kalimat.append(list_kata)

In [7]:
list_kalimat_join = []
for kalimat in list_kalimat:
    list_kalimat_join.append(" ".join(kalimat))

In [8]:
print(list_kalimat[0])

['<S>', 'tiap', 'makan', 'kesini', 'ngga', 'pernah', 'cuma', '1', 'atau', '2', 'porsi', 'pasti', 'nambah', 'terus', 'karena', 'emang', 'pas', 'banget', 'rasanya', 'di', 'lidah', 'black', 'peppernya', 'yang', 'paling', 'enak', 'disini', 'emang', 'selalu', 'waiting', 'list', 'cuma', 'sei', 'sapi', 'lamalera', 'absolutely', 'worth', 'to', 'wait', 'recommended', '</S>']


In [9]:
df[df["Kalimat #"] == 234]

Unnamed: 0,Kalimat #,Word,Tag
13049,234,tempat,B-AMBIENCE
13050,234,yg,I-AMBIENCE
13051,234,cozy,I-AMBIENCE
13052,234,dengan,O
13053,234,udara,O
13054,234,terbuka,O
13055,234,yg,O
13056,234,paling,O
13057,234,penting,O
13058,234,makanannya,B-FOOD


In [10]:
list_kalimat_join[234]

'<S> tempat yg cozy dengan udara terbuka yg paling penting makanannya enak semuasebagai penggemar sayur menu harganya di sini pas salad porsi kecil dikisaran 2025 rb itu pas banget yg jelas enakpastry yg digulung bersama bayam creamy cruchy sesuatu bangetlove this place bakalan balik utk saladnya mencoba menu lainnya </S>'

In [11]:
trigram = []
for kalimat in (list_kalimat):
    for i in range(len(kalimat)):
        if i > 0 and i < len(kalimat)-1:
            trigram.append([kalimat[i-1], kalimat[i], kalimat[i+1]])

In [12]:
last3 = []
last2 = []
for i in trigram:
    last3.append(i[1][-3:])
    last2.append(i[1][-2:])

In [13]:
NUM_WORDS=10000
tokenizer = Tokenizer(num_words=NUM_WORDS,filters='!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n\'',
                      lower=True)
tokenizer.fit_on_texts(trigram)
sequences_train = tokenizer.texts_to_sequences(trigram)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 7553 unique tokens.


In [14]:
word_index

{'dan': 1,
 'yang': 2,
 'the': 3,
 'enak': 4,
 'di': 5,
 'nya': 6,
 'juga': 7,
 'banget': 8,
 'and': 9,
 'yg': 10,
 'ini': 11,
 'ada': 12,
 'untuk': 13,
 '<s>': 14,
 '</s>': 15,
 'tapi': 16,
 'ga': 17,
 'saya': 18,
 'to': 19,
 'i': 20,
 'is': 21,
 'bandung': 22,
 'dengan': 23,
 'a': 24,
 'makan': 25,
 'buat': 26,
 'sih': 27,
 'ke': 28,
 'karena': 29,
 'rasa': 30,
 'sama': 31,
 'was': 32,
 'tempatnya': 33,
 'dari': 34,
 'tempat': 35,
 'banyak': 36,
 'pas': 37,
 'of': 38,
 'rasanya': 39,
 'tidak': 40,
 'jadi': 41,
 'it': 42,
 'menu': 43,
 'aku': 44,
 'for': 45,
 'bgt': 46,
 'disini': 47,
 'lagi': 48,
 'makanan': 49,
 'agak': 50,
 'bisa': 51,
 'kalo': 52,
 'aja': 53,
 'kesini': 54,
 'with': 55,
 'in': 56,
 'suka': 57,
 'place': 58,
 'but': 59,
 'good': 60,
 'pesen': 61,
 'so': 62,
 'harga': 63,
 'lebih': 64,
 'itu': 65,
 'ya': 66,
 'cukup': 67,
 'terlalu': 68,
 'kalau': 69,
 'lumayan': 70,
 'not': 71,
 'food': 72,
 'kurang': 73,
 'sangat': 74,
 'you': 75,
 'mau': 76,
 'paling': 77,
 'nyam

In [15]:
NUM_WORDS_f=5000
tokenizer_f = Tokenizer(num_words=NUM_WORDS_f,filters='!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n\'',
                      lower=True)
tokenizer_f.fit_on_texts(last3 + last2)
sequences_last3 = tokenizer_f.texts_to_sequences(last3)
sequences_last2 = tokenizer_f.texts_to_sequences(last2)
word_index_f = tokenizer_f.word_index
print('Found %s unique tokens.' % len(word_index_f))

Found 2577 unique tokens.


In [16]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
toEncode = labels.values.reshape(-1, 1)
enc = enc.fit(toEncode)
enc.get_feature_names()

array(['x0_0', 'x0_1', 'x0_2', 'x0_3', 'x0_4', 'x0_5', 'x0_6', 'x0_7',
       'x0_8', 'x0_9', 'x0_10'], dtype=object)

In [17]:
Encoded = enc.transform(toEncode).toarray()
Encoded

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [18]:
df_trigram = pd.DataFrame(columns=["Trigram", 
                                   "Last 2 Character", 
                                   "Last 3 Character", 
                                   "Label", 
                                   "Trigram Encoded", 
                                   "Last 2 Character Encoded", 
                                   "Last 3 Character Encoded", 
                                   "Label Encoded"], 
                          data=zip(trigram, last2, last3, df["Tag"].values, sequences_train, sequences_last2, sequences_last3, Encoded))
df_trigram

Unnamed: 0,Trigram,Last 2 Character,Last 3 Character,Label,Trigram Encoded,Last 2 Character Encoded,Last 3 Character Encoded,Label Encoded
0,"[<S>, tiap, makan]",ap,iap,O,"[14, 575, 25]",[130],[355],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[tiap, makan, kesini]",an,kan,O,"[575, 25, 54]",[1],[21],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[makan, kesini, ngga]",ni,ini,O,"[25, 54, 1055]",[15],[16],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[kesini, ngga, pernah]",ga,gga,O,"[54, 1055, 173]",[8],[412],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[ngga, pernah, cuma]",ah,nah,O,"[1055, 173, 95]",[11],[248],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...
50531,"[the, portions, way]",ns,ons,I-FOOD,"[3, 1711, 696]",[372],[608],"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50532,"[portions, way, too]",ay,way,I-FOOD,"[1711, 696, 147]",[157],[403],"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50533,"[way, too, small]",oo,too,I-FOOD,"[696, 147, 398]",[269],[328],"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50534,"[too, small, p]",ll,all,I-FOOD,"[147, 398, 3476]",[50],[125],"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [19]:
X_train, X_test, y_train, y_test = train_test_split(df_trigram.index.values, 
                                                    df_trigram["Label Encoded"].values, 
                                                    test_size=0.2, 
                                                    random_state=1301170066, 
                                                    shuffle=True, 
                                                    stratify=labels.values)

In [20]:
X_train_trigram = df_trigram["Trigram Encoded"].iloc[X_train].values
X_test_trigram = df_trigram["Trigram Encoded"].iloc[X_test].values

X_train_feature = list(zip(df_trigram["Last 2 Character Encoded"].iloc[X_train].values, 
                     df_trigram["Last 3 Character Encoded"].iloc[X_train].values))
X_test_feature = list(zip(df_trigram["Last 2 Character Encoded"].iloc[X_test].values, 
                     df_trigram["Last 3 Character Encoded"].iloc[X_test].values))

In [21]:
X_train_feature = [[i[0][0], i[1][0]] for i in X_train_feature]
X_test_feature = [[i[0][0], i[1][0]] for i in X_test_feature]

In [22]:
X_train_trigram = np.array([np.array(x).astype('float32') for x in X_train_trigram])
X_train_feature = np.array([np.array(x).astype('float32') for x in X_train_feature])
y_train = np.array([np.array(x).astype('float32') for x in y_train])

X_test_trigram = np.array([np.array(x).astype('float32') for x in X_test_trigram])
X_test_feature = np.array([np.array(x).astype('float32') for x in X_test_feature])
y_test = np.array([np.array(x).astype('float32') for x in y_test])

In [23]:
print('Shape of X train trigram:', X_train_trigram.shape)
print('Shape of X train features:', X_train_feature.shape)
print('Shape of label train:', y_train.shape)

print('Shape of X test trigram:', X_test_trigram.shape)
print('Shape of X test features:', X_test_feature.shape)
print('Shape of label test:', y_test.shape)

Shape of X train trigram: (40428, 3)
Shape of X train features: (40428, 2)
Shape of label train: (40428, 11)
Shape of X test trigram: (10108, 3)
Shape of X test features: (10108, 2)
Shape of label test: (10108, 11)


In [24]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

idwiki_300 = Word2Vec.load("Model/idwiki_word2vec_300.model")
word_vectors = idwiki_300.wv

EMBEDDING_DIM = 300
vocabulary_size = len(word_index)+1
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(word_vectors)

from keras.layers import Embedding
embedding_layer_trigram = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

In [25]:
idwiki_300 = Word2Vec.load("Model/idwiki_word2vec_300.model")
word_vectors = idwiki_300.wv

EMBEDDING_DIM = 300
vocabulary_size = len(word_index_f)+1
embedding_matrix_f = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index_f.items():
    if i>=NUM_WORDS_f:
        continue
    try:
        embedding_vector_f = word_vectors[word]
        embedding_matrix_f[i] = embedding_vector_f
    except KeyError:
        embedding_matrix_f[i] = np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(word_vectors)

from keras.layers import Embedding
embedding_layer_feature = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix_f],
                            trainable=True)

In [26]:
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout, concatenate, LSTM
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
sequence_length_trigram = X_train_trigram.shape[1]
sequence_length_feature = X_train_feature.shape[1]
filter_sizes = [1,2,3]
num_filters = 100
drop = 0.5

inputs_trigram = Input(shape=(sequence_length_trigram,))
embedding_trigram = embedding_layer_trigram(inputs_trigram)
reshape_trigram = Reshape((sequence_length_trigram,EMBEDDING_DIM,1))(embedding_trigram)

conv_0_trigram = Conv2D(num_filters, (filter_sizes[0], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape_trigram)
conv_1_trigram = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape_trigram)
conv_2_trigram = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape_trigram)

maxpool_0_trigram = MaxPooling2D((sequence_length_trigram - filter_sizes[0] + 1, 1), strides=(1,1))(conv_0_trigram)
maxpool_1_trigram = MaxPooling2D((sequence_length_trigram - filter_sizes[1] + 1, 1), strides=(1,1))(conv_1_trigram)
maxpool_2_trigram = MaxPooling2D((sequence_length_trigram - filter_sizes[2] + 1, 1), strides=(1,1))(conv_2_trigram)

inputs_feature = Input(shape=(sequence_length_feature,))
embedding_feature = embedding_layer_feature(inputs_feature)
reshape_feature = Reshape((sequence_length_feature,EMBEDDING_DIM,1))(embedding_feature)
conv_feature = Conv2D(num_filters, (filter_sizes[0], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape_feature)
maxpool_feature = MaxPooling2D((sequence_length_feature - filter_sizes[0] + 1, 1), strides=(1,1))(conv_feature)

merged_tensor = concatenate([maxpool_0_trigram, maxpool_1_trigram, maxpool_2_trigram, maxpool_feature], axis=1)
flatten = Flatten()(merged_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=11, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

# this creates a model that includes
model = Model(inputs=[inputs_trigram, inputs_feature], outputs=output)

In [27]:
adam = Adam(lr=1e-3)

model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])
callbacks = [EarlyStopping(monitor='val_loss')]
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 2)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3, 300)       2266200     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 2, 300)       773400      input_2[0][0]                    
______________________________________________________________________________________________

In [36]:
model.fit(x=[X_train_trigram, X_train_feature], y=y_train, 
          batch_size=100, 
          epochs=100, 
          verbose=1, 
          validation_data=([X_test_trigram, X_test_feature], y_test),
          callbacks=callbacks
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100


<tensorflow.python.keras.callbacks.History at 0x1ff78513d48>

In [37]:
y_pred = model.predict([X_test_trigram, X_test_feature])

In [38]:
y_test_argmax = [np.argmax(i) for i in y_test]
y_pred_argmax = [np.argmax(i) for i in y_pred]

In [44]:
from sklearn.metrics import classification_report, make_scorer, f1_score

print(classification_report(
    y_test_argmax, y_pred_argmax, labels=[1,2,3,4,5,6,7,8,9,10], digits=3))
print("f1 score:", f1_score(y_test_argmax, y_pred_argmax, average='micro'))

              precision    recall  f1-score   support

           1      0.547     0.400     0.462       395
           2      0.642     0.599     0.620      1360
           3      0.455     0.091     0.152       110
           4      0.537     0.201     0.292       399
           5      0.750     0.208     0.326        72
           6      0.681     0.425     0.524       261
           7      0.622     0.295     0.400        78
           8      0.651     0.390     0.487       249
           9      0.882     0.349     0.500        43
          10      0.671     0.449     0.538       127

   micro avg      0.627     0.446     0.522      3094
   macro avg      0.644     0.341     0.430      3094
weighted avg      0.620     0.446     0.505      3094

f1 score: 0.7834388603086664
