In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
# Load the token-level dataset: one row per word, carrying the sentence id
# ("Kalimat #"), the word itself ("Word") and its tag ("Tag").
df = pd.read_csv(filepath_or_buffer="Dataset/dataset_fix.csv")
# Bare last expression so the notebook renders the (truncated) frame.
df

Unnamed: 0,Kalimat #,Word,Tag
0,0,tiap,O
1,0,makan,O
2,0,kesini,O
3,0,ngga,O
4,0,pernah,O
...,...,...,...
50531,859,portions,I-FOOD
50532,859,way,I-FOOD
50533,859,too,I-FOOD
50534,859,small,I-FOOD


In [3]:
# Map every distinct tag to an integer id, numbered in order of first
# appearance in the "Tag" column.
dic = {label: position for position, label in enumerate(df.Tag.unique())}
dic

{'O': 0,
 'B-FOOD': 1,
 'I-FOOD': 2,
 'B-MISCELLANEOUS': 3,
 'I-MISCELLANEOUS': 4,
 'B-SERVICE': 5,
 'I-SERVICE': 6,
 'B-AMBIENCE': 7,
 'I-AMBIENCE': 8,
 'B-PRICE': 9,
 'I-PRICE': 10}

In [4]:
# df["Tag Encoded"]= df.Tag.apply(lambda x:dic[x])
# df

In [5]:
# Build one list of words (as strings, in row order) per sentence id.
# The frame is grouped once instead of being re-filtered for every sentence id,
# which made the original loop scale with rows x sentences.
words_by_sentence = {
    sentence_id: [str(kata) for kata in words]
    for sentence_id, words in df.groupby("Kalimat #", sort=False)["Word"]
}
first_id = df["Kalimat #"].min()
last_id = df["Kalimat #"].max()
# Ids inside [first_id, last_id] that never occur still contribute an empty
# list, so the position in list_kalimat keeps matching (sentence id - first_id).
list_kalimat = [words_by_sentence.get(i, []) for i in range(first_id, last_id + 1)]

In [6]:
# Turn each sentence's word list back into a single space-separated string.
list_kalimat_join = [" ".join(words) for words in list_kalimat]

In [7]:
# Load the pretrained Word2Vec model and look up one vector per dataset row.
idwiki_100 = Word2Vec.load("Model/idwiki_word2vec_100.model")
keyed_vectors = idwiki_100.wv
list_wv = []
for kata in df["Word"]:
    # Membership is tested on the KeyedVectors object itself rather than on
    # its `.vocab` attribute, which gensim 4 no longer provides.
    if kata in keyed_vectors:
        wv_kata = keyed_vectors[kata]
    else:
        # Out-of-vocabulary words get an all-zero vector of the model's width.
        wv_kata = np.zeros(keyed_vectors.vector_size)
    list_wv.append(wv_kata)

In [8]:
# Store each word vector as a plain Python list in a new "wv" column.
df["wv"] = [vector.tolist() for vector in list_wv]
df.head()

Unnamed: 0,Kalimat #,Word,Tag,wv
0,0,tiap,O,"[1.7496031522750854, -2.08819317817688, 2.4487..."
1,0,makan,O,"[0.15238453447818756, 0.267755389213562, 1.726..."
2,0,kesini,O,"[0.44860780239105225, 0.24110662937164307, -0...."
3,0,ngga,O,"[0.2567923069000244, 0.2274191975593567, 0.010..."
4,0,pernah,O,"[0.9020400047302246, 2.8760569095611572, 0.432..."


In [9]:
from sklearn.preprocessing import OneHotEncoder
# The encoder expects a 2-D input, so the tag column is reshaped to one
# feature column with one row per word.
toEncode = df["Tag"].values.reshape(-1, 1)
# fit() returns the encoder itself, so construction and fitting are chained.
enc = OneHotEncoder(handle_unknown='ignore').fit(toEncode)
# The displayed names give the column order of the one-hot vectors.
# NOTE(review): newer scikit-learn releases expose this as
# get_feature_names_out() - confirm against the installed version.
enc.get_feature_names()

array(['x0_B-AMBIENCE', 'x0_B-FOOD', 'x0_B-MISCELLANEOUS', 'x0_B-PRICE',
       'x0_B-SERVICE', 'x0_I-AMBIENCE', 'x0_I-FOOD', 'x0_I-MISCELLANEOUS',
       'x0_I-PRICE', 'x0_I-SERVICE', 'x0_O'], dtype=object)

In [10]:
# One-hot encode every tag, then densify the sparse result into a 2-D array.
encoded_sparse = enc.transform(toEncode)
Encoded = encoded_sparse.toarray()
Encoded

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [11]:
# Attach each row's one-hot vector to the frame as a "label_encoded" column
# (iterating the 2-D array yields one row array per word).
df["label_encoded"] = list(Encoded)
df.head()

Unnamed: 0,Kalimat #,Word,Tag,wv,label_encoded
0,0,tiap,O,"[1.7496031522750854, -2.08819317817688, 2.4487...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0,makan,O,"[0.15238453447818756, 0.267755389213562, 1.726...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,0,kesini,O,"[0.44860780239105225, 0.24110662937164307, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,0,ngga,O,"[0.2567923069000244, 0.2274191975593567, 0.010...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,0,pernah,O,"[0.9020400047302246, 2.8760569095611572, 0.432...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
# Sanity check: number of rows that received a word vector.
df["wv"].shape[0]

50536

In [13]:
# Features are the per-word vectors, targets the per-word one-hot labels.
X, y = df["wv"].values, df["label_encoded"].values

In [14]:
# Sanity check: number of samples in the feature array.
X.shape[0]

50536

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. The split is over individual word rows, so words of one
# sentence can end up in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1301170066,
)

In [16]:
# Fit a Keras tokenizer on the joined sentences to obtain a word -> index map.
NUM_WORDS = 10000
tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    lower=True,
    oov_token='UNKNOWN',
)
tokenizer.fit_on_texts(list_kalimat_join)
# Unsure about this step; the sequences are not used afterwards anyway.
sequences_train = tokenizer.texts_to_sequences(list_kalimat_join)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 7550 unique tokens.


In [17]:
# word_index.items()

In [18]:
# The split yields object arrays whose elements are per-row lists/arrays;
# rebuild them as dense 2-D float32 arrays.
X_train = np.array([np.asarray(row, dtype='float32') for row in X_train])
y_train = np.array([np.asarray(row, dtype='float32') for row in y_train])

In [19]:
# Report the dimensions of the training features and labels.
for caption, array in (('Shape of X train:', X_train),
                       ('Shape of label train:', y_train)):
    print(caption, array.shape)

Shape of X train: (40428, 100)
Shape of label train: (40428, 11)


In [20]:
# Build an embedding matrix aligned with the tokenizer's word indices and wrap
# it in a trainable Keras Embedding layer.
word_vectors = idwiki_100.wv

EMBEDDING_DIM = 100
# +1 because tokenizer indices start at 1; row 0 stays all-zero.
vocabulary_size = min(len(word_index)+1,NUM_WORDS)
print("Vocabulary size:", vocabulary_size)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

for word, i in word_index.items():
    # Indices beyond the matrix are skipped. Words without a pretrained vector
    # keep their zero row: the previous fallback sampled from N(0, 0), which
    # only ever wrote zeros into an already-zero row.
    if i < vocabulary_size and word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

# Drop the extra reference; the vectors remain reachable through idwiki_100.
del word_vectors

from keras.layers import Embedding
# NOTE(review): the model-building cell has its embedding call commented out,
# so this layer does not appear to be part of the trained model - confirm.
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

Vocabulary size: 7551


In [21]:
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers

# CNN classifier over a single word vector: the vector's components are
# treated as a length-`sequence_length` sequence with one channel.
sequence_length = X_train.shape[1]
# Output width follows the one-hot label vectors instead of a literal 11.
num_classes = y_train.shape[1]
filter_sizes = [3,4,5]
num_filters = 100
drop = 0.5

inputs = Input(shape=(sequence_length, ))

# Embedding-layer variant, currently disabled:
# embedding = embedding_layer(inputs)
# reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)
reshape = Reshape((sequence_length,1))(inputs)

# Three parallel convolutions, one per kernel width.
conv_0 = Conv1D(num_filters, (filter_sizes[0]),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_1 = Conv1D(num_filters, (filter_sizes[1]),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_2 = Conv1D(num_filters, (filter_sizes[2]),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)

# The pool size equals each convolution's output length, so every branch is
# reduced to a single position holding the per-filter maximum.
maxpool_0 = MaxPooling1D((sequence_length - filter_sizes[0] + 1), strides=(1))(conv_0)
maxpool_1 = MaxPooling1D((sequence_length - filter_sizes[1] + 1), strides=(1))(conv_1)
maxpool_2 = MaxPooling1D((sequence_length - filter_sizes[2] + 1), strides=(1))(conv_2)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
# Flattening already yields len(filter_sizes) * num_filters features. The
# former extra Reshape((11*num_filters,)) was removed: its result was never
# used and its target size did not match the flattened features.
flatten = Flatten()(merged_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=num_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

model = Model(inputs, output)

In [22]:
# Configure training: Adam optimizer, categorical cross-entropy for the
# one-hot targets, accuracy as the reported metric.
# `learning_rate` is the current argument name; `lr` is a deprecated alias.
adam = Adam(learning_rate=1e-3)

model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])

# NOTE(review): this monitors *training* accuracy, and with patience=10 the
# callback cannot fire during the 10-epoch fit used in this notebook.
# Consider monitoring a validation metric with a smaller patience.
callbacks = [EarlyStopping(monitor='acc', patience=10)]

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
reshape (Reshape)               (None, 100, 1)       0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 98, 100)      400         reshape[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 97, 100)      500         reshape[0][0]                    
______________________________________________________________________________________________

In [23]:
# Train on the training partition; the returned History object is the cell's
# displayed value.
model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=10,
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x24e3c7514c8>

In [25]:
# Rebuild the held-out features as a dense float32 array (same treatment as
# the training features), then predict class probabilities for them.
X_test = np.array([np.asarray(row, dtype='float32') for row in X_test])
# y_test = np.array([np.array(x).astype('float32') for x in y_test])
y_pred = model.predict(X_test)

In [38]:
# Spot check: the one-hot ground-truth label of held-out sample 1000.
y_test[1000]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])

In [39]:
# Spot check: the model's softmax output for the same held-out sample 1000,
# to compare against its ground-truth label.
y_pred[1000]

array([0.00810993, 0.04459351, 0.01231202, 0.00523666, 0.00821869,
       0.02503779, 0.12719195, 0.04573906, 0.01367501, 0.02012711,
       0.6897583 ], dtype=float32)