In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

import gensim
from gensim.models import Word2Vec

In [2]:
df = pd.read_csv("Dataset/dataset_fix.csv")
df

Unnamed: 0,Kalimat #,Word,Tag
0,0,tiap,O
1,0,makan,O
2,0,kesini,O
3,0,ngga,O
4,0,pernah,O
...,...,...,...
50531,859,portions,I-FOOD
50532,859,way,I-FOOD
50533,859,too,I-FOOD
50534,859,small,I-FOOD


In [3]:
dic={}
for i, tag in enumerate(df.Tag.unique()):
    dic[tag] = i
dic

{'O': 0,
 'B-FOOD': 1,
 'I-FOOD': 2,
 'B-MISCELLANEOUS': 3,
 'I-MISCELLANEOUS': 4,
 'B-SERVICE': 5,
 'I-SERVICE': 6,
 'B-AMBIENCE': 7,
 'I-AMBIENCE': 8,
 'B-PRICE': 9,
 'I-PRICE': 10}

In [4]:
labels = df["Tag"].apply(lambda x:dic[x])
labels

0        0
1        0
2        0
3        0
4        0
        ..
50531    2
50532    2
50533    2
50534    2
50535    0
Name: Tag, Length: 50536, dtype: int64

In [5]:
list_kalimat = []
for i in range(df["Kalimat #"].min(),df["Kalimat #"].max()+1):
    list_kata = ["<S>"]
    for kata in df[df["Kalimat #"] == i]["Word"]:
        list_kata.append(str(kata))
    list_kata.append("</S>")
    list_kalimat.append(list_kata)

In [6]:
list_kalimat[0]

['<S>',
 'tiap',
 'makan',
 'kesini',
 'ngga',
 'pernah',
 'cuma',
 '1',
 'atau',
 '2',
 'porsi',
 'pasti',
 'nambah',
 'terus',
 'karena',
 'emang',
 'pas',
 'banget',
 'rasanya',
 'di',
 'lidah',
 'black',
 'peppernya',
 'yang',
 'paling',
 'enak',
 'disini',
 'emang',
 'selalu',
 'waiting',
 'list',
 'cuma',
 'sei',
 'sapi',
 'lamalera',
 'absolutely',
 'worth',
 'to',
 'wait',
 'recommended',
 '</S>']

In [7]:
list_kalimat_join = []
for kalimat in list_kalimat:
    list_kalimat_join.append(" ".join(kalimat))

In [8]:
list_kalimat_join[0]

'<S> tiap makan kesini ngga pernah cuma 1 atau 2 porsi pasti nambah terus karena emang pas banget rasanya di lidah black peppernya yang paling enak disini emang selalu waiting list cuma sei sapi lamalera absolutely worth to wait recommended </S>'

In [9]:
trigram = []
for kalimat in (list_kalimat):
    for i in range(len(kalimat)):
        if i > 0 and i < len(kalimat)-1:
            trigram.append([kalimat[i-1], kalimat[i], kalimat[i+1]])

In [10]:
idwiki_300 = Word2Vec.load("Model/idwiki_word2vec_300.model")

In [11]:
print("Corpus Count before Update =", idwiki_300.corpus_count)
print("Vocab before Update =", len(idwiki_300.wv.vocab))

Corpus Count before Update = 348902
Vocab before Update = 331792


In [12]:
idwiki_300.build_vocab(list_kalimat, update=True)
idwiki_300.train(list_kalimat, total_examples=idwiki_300.corpus_count, epochs=10)

(415872, 522560)

In [13]:
print("Corpus Count after Update =", idwiki_300.corpus_count)
print("Vocab after Update =", len(idwiki_300.wv.vocab))

Corpus Count after Update = 860
Vocab after Update = 331916


In [14]:
oov_dict = {}
X = []
OOV = 0
for i in range(len(trigram)):
    wv_trigram = []
    for j in range(len(trigram[i])):
        try:
            wv_trigram = wv_trigram + list(idwiki_300.wv[trigram[i][j]])
        except KeyError:
            OOV = OOV + 1
            if trigram[i][j] not in oov_dict.keys():
                oov_dict[trigram[i][j]] = np.random.normal(0,np.sqrt(0.25),300)
            wv_trigram = wv_trigram + list(oov_dict[trigram[i][j]])
    X.append(wv_trigram)

In [15]:
OOV

8672

In [16]:
X = np.array([np.array(x).astype('float32') for x in X])

In [17]:
X.shape

(50536, 900)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=1301170066, shuffle=True, stratify=labels.values)

In [19]:
X_train = np.array([np.array(x).astype('float32') for x in X_train])
y_train = np.array([np.array(x).astype('float32') for x in y_train])

X_test = np.array([np.array(x).astype('float32') for x in X_test])
y_test = np.array([np.array(x).astype('float32') for x in y_test])

In [20]:
print('Shape of X train:', X_train.shape)
print('Shape of label train:', y_train.shape)

print('Shape of X test:', X_test.shape)
print('Shape of label test:', y_test.shape)

Shape of X train: (40428, 900)
Shape of label train: (40428,)
Shape of X test: (10108, 900)
Shape of label test: (10108,)


In [21]:
svclassifier = SVC()
# ovr = OneVsRestClassifier(svclassifier)
# random_grid = {'C': [.0001, .001, .01],
#                   'kernel': ['linear', 'rbf', 'poly'],
#                   'gamma': [.0001, .001, .01, .1, 1, 10, 100],
#                   'degree': [1, 2, 3, 4, 5],
#                   'probability': [True]
#                  }
# random_search = RandomizedSearchCV(estimator=svclassifier,
#                                    param_distributions=random_grid,
#                                    n_iter=50,
#                                    scoring='accuracy',
#                                    cv=3, 
#                                    verbose=1, 
#                                    random_state=8)
# random_search.fit(X_train, y_train)
svclassifier.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [None]:
y_pred = svclassifier.predict(X_test)

In [22]:
from sklearn.metrics import classification_report, make_scorer, f1_score
print(classification_report(
    y_test, y_pred, labels=[1.0 ,2.0 ,3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], digits=3))
print("f1 score:", f1_score(y_test, y_pred, average='micro'))

              precision    recall  f1-score   support

         1.0      0.744     0.081     0.146       395
         2.0      0.636     0.296     0.404      1360
         3.0      0.333     0.009     0.018       110
         4.0      0.500     0.035     0.066       399
         5.0      0.750     0.208     0.326        72
         6.0      0.769     0.115     0.200       261
         7.0      0.533     0.205     0.296        78
         8.0      0.613     0.153     0.244       249
         9.0      0.720     0.419     0.529        43
        10.0      0.714     0.236     0.355       127

   micro avg      0.645     0.193     0.297      3094
   macro avg      0.631     0.176     0.258      3094
weighted avg      0.635     0.193     0.279      3094

f1 score: 0.7370399683419074
