In [9]:
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l1
from tensorflow.keras.regularizers import l2
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split
import pickle

import re

# Import Data

In [18]:
# Read the tab-separated training file. header=None means no row is treated
# as a header, so pandas labels the columns with the integers 0 and 1.
data = pd.read_csv('train_preprocess.tsv.txt', header=None, sep='\t')

# Give the two positional columns descriptive names.
column_names = {0: 'text', 1: 'label'}
df = data.rename(columns=column_names)
df.head()

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


# CLEANSING DATA

In [21]:
# One-hot encode the label column (one indicator column per distinct label)
# and swap it in for the original 'label' column.
category = pd.get_dummies(df.label)
df_baru = pd.concat([df, category], axis=1)
df_baru = df_baru.drop(columns='label')

# Replace embedded newlines with a space, then lower-case the text.
# Series.replace('\n', ' ') only replaces cells whose *entire* value is '\n';
# the .str accessor is required for a substring replacement.
df_baru['text'] = df_baru['text'].str.replace('\n', ' ', regex=False).str.lower()

def fun(x):
    """Return `x` with every character removed except ASCII letters, digits, '.' and space."""
    return re.sub(r'[^a-zA-Z0-9. ]', '', x)

# Strip every character outside [a-zA-Z0-9. ] from each text.
df_baru['text_new'] = df_baru['text'].apply(fun)

# Collapse every run of two or more spaces into a single space.
# Series.replace('  ', ' ') would only match cells whose whole value is the
# run of spaces, so the substring replacement has to go through .str.replace.
df_baru['text_new'] = df_baru['text_new'].str.replace(r' {2,}', ' ', regex=True)

df_baru.head()

Unnamed: 0,text,negative,neutral,positive,text_new
0,warung ini dimiliki oleh pengusaha pabrik tahu...,0,0,1,warung ini dimiliki oleh pengusaha pabrik tahu...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,0,1,0,mohon ulama lurus dan k212 mmbri hujjah partai...
2,lokasi strategis di jalan sumatera bandung . t...,0,0,1,lokasi strategis di jalan sumatera bandung . t...
3,betapa bahagia nya diri ini saat unboxing pake...,0,0,1,betapa bahagia nya diri ini saat unboxing pake...
4,duh . jadi mahasiswa jangan sombong dong . kas...,1,0,0,duh . jadi mahasiswa jangan sombong dong . kas...


# NAMING VARIABLES X AND Y

In [24]:
# Features: the cleaned text, as a NumPy array of strings.
x = df_baru['text_new'].to_numpy()

# Targets: the three one-hot label columns, in a fixed order.
label_columns = ['negative', 'neutral', 'positive']
y = df_baru[label_columns].to_numpy()

In [26]:
# Build the word index from the cleaned texts. num_words caps the vocabulary
# used when converting texts to ids; 'x' is the out-of-vocabulary token.
tokenizer = Tokenizer(oov_token='x', num_words=5000)
tokenizer.fit_on_texts(x)

# Turn each text into a list of word ids, then pad them to a common length
# (no maxlen is given, so pad_sequences uses the longest sequence).
sekuens_x = tokenizer.texts_to_sequences(x)
padded_x = pad_sequences(sequences=sekuens_x)

In [28]:
# Persist the fitted tokenizer. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open("feature_New_lstm.sav", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# SPLIT TRAIN, VALIDATION AND TEST

In [31]:
# First hold out 20% of all samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    padded_x,
    y,
    random_state=4,
    test_size=0.2,
)

# Then hold out 20% of the remaining samples as the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    random_state=4,
    test_size=0.2,
)

# MODEL LSTM

In [34]:
%%time

# Sequence classifier: embedding -> LSTM -> small dense layer -> 3-way softmax.
# The embedding table is sized from the tokenizer: with num_words set,
# texts_to_sequences only emits ids below num_words (0 is the padding value),
# so a larger hard-coded input_dim only adds rows that are never looked up.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=tokenizer.num_words, output_dim=128),
    tf.keras.layers.LSTM(32, dropout=0.2),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as a monitored metric exceeds a threshold.

    Args:
        threshold: Training stops at the end of the first epoch whose
            monitored value is strictly greater than this number.
        monitor: Key to read from the per-epoch `logs` dict.
    """

    def __init__(self, threshold=0.75, monitor='val_categorical_accuracy'):
        super().__init__()
        self.threshold = threshold
        self.monitor = monitor

    def on_epoch_end(self, epoch, logs=None):
        """Set `model.stop_training` when the monitored metric passes the threshold."""
        # `logs` defaults to None rather than a shared mutable dict.
        current = (logs or {}).get(self.monitor)
        if current is None:
            # Metric not reported (e.g. no validation data): nothing to compare,
            # and `None > float` would raise TypeError.
            return
        if current > self.threshold:
            self.model.stop_training = True

# Early-stopping callback instance passed to fit() below.
callbacks = myCallback()

# num_epochs is only an upper bound; the callback may end training earlier.
num_epochs = 1000
optimizer = keras.optimizers.Adam(learning_rate=0.001)

model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

fit = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=num_epochs,
    batch_size=16,
    callbacks=[callbacks],
)

Epoch 1/1000
[1m440/440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 140ms/step - categorical_accuracy: 0.6027 - loss: 0.8086 - val_categorical_accuracy: 0.8352 - val_loss: 0.4488
CPU times: total: 3min 17s
Wall time: 1min 8s


In [36]:
# Print the layer-by-layer summary of the model.
model.summary()

In [38]:
# Persist the trained model. The context manager guarantees the file is
# flushed and closed even if pickling raises.
# NOTE(review): Keras' native `model.save(...)` format is the usual choice for
# models; pickle is kept here so existing loaders of this file keep working.
with open("model_lstm.sav", 'wb') as model_file:
    pickle.dump(model, model_file)

# ACCURACY

In [43]:
# Evaluate on each split; with a single compiled metric, evaluate() yields
# the loss followed by the categorical accuracy.
train_loss, train_acc = model.evaluate(x=x_train, y=y_train)
val_loss, val_acc = model.evaluate(x=x_val, y=y_val)
test_loss, test_acc = model.evaluate(x=x_test, y=y_test)

# One-line accuracy summary across the three splits.
summary_template = '\nTrain: %.3f, val: %.3f, test: %.3f,'
print(summary_template % (train_acc, val_acc, test_acc))

[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - categorical_accuracy: 0.8463 - loss: 0.4219
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - categorical_accuracy: 0.8262 - loss: 0.4606
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - categorical_accuracy: 0.8289 - loss: 0.4578

Train: 0.855, val: 0.835, test: 0.823,
