In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from keras.callbacks import EarlyStopping
from keras_preprocessing.sequence import pad_sequences

from keras.models import load_model

## Load Data

In [2]:
df = pd.read_csv('D:\kuliah\THE ONLY TA THINGS\DATA\data clean\data_percobaan_5000.csv')
df

Unnamed: 0.1,Unnamed: 0,content,stemming,label
0,0,Boros banget,"['boros', 'banget']",-1
1,1,Kok punya saya tidak bisa di update ke fitur t...,"['baru', 'fitur', 'baru', 'jaring', 'bagus', '...",-1
2,2,Kenapa pas aku pengen cari vidio gak bisa ya? ...,"['pas', 'pengin', 'cari', 'video', 'alami', 'n...",0
3,3,Kenapa suara vidio yg saya upload di hapus pad...,"['suara', 'video', 'unggah', 'hapus', 'ambil',...",-1
4,4,kasih bintang 3 dulu soalnya blom ada tanda live,"['kasih', 'bintang', 'tanda', 'live']",0
...,...,...,...,...
3216,4976,Mantab,['mantap'],1
3217,4984,Aplikasi buruk. Pemecah belah bangsa. Pencuri ...,"['aplikasi', 'buruk', 'pecah', 'belah', 'bangs...",-1
3218,4990,Ih tiktokku yang shopnya kok ga ada si pdhl ud...,"['ih', 'tiktokku', 'shopnya', 'baru', 'banget'...",0
3219,4991,Mantap,['mantap'],1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3221 non-null   int64 
 1   content     3221 non-null   object
 2   stemming    3221 non-null   object
 3   label       3221 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 100.8+ KB


In [4]:
df['label'].value_counts()

label
 1    1328
-1    1264
 0     629
Name: count, dtype: int64

In [15]:
# Split the data into features and labels
df['clean'] = df['stemming'].astype(str)
review = df['clean'].values

label = pd.get_dummies(df['label'], prefix='label').astype(int)
# label = df['label'].values



### split data test and train

In [6]:
# Split the data into training and testing sets

train_texts, test_texts, train_labels, test_labels = train_test_split(
    review, label, test_size=0.3, random_state=42)

In [7]:
# Extract features using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(train_texts).toarray()
X_test_tfidf = vectorizer.transform(test_texts).toarray()

In [8]:
max_length = max([len(i) for i in review])
max_length

595

In [11]:
print("X_train tfidf shape: ", X_train_tfidf.shape)
print("X_test tfidf shape: ", X_test_tfidf.shape)
print("y_train shape: ", train_labels.shape)
print("y_test shape: ", test_labels.shape)

X_train tfidf shape:  (2254, 2174)
X_test tfidf shape:  (967, 2174)
y_train shape:  (2254, 3)
y_test shape:  (967, 3)


In [None]:
# Pad sequences to have equal length
X_train_padded = pad_sequences(X_train_tfidf, maxlen=100, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_tfidf, maxlen=100, padding='post', truncating='post')


In [12]:
# Define the model
model = Sequential([
    Embedding(10000, 64, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

In [13]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# Train the model
history = model.fit(X_train_tfidf, train_labels, epochs=10, batch_size=32, validation_data=(X_test_tfidf, test_labels))

Epoch 1/10


ValueError: in user code:

    File "c:\Users\Madluke\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Madluke\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Madluke\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Madluke\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\engine\training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\Madluke\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\Madluke\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 100), found shape=(None, 2174)


In [None]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test_padded, test_labels)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
# Predict the labels for the testing data
label_pred_prob = model.predict(X_test_padded)
label_pred = label_pred_prob.argmax(axis=1)
label_test_one_dim = np.argmax(test_labels, axis=1)


print(confusion_matrix(label_test_one_dim, label_pred))
print(classification_report(label_test_one_dim, label_pred))
print("Precision: ",precision_score(label_test_one_dim, label_pred, average="macro"))
print("Recall: ",recall_score(label_test_one_dim, label_pred, average="macro"))
print("F1: ",f1_score(label_test_one_dim, label_pred,  average="macro"))
print("Accuracy: ",accuracy_score(label_test_one_dim, label_pred))

In [None]:
label_pred