In [46]:
# Import Module
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D

In [47]:
# Load the labelled training split.
data_train = pd.read_csv("train.csv")

# Remove the literal "USER" placeholder from the training text.
# Empty CSV cells are parsed as NaN (a float), which has no .replace method, so
# missing values are turned into empty strings first and the replacement is done
# with the vectorized, literal (non-regex) string accessor.
data_train["text"] = (
    data_train["text"].fillna("").astype(str).str.replace("USER", "", regex=False)
)

# Load the test split and preview it before any cleaning is applied.
data_test = pd.read_csv("test.csv")
print(data_test)

# Same "USER" removal for the test text, with the same NaN handling.
data_test["text"] = (
    data_test["text"].fillna("").astype(str).str.replace("USER", "", regex=False)
)

# Delete Hexadecimal in Text Column
import re
def remove_hexadecimals(text):
    r"""Remove literal ``\xNN`` escape sequences from ``text``.

    Each match is a backslash, the letter ``x`` and exactly two hexadecimal
    digits. The quantifier is fixed at two digits so that ordinary text
    directly following an escape is kept even when it begins with characters
    that are also hex digits (for example ``\x82ada`` becomes ``ada``).

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every ``\xNN`` sequence deleted.

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re.sub``).
    """
    return re.sub(r'\\x[0-9a-fA-F]{2}', '', text)
# Run the hex-escape cleanup over the text column of both splits.
data_train["text"] = data_train["text"].apply(remove_hexadecimals)
data_test["text"] = data_test["text"].apply(remove_hexadecimals)

# Training side: model input text and integer target labels.
teks, y = data_train["text"].astype(str), data_train["label"].astype(int)

# Test side: text to predict on, plus the matching row ids.
teks_test, ID = data_test["text"].astype(str), data_test["id"]

         id                                               text
0     11371                 USER USER Nanti orang Hindu marah'
1     12140  USER Pak USER saya ttap mendesak Anda menyatak...
2     11170   BERSIKAP KERASLAH TERHADAP MEREKA?CINA KAFIR URL
3      1265  USER Ganti casing aja ,padahal nyatanya akusis...
4     12098  USER mataku ku sipit sipitin dikit. sekelibet ...
...     ...                                                ...
2629   7390  USER USER USER USER USER USER USER eh dongo, k...
2630   3019  USER USER Kalau saya melihat kok yang kebanget...
2631  13145  USER Goblok, bayangin aja kalo janin itu lu (y...
2632   4603  RT USER USER USER Klop pemberontak sekaligus k...
2633   6116  USER USER Cebong juara satu lari dari kenyataa...

[2634 rows x 2 columns]


In [48]:
# Tokenization: build the vocabulary from the training text, then encode that
# text as lists of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(teks)
sequences = tokenizer.texts_to_sequences(teks)

# Pad sequences to the same length.
# The target length is taken from the encoded sequences themselves rather than
# from a whitespace split of the raw text: the Tokenizer also splits on the
# punctuation it filters out, so "a,b" is one whitespace token but two encoded
# tokens. Sizing from the raw split could make max_len shorter than the longest
# encoded sequence, and pad_sequences would then silently truncate it.
max_len = max(len(seq) for seq in sequences)
x = pad_sequences(sequences, maxlen=max_len)

In [49]:
# Build CNN model: embedding -> conv -> max-pool -> conv -> global max-pool
# -> single sigmoid unit for the binary label.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=100,
        input_length=max_len,
    ),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# NOTE(review): validation_split=0.5 keeps half of the labelled rows out of
# training; confirm this is intentional.
model.fit(
    x,
    y,
    validation_split=0.5,
    batch_size=32,
    epochs=10,
)

# Encode the test text with the tokenizer fitted on the training text and pad
# it to the same length the model was built for.
sequences_test = tokenizer.texts_to_sequences(teks_test)
x_test = pad_sequences(sequences_test, maxlen=max_len)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [50]:
# Prediction: the single sigmoid unit yields one probability per row, i.e. an
# array of shape (n_samples, 1); rounding turns it into hard 0/1 labels.
predictions = model.predict(x_test)
predictions = np.round(predictions).astype(int)

# Percentage of test rows predicted as class 0 and as class 1.
# Comparing the (n_samples, 1) column with [0, 1] broadcasts to an
# (n_samples, 2) boolean matrix, one column per class; the column means are the
# class shares. The result is kept as-is (it was previously overwritten by
# len(predictions) / len(x_test), which is always 1.0).
prediction_percentage = (predictions == [0, 1]).mean(axis=0) * 100
print("Prediksi : ", prediction_percentage)

# Attach the labels as a flat 1-D column rather than an (n_samples, 1) matrix.
data_test['label'] = predictions[:, 0]
print(data_test[['id','text','label']])

Prediksi :  [93.47000759  6.52999241]
         id                                               text  label
0     11371                           Nanti orang Hindu marah'      0
1     12140   Pak  saya ttap mendesak Anda menyatakan Muham...      0
2     11170   BERSIKAP KERASLAH TERHADAP MEREKA?CINA KAFIR URL      0
3      1265   Ganti casing aja ,padahal nyatanya akusisi it...      0
4     12098    mataku ku sipit sipitin dikit. sekelibet mirip'      0
...     ...                                                ...    ...
2629   7390         eh dongo, klo namannya di tackle terus ...      0
2630   3019    Kalau saya melihat kok yang kebangeten itu p...      0
2631  13145   Goblok, bayangin aja kalo janin itu lu (yg ng...      0
2632   4603         RT    Klop pemberontak sekaligus koruptor.      0
2633   6116    Cebong juara satu lari dari kenyataan,kepint...      0

[2634 rows x 3 columns]


In [52]:
# Write the test frame (including the predicted label column) to an Excel
# workbook, leaving out the DataFrame index.
data_test.to_excel(excel_writer='result.xlsx', index=False)