**Library**

In [None]:
# Mount Google Drive (Colab only); the cells below read their data files
# from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

**Import Data**

In [None]:
import csv
import pandas as pd

# Load the training split into a DataFrame.
d = pd.read_csv('/content/drive/My Drive/Data/training_split.csv')
# Values of the `tweet` column.
# NOTE(review): `x` is reassigned to the padded token sequences in the
# Tokenizer cell, so this value is not used by the visible downstream cells.
x = d['tweet'].values

# Display the loaded frame as the cell output.
d

**Pre Processing**

In [None]:
import json

# Load the pre-processing results that were saved as JSON
with open('/content/drive/My Drive/Data/data_prepro.json', 'r') as js:
  preprocessing = json.loads(js.read())
print(len(preprocessing))

In [None]:
# Flatten one nesting level of `preprocessing` into a single list.
words = []
for sublist in preprocessing:
  for item in sublist:
    words.append(item)

print(len(words))
print(words[0])
print(words[0][0])

In [None]:
# Length passed as `maxlen` to pad_sequences and as `input_length` to the
# Embedding layer in the cells below.
max_len = 280

**Tokenizer**

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence

# Fit the tokenizer on the flattened data, then encode and post-pad in one step.
tk = Tokenizer()
tk.fit_on_texts(words)
x = sequence.pad_sequences(
    tk.texts_to_sequences(words),
    maxlen=max_len,
    padding='post',
)

# Optional persistence of the fitted tokenizer (kept disabled):
# pickle.dump(tk, open('/content/drive/My Drive/Data/tk.pk','wb'))

# file_ = open('/content/drive/My Drive/Data/tk.pk','rb')
# tk = pickle.load(file_)
# file_.close()

# `max_word` is the tokenizer's word -> index mapping; print its size.
max_word = tk.word_index
print(len(max_word))

**Word2Vec**

In [None]:
from gensim.models import Word2Vec, KeyedVectors, FastText
from gensim.test.utils import get_tmpfile

# Disabled training step: trains a 300-dimensional Word2Vec model on `words`
# and saves it in word2vec text format to the path loaded below.
# path = get_tmpfile('/content/drive/My Drive/Data/word2vec_split.model')
# model = Word2Vec(words, size=300)
# model.wv.save_word2vec_format('/content/drive/My Drive/Data/word2vec_split.model', binary=False)

# Load the saved vectors (text format, hence binary=False) as KeyedVectors.
vec = KeyedVectors.load_word2vec_format('/content/drive/My Drive/Data/word2vec_split.model', binary=False)

# Debug helper for inspecting the loaded vocabulary (disabled).
# vec.wv.vocab

In [None]:
def word_embedding(vocabulary_size,vec_dim,vec,tk):
  """Build an embedding matrix aligned with the tokenizer's word indices.

  Args:
    vocabulary_size: number of rows of the matrix.
    vec_dim: number of columns, i.e. the length of each word vector.
    vec: word-vector lookup supporting ``key in vec`` and ``vec[key]``
      (e.g. gensim ``KeyedVectors``).
    tk: fitted tokenizer exposing a ``word_index`` dict (word -> int index).

  Returns:
    A ``(vocabulary_size, vec_dim)`` numpy array. Row ``i`` holds the vector
    of the word whose tokenizer index is ``i``; rows for words that are not
    in ``vec`` (and any index never assigned) stay all-zero.
  """
  matrix_embedding = np.zeros((vocabulary_size,vec_dim))
  for key,val in tk.word_index.items():
    # Indices that do not fit in the matrix are skipped instead of raising
    # IndexError (e.g. when the matrix is capped below the full vocabulary).
    if val >= vocabulary_size:
      continue
    # Words missing from the vector dictionary are ignored. Membership is
    # tested on `vec` itself so it works on gensim 3.x and 4.x alike
    # (`vec.vocab` no longer exists in 4.x).
    if key not in vec:
      continue
    matrix_embedding[val] = vec[key]
  return matrix_embedding

In [None]:
import numpy as np

# Build the embedding matrix holding one word vector per tokenizer index.
# Index 0 is never assigned to a word by the tokenizer, hence the +1; the size
# is taken from `word_index` because that is the mapping word_embedding walks.
num_words = len(tk.word_index) + 1
# Read the dimensionality from the loaded vectors instead of probing a
# hard-coded word through the deprecated `.wv` alias.
embedding_size = vec.vector_size
matrix_embedding = word_embedding(num_words,embedding_size,vec,tk)
print(embedding_size)

In [None]:
# Sanity check: the matrix should have `num_words` rows. Both values are
# printed because a bare expression is only displayed when it is the last
# statement of a cell.
print(matrix_embedding.shape)
print(num_words)

**One Hot Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras import utils as np_utils

# Label column to classify and the number of classes it is encoded into.
kategori = 'ft'
num_classes = 2

# Integer-encode the chosen label column, then one-hot encode the result.
labelencoder_Y = LabelEncoder()
y = np_utils.to_categorical(
    labelencoder_Y.fit_transform(d[kategori]),
    num_classes=num_classes,
)

**Model**

In [None]:
from keras.layers import Input, Dense, Flatten, Dropout, Activation, Embedding, Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf

# Model-facing aliases for the vocabulary size, sequence length and vector size.
max_features, maxlen, embedding_dims = num_words, max_len, embedding_size

def create_model(i, x_train, x_test, y_train, y_test, kernel, dropout, learning_rate, kategori, ke,
                 embedding_weights=None, epochs=20):
  """Build, train, evaluate and save a 1D-CNN text classifier for one fold.

  The network is Embedding -> Conv1D(256) -> GlobalMaxPooling1D -> Dense(64)
  -> Dropout -> Dense(2). It reads the module-level `max_features`,
  `embedding_size` and `max_len` for the embedding and input sizes.

  Args:
    i: fold identifier; used only in the saved file name.
    x_train, x_test: input sequences for training / validation.
    y_train, y_test: targets for training / validation.
    kernel: Conv1D kernel size.
    dropout: dropout rate applied after the hidden Dense layer.
    learning_rate: learning rate for the Adam optimizer.
    kategori: label name; used only in the saved file name.
    ke: run identifier; used only in the saved file name.
    embedding_weights: optional (max_features, embedding_size) array used to
      initialise the Embedding layer, e.g. a pretrained word-vector matrix.
      When None (default) the layer keeps its default random initialisation,
      which is the original behaviour.
    epochs: number of training epochs (default 20, as before).

  Returns:
    Tuple ``(model, train_acc, test_acc)`` with the trained model and the
    accuracy reported by ``model.evaluate`` on each split.
  """
  print('Build model...')
  ke = str(ke)
  i = str(i)

  # Only override the initializer when weights are supplied, so the default
  # call builds exactly the same layer as before.
  embedding_kwargs = {}
  if embedding_weights is not None:
    embedding_kwargs['embeddings_initializer'] = tf.keras.initializers.Constant(embedding_weights)

  # Input length follows `max_len` so it always agrees with the Embedding
  # layer's `input_length` (it used to be a separate hard-coded 280).
  model_input = Input(shape=(max_len,))
  model_embedding = Embedding(max_features, embedding_size, input_length=max_len,
                              **embedding_kwargs)(model_input)
  model_conv = Conv1D(256, kernel_size=kernel, activation='relu')(model_embedding)
  model_pool = GlobalMaxPooling1D()(model_conv)
  model_dense = Dense(64, activation='relu')(model_pool)
  model_dropout = Dropout(dropout)(model_dense)
  # NOTE(review): two sigmoid outputs with binary_crossentropy treat the two
  # columns as independent binary targets, and 'accuracy' is then presumably
  # computed per element. If the targets are one-hot, consider softmax with
  # categorical_crossentropy - confirm before changing, it alters results.
  model_output = Dense(2, activation='sigmoid')(model_dropout)

  model = Model(inputs=model_input, outputs=model_output)

  adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
  model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

  model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=epochs)

  _, train_acc = model.evaluate(x_train, y_train)
  _, test_acc = model.evaluate(x_test, y_test)
  model.save('/content/drive/My Drive/'+kategori+'_'+ke+'_'+i+'.h5')
  return model, train_acc, test_acc

**K-Fold**

In [None]:
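# NOTE: disabled one-off fold generation, kept for reference. It writes the
# Fold/*.pk files under the same names the next cell loads; uncomment to
# regenerate them.
# from sklearn.model_selection import KFold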
# import pickle
 
# kf = KFold(n_splits=10, random_state=7, shuffle=True)
# kf.get_n_splits(x)
 
# KFold(n_splits=10, random_state=7, shuffle=True)
 
# i = 0
 
# for train_index, test_index in kf.split(x):
#   i = i+1
#   x_train, x_test = x[train_index], x[test_index]
#   y_train, y_test = y[train_index], y[test_index]
#   j = str(i)
#   pickle.dump(x_train, open('/content/drive/My Drive/Fold/x_train_'+kategori+'_'+j+'.pk','wb'))
#   pickle.dump(x_test, open('/content/drive/My Drive/Fold/x_test_'+kategori+'_'+j+'.pk','wb'))
#   pickle.dump(y_train, open('/content/drive/My Drive/Fold/y_train_'+kategori+'_'+j+'.pk','wb'))
#   pickle.dump(y_test, open('/content/drive/My Drive/Fold/y_test_'+kategori+'_'+j+'.pk','wb'))
#   print(j)

In [None]:
import pickle


def _load_pickle(path):
  """Load one pickled object and close the file handle afterwards."""
  # NOTE(review): pickle.load can execute arbitrary code from the file; only
  # load fold files that were produced by this project.
  with open(path, 'rb') as handle:
    return pickle.load(handle)


# Train and evaluate one model per pre-split fold (folds are numbered 1..10).
for i in range(0, 10):
  j = str(i + 1)
  print('Fold '+kategori+' ke - '+j)

  # Every fold file shares the same '_<kategori>_<fold>.pk' suffix.
  fold_suffix = '_'+kategori+'_'+j+'.pk'
  x_train = _load_pickle('/content/drive/My Drive/Fold/x_train'+fold_suffix)
  print(len(x_train))
  x_test = _load_pickle('/content/drive/My Drive/Fold/x_test'+fold_suffix)
  y_train = _load_pickle('/content/drive/My Drive/Fold/y_train'+fold_suffix)
  y_test = _load_pickle('/content/drive/My Drive/Fold/y_test'+fold_suffix)

  # Positional hyper-parameters: kernel=3, dropout=0.5, learning_rate=0.00001, ke=4.
  model, train_acc, test_acc = create_model(i+1, x_train, x_test, y_train, y_test, 3, 0.5, 0.00001, kategori, 4)

  print('')
  print('Train: %.4f, Test: %.4f' % (train_acc, test_acc))