# Deep Learning for Text Classification using Word2Vec

## Import Libraries

In [10]:
import pandas as pd
import re
import nltk
import numpy as np
import gensim
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from tensorflow.python.keras import models, layers, backend as K

## Import Data

In [11]:
train = pd.read_csv("../data_worthcheck/train.csv")
test = pd.read_csv("../data_worthcheck/test.csv")
dev = pd.read_csv("../data_worthcheck/dev.csv")

X_dev = dev['text_a'].values
y_dev = dev['label'].values

X_train = train['text_a'].values
y_train = train['label'].values

X_test = test['text_a'].values
y_test = test['label'].values

## Functions

In [12]:
def clean_data(text):
    normal_tw = text
    normal_tw = text.lower()
    normal_tw = re.sub(r'\\x.{2}', '', normal_tw)
    normal_tw = re.sub(r'((www\.[^\s]*)|(https?://[^\s]*))', '', normal_tw)
    normal_tw = normal_tw.strip()
    normal_tw = re.sub(r'@[^\s]+', '', normal_tw)
    normal_tw = re.sub(r'#[^\s]+', '', normal_tw)
    normal_tw = re.sub(r'\d+', ' ', normal_tw) 
    normal_tw = re.sub(r'^nan$', '', normal_tw) 
    normal_tw = re.sub(r'[_]+', '', normal_tw)
    normal_tw =  re.sub(r'[Ã°Âã¯¹¢²ðƒâ]', '', normal_tw) 
    normal_regex = re.compile(r"(.)\1{1,}")
    normal_tw = normal_regex.sub(r"\1\1", normal_tw)
    normal_tw = re.sub(r'\s+', ' ', normal_tw)
    normal_tw = re.sub(r'[^\w\s]', '', normal_tw) 
    normal_tw = re.sub(r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', '', normal_tw)
    
    return normal_tw

stopwords_list = set(StopWordRemoverFactory().get_stop_words())

def tokenize_text(text):
    return nltk.word_tokenize(text)

def remove_stopwords(tokenized_text):
    # token = nltk.word_tokenize(text)
    token_afterremoval = []
    for k in tokenized_text:
        if k not in stopwords_list: 
            token_afterremoval.append(k)

    str_clean = ' '.join(token_afterremoval)
    return token_afterremoval

def preprocess(text):
    processed_text = clean_data(text)
    # processed_text = processed_text.lower()
    processed_text = tokenize_text(processed_text)
    processed_text = remove_stopwords(processed_text)
    return processed_text

def find_max_length(data):
    max = len(data[0])
    index = 0
    for i in range(len(data)):
        if max < len(data[i]):
            max = len(data[i])
            
    return max

def pad_sequence(arr, max_length):
    for i in range(len(arr)):
        arr[i] = np.asarray(arr[i])
        for k in range(len(arr[i])):
            arr[i][k] = float(arr[i][k])
        for j in range(len(arr[i]), max_length):
            # arr[i].append(0.0)
            arr[i] = np.append(arr[i], 0.0)
            # np.concatenate(arr[i], np.array([0]))
    return arr

## Preprocessing

In [13]:
for i in range(len(X_dev)):
    X_dev[i] = preprocess(X_dev[i])

for j in range(len(X_train)):
    X_train[j] = preprocess(X_train[j])

for k in range(len(X_test)):
    X_test[k] = preprocess(X_test[k])

## Features Extraction

In [14]:
MAX_COLUMN_LENGTH = find_max_length(X_train)
MAX_COLUMN_LENGTH = 100
VECTOR_SIZE = 100

In [15]:
w2c = gensim.models.word2vec.Word2Vec(sentences=X_train, min_count=1, vector_size=VECTOR_SIZE)

for i in range(len(X_train)):
    for j in range(len(X_train[i])):
        X_train[i][j] = w2c.wv.key_to_index[X_train[i][j]]

X_train = pad_sequence(X_train, MAX_COLUMN_LENGTH)

dic_vocab = w2c.wv.key_to_index

embeddings = np.zeros((len(dic_vocab) + 1, VECTOR_SIZE))
# print(dic_vocab.items())
for word, idx in dic_vocab.items():
    # print(w2c.wv[word])
    try:
        embeddings[idx] =  w2c.wv[word]
    # if word not in model then skip and the row stays all 0s
    except:
        pass

{'corona': 0,
 'nya': 1,
 't': 2,
 'co': 3,
 'https': 4,
 'yg': 5,
 'covid': 6,
 'virus': 7,
 'depok': 8,
 'psbb': 9,
 'distancing': 10,
 'orang': 11,
 'ga': 12,
 'indonesia': 13,
 'aja': 14,
 'presiden': 15,
 'physical': 16,
 'menkes': 17,
 'normal': 18,
 'gak': 19,
 'jakarta': 20,
 'masker': 21,
 'new': 22,
 'kalo': 23,
 'pemerintah': 24,
 'udah': 25,
 'jokowi': 26,
 'ku': 27,
 'positif': 28,
 'warga': 29,
 'kena': 30,
 'menteri': 31,
 'kota': 32,
 'rumah': 33,
 'kesehatan': 34,
 'masyarakat': 35,
 'sih': 36,
 'gy': 37,
 'amp': 38,
 'masuk': 39,
 'negara': 40,
 'gubernur': 41,
 'semoga': 42,
 'tp': 43,
 'gue': 44,
 'penyebaran': 45,
 'pasien': 46,
 'rakyat': 47,
 'org': 48,
 'dr': 49,
 'pake': 50,
 'ramadhan': 51,
 'lt': 52,
 'kerja': 53,
 'pandemi': 54,
 'provinsi': 55,
 'dki': 56,
 'bikin': 57,
 'berita': 58,
 'wabah': 59,
 'tau': 60,
 'takut': 61,
 'salah': 62,
 'banget': 63,
 'gw': 64,
 'bersatulawancovid': 65,
 'gt': 66,
 'anak': 67,
 'jd': 68,
 'sikm': 69,
 'dah': 70,
 'biar': 

## Text Classification

In [16]:
x_in = layers.Input(shape=(MAX_COLUMN_LENGTH,))
x = layers.Embedding(input_dim=embeddings.shape[0], 
                    output_dim=embeddings.shape[1], 
                    weights=[embeddings], 
                    input_length=MAX_COLUMN_LENGTH,
                    trainable=False) (x_in)

x = layers.LSTM(units=15, dropout=0.2, return_sequences=True)(x)
                         
x = layers.LSTM(units=15, dropout=0.2)(x)

x = layers.Dense(64, activation='relu')(x)
y_out = layers.Dense(2, activation='softmax')(x)

model = models.Model(x_in, y_out)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

dic_y_mapping = {n:label for n,label in enumerate(np.unique(y_train))}
print(dic_y_mapping)
inverse_dic = {v:k for k,v in dic_y_mapping.items()}
print(inverse_dic)
y = np.array([inverse_dic[y] for y in y_train])
print(y)

X_train = np.array([np.array(val) for val in X_train])

training = model.fit(x=X_train, y=y, batch_size=256, 
                     epochs=10, shuffle=True, verbose=0, 
                     validation_split=0.3)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 595)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 595, 300)          13387200  
_________________________________________________________________
lstm (LSTM)                  (None, 595, 15)           18960     
_________________________________________________________________
lstm_1 (LSTM)                (None, 15)                1860      
_________________________________________________________________
dense (Dense)                (None, 64)                1024      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 13,409,239
Trainable params: 22,039
Non-trainable params: 13,387,200
____________________________________________

In [22]:
X_train

array([[5.4920e+03, 1.2700e+02, 5.2300e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.5700e+02, 8.0500e+02, 5.7640e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.4900e+02, 8.7600e+02, 1.1200e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [7.0000e+00, 0.0000e+00, 1.9451e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.8400e+02, 1.9456e+04, 1.8800e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.2600e+02, 2.8900e+02, 4.9100e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]])

array([array(['5492', 'dajal', 'ga', 'depok', 'bang'], dtype='<U5'),
       array(['detikcom', 'untung', 'depok', 'masuk', 'wilayah', 'nya', 'ridwan',
              'kamil', 'kalo', 'masuk', 'wilayah', 'nya', 'anis', 'abis', 'lu',
              'bully', 'ama', 'buzzer', 'kolam'], dtype='<U8')                   ,
       array(['df', 'dom', 'jakarta', 'depok', 'yg', 'gunain', 'vc', 'cabang',
              'nya', 'cabang', 'yg', 'tercantum', 'pas', 'kesana', 'gabisa',
              'bayar', 'pake', 'shopeepay'], dtype='<U9')                     ,
       ...,
       array(['terawan', 'menyebut', 'virus', 'corona', 'indonesia',
              'terdeteksi', 'minggu', 'kemarin', 'mendengar', 'berita',
              'pemerintah', 'langsung', 'pemeriksaan', 'covid', 'indonesia'],
             dtype='<U11')                                                   ,
       array(['realffk', 'buhari', 'can', 't', 'pronounce', 'corona', 'virus'],
             dtype='<U9')                                   

In [21]:
for i in range(len(X_test)):
    for j in range(len(X_test[i])):
        X_test[i][j] = w2c.wv.key_to_index[X_test[i][j]]
X_test = pad_sequence(X_test, MAX_COLUMN_LENGTH)
X_test = np.array([np.array(val) for val in X_test])

predictions = model.predict(X_test)

KeyError: 'dajal'