In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns

<1> Load train, test dataset

In [None]:
# Directory holding the AG News CSV files; both splits are read from here.
DATA_DIR = "../input/ag-news-classification-dataset"

test_text = pd.read_csv(f"{DATA_DIR}/test.csv")
train_text = pd.read_csv(f"{DATA_DIR}/train.csv")

print(f"shape of train dataset >> {train_text.shape}")
print(f"shape of test dataset >> {test_text.shape}")

# A cell renders only its last bare expression, so both samples are displayed
# explicitly; otherwise the test sample would be computed and thrown away.
display(test_text.sample(5))
display(train_text.sample(5))

<2> split contents and label

In [None]:
# Readable topic name for each class index (1 to 4).
index_to_label = dict(
    enumerate(('World', 'Sports', 'Business', 'Sci/Tech'), start=1)
)

def return_dataset(dataset):
    """Split a raw news frame into text inputs and one-hot targets.

    Args:
        dataset: DataFrame with "Class Index", "Title" and "Description" columns.

    Returns:
        (data, label): `data` is a Series holding "<Title> <Description>" for
        every row; `label` is a float32 one-hot DataFrame with one column per
        distinct class index present in `dataset`.
    """
    # Pin the dtype: the get_dummies default depends on the installed pandas
    # version (bool in pandas 2.x), and these targets are later fed to a
    # categorical cross-entropy loss, which works on floats.
    label = pd.get_dummies(dataset["Class Index"], dtype="float32")
    # Title and description are joined into one text so both get tokenised.
    data = dataset["Title"] + " " + dataset["Description"]
    print(f"shape of data >> {data.shape}")
    print(f"shape of label >> {label.shape}\n")
    return data,label

# Build the (text, one-hot label) pair for the train split first, then the test split.
(train_data, train_label), (test_data, test_label) = (
    return_dataset(split) for split in (train_text, test_text)
)

In [None]:
sns.set_style("whitegrid")

# Fixed bar order (by class index) so the two panels line up.
class_order = [index_to_label[i] for i in sorted(index_to_label)]

fig, (axe1, axe2) = plt.subplots(1, 2, figsize=(8, 8))

# Count the original class column (mapped to topic names) rather than the
# one-hot frame: every one-hot column holds a value for every sample, so
# counting per column cannot show how many samples each class has.
for axe, frame, title in (
    (axe1, train_text, "train dataset"),
    (axe2, test_text, "test dataset"),
):
    class_names = frame["Class Index"].map(index_to_label)
    sns.countplot(x=class_names, order=class_order, ax=axe)
    axe.set_title(title)
    # The topic names label the ticks; the axis labels name the quantities.
    axe.set_xlabel("class")
    axe.set_ylabel("number of samples")

plt.tight_layout()
plt.show()

<3> Preprocess text data (tokenize then apply padding) 

In [None]:
from keras.preprocessing.text import Tokenizer

# Vocabulary limit for the tokenizer; the embedding layers use vocab_size,
# i.e. one extra slot so that index 0 (never assigned to a word) stays valid.
word_size = 999
vocab_size = word_size+1 #1000

# Fit once: num_words only restricts texts_to_sequences, while word_index
# always lists every word seen, so no second unrestricted tokenizer is needed
# just to report the total vocabulary size.
tok = Tokenizer(num_words=word_size)
tok.fit_on_texts(train_data)
print(f"numbers of words used >> {len(tok.word_index)}")

word_index = tok.word_index
index_word = tok.index_word

# NOTE: train_data/test_data are rebound from raw text to lists of token ids,
# so this cell must not be re-run without re-running the cell that builds them.
train_data = tok.texts_to_sequences(train_data)
test_data = tok.texts_to_sequences(test_data)

print("First two samples")
print(train_data[0])
print(train_data[1])

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Measure every tokenised sample once and reuse the lengths for all statistics.
lens = [len(s) for s in train_data]

print(f"maximum >> {np.max(lens)}")
print(f"minimum >> {np.min(lens)}")
print(f"average >> {np.mean(lens)}")
print(f"median >> {np.median(lens)}\n")

plt.hist(lens,bins=50)
plt.title("tokens per training sample")
plt.xlabel("number of tokens")
plt.ylabel("number of samples")
plt.show()

# Fixed model input length: longer samples are cut and shorter ones padded,
# both at the end (padding='post', truncating='post').
sequence_len = 50

train_data = pad_sequences(train_data,maxlen=sequence_len,padding='post',truncating='post')
test_data = pad_sequences(test_data,maxlen=sequence_len,padding='post',truncating='post')

print("print first two samples")
print(train_data[0])
print(train_data[1])

print("\ntrain data shape >>",train_data.shape)
print("test data shape >>",test_data.shape)

<4> Make models and train and test

<4-1> Model without RNNs

In [None]:
from keras.layers import Input,Embedding,GlobalAveragePooling1D,Dense,LSTM,Bidirectional,TimeDistributed

def create_simple_model(word_vec_size=64):
    """Build and compile the baseline classifier that has no recurrent layer.

    Token ids are embedded, the embeddings are averaged over the sequence
    axis, and a 4-unit softmax layer produces the class probabilities.

    Args:
        word_vec_size: size of each embedding vector.

    Returns:
        The model, compiled with categorical cross-entropy, Adam and accuracy.
    """
    token_ids = Input(shape=[sequence_len])
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_len)(token_ids)
    pooled = GlobalAveragePooling1D()(embedded)
    class_probs = Dense(4, activation='softmax')(pooled)

    baseline = keras.models.Model(token_ids, class_probs)
    baseline.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return baseline

In [None]:
from keras.utils import plot_model
from keras.callbacks import ReduceLROnPlateau

# Shared callback: multiplies the learning rate by 0.5 when val_loss has not
# improved for 3 epochs, with 1e-4 as the floor.
reduceLR = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

def fit_test(model,n):
    """Train `model` on the training arrays, then evaluate it on the test arrays.

    Args:
        model: compiled Keras model.
        n: number of training epochs.

    Returns:
        (hist, result): the object returned by `model.fit` and the value
        returned by `model.evaluate` on the test data.
    """
    # Silent training with a 20% validation split and the shared LR callback.
    history = model.fit(
        train_data,
        train_label,
        epochs=n,
        batch_size=64,
        validation_split=0.2,
        callbacks=[reduceLR],
        verbose=0,
    )
    test_scores = model.evaluate(test_data, test_label)
    return history, test_scores

In [None]:
simple1 = create_simple_model(64)
# Keep what fit_test returns so the training history and the test scores can
# still be inspected or compared after this cell has run.
hist_simple1, result_simple1 = fit_test(simple1,7)
print(f"simple1 test [loss, accuracy] >> {result_simple1}")
plot_model(simple1)


In [None]:
simple2 = create_simple_model(128)
# Keep what fit_test returns so the training history and the test scores can
# still be inspected or compared after this cell has run.
hist_simple2, result_simple2 = fit_test(simple2,7)
print(f"simple2 test [loss, accuracy] >> {result_simple2}")
plot_model(simple2)

<4-2> Model with LSTM(RNN) : bidirectional,many-to-one,stacked

In [None]:
from keras.layers import Dropout

def create_LSTM(word_vec_size=64,hidden_size=64,dense_activation='relu'):
    """Build and compile the stacked bidirectional LSTM classifier.

    Embedding -> two bidirectional LSTM layers -> average over all time steps
    -> dropout/dense head -> 4-unit softmax.

    Args:
        word_vec_size: size of each embedding vector.
        hidden_size: units per direction of the first LSTM; the second LSTM
            uses int(hidden_size/2).
        dense_activation: activation of the three hidden Dense layers. Pass
            None for a purely linear head.

    Returns:
        The model, compiled with categorical cross-entropy, Adam and accuracy.
    """
    X = Input(shape=[sequence_len])
    H = Embedding(vocab_size,word_vec_size,input_length=sequence_len)(X)
    H = Bidirectional(LSTM(hidden_size,return_sequences=True))(H)
    H = Bidirectional(LSTM(int(hidden_size/2),return_sequences=True))(H)
    # Two ways to collapse the time axis: (1) pool over all time steps, as done
    # here, or (2) set return_sequences=False on the last LSTM.
    H = GlobalAveragePooling1D()(H)

    # Dense layers without an activation are linear, and stacked linear layers
    # collapse into a single linear map, so each hidden layer gets a
    # non-linearity. Dropout precedes every Dense layer, as before.
    for rate, units in ((0.2, 1024), (0.2, 256), (0.2, 32)):
        H = Dropout(rate)(H)
        H = Dense(units,activation=dense_activation)(H)
    H = Dropout(0.1)(H)

    Y = Dense(4,activation='softmax')(H)

    model = keras.models.Model(X,Y)
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

In [None]:
# Train the stacked bidirectional LSTM (embedding size 256, hidden size 256)
# with a 20% validation split, then score it on the test data.
lstm1 = create_LSTM(256, 256)
hist = lstm1.fit(
    train_data,
    train_label,
    epochs=10,
    batch_size=256,
    validation_split=0.2,
    callbacks=[reduceLR],
    verbose=1,
)
ev = lstm1.evaluate(test_data, test_label)

<4-3> Naive Bayes Classifiers

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

# X_train,y_train = return_dataset(train_text)
# X_test,y_test = return_dataset(test_text)

# print(f"Train dataset shape before preprocessing >> {train_data.shape}")
# print(X_train[22])

# print("\n\nPreprocessing!!\n")
# vectorizer = CountVectorizer()
# transformer = TfidfTransformer()

# train_data_dtm = vectorizer.fit_transform(X_train)
# train_data_tfidf = transformer.fit_transform(train_data_dtm)
# train_data_tfidf = train_data_tfidf.toarray()

# def preprocess(data):
#     result = vectorizer.transform(data)
#     result = transformer.transform(result)
#     result = result.toarray()
#     return result
# print("Preprocessing done!\n\n")

# test_data_tfidf = preprocess(X_test)

# print(f"Train dataset shape after preprocessing >> {train_data_tfidf.shape}")
# print(train_data_tfidf[22])

In [None]:
# y_train = train_text['Class Index']
# y_test = test_text['Class Index']

In [None]:
# # Does not run: not enough memory
# from sklearn.model_selection import train_test_split

# train_data_tfidf,_,y_train,_ = train_test_split(train_data_tfidf,y_train,test_size=0.8,stratify=y_train)

In [None]:
# from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
# from sklearn.metrics import accuracy_score

# gaussian = GaussianNB()
# bernoulli = BernoulliNB()
# multinomial = MultinomialNB()

# def test(model):
#     print(model)
#     model.fit(train_data_tfidf,y_train)
    
#     train_pred = model.predict(train_data_tfidf)
#     train_acc = accuracy_score(y_train,train_pred)
#     print("train accuracy >>", train_acc)
    
#     test_pred = model.predict(test_data_tfidf)
#     test_acc = accuracy_score(y_test,test_pred)
#     print("test accuracy >>", test_acc)
#     print("\n")

In [None]:
# test(gaussian)

<4-4> LSTM (last hidden state concatenated with the average of all hidden states)
Section 4-2 reached about 87% accuracy; let's compare which variant is better.

In [None]:
from keras.layers import Concatenate,Dropout

def create_new_lstm(word_vec_size=64,hidden_size=64,dense_activation='relu'):
    """Build the LSTM classifier fed by [average of all states, last state].

    Embedding (index 0 masked) -> two bidirectional LSTM layers -> concatenate
    the time-average of the second layer's outputs with its final hidden
    states -> dropout/dense head -> 4-unit softmax.

    Args:
        word_vec_size: size of each embedding vector.
        hidden_size: units per direction of the first LSTM; the second LSTM
            uses int(hidden_size/2).
        dense_activation: activation of the three hidden Dense layers. Pass
            None for a purely linear head.

    Returns:
        The model, compiled with categorical cross-entropy, Adam and accuracy.
    """
    X = Input(shape=[sequence_len])
    H = Embedding(vocab_size,word_vec_size,input_length=sequence_len,mask_zero=True)(X)

    H = Bidirectional(LSTM(hidden_size,return_sequences=True))(H)
    # return_state=True additionally returns the final h and c state of the
    # forward and of the backward LSTM.
    all_hidden, forward_h, _, backward_h, _ = Bidirectional(
        LSTM(int(hidden_size/2),return_sequences=True,return_state=True)
    )(H)
    # Two ways to collapse the time axis: (1) pool over all time steps, as done
    # here, or (2) set return_sequences=False on the last LSTM.
    average = GlobalAveragePooling1D()(all_hidden)
    # Use the final states instead of all_hidden[:, -1, :]: at the last time
    # step the backward LSTM has only just started (on post-padding), so that
    # slice is not the backward direction's last hidden state. The width is
    # the same as the slice: 2 * int(hidden_size/2).
    last_hidden = Concatenate()([forward_h,backward_h])

    H = Concatenate()([average,last_hidden])

    # Dense layers without an activation are linear, and stacked linear layers
    # collapse into a single linear map, so each hidden layer gets a
    # non-linearity. Dropout precedes every Dense layer, as before.
    for rate, units in ((0.2, 1024), (0.2, 256), (0.2, 32)):
        H = Dropout(rate)(H)
        H = Dense(units,activation=dense_activation)(H)
    H = Dropout(0.1)(H)

    Y = Dense(4,activation='softmax')(H)

    model = keras.models.Model(X,Y)
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model


# Train the average + last-state LSTM variant (embedding size 256, hidden size
# 256) with a 20% validation split, then score it on the test data.
lstm2 = create_new_lstm(256, 256)
hist = lstm2.fit(
    train_data,
    train_label,
    epochs=10,
    batch_size=256,
    validation_split=0.2,
    callbacks=[reduceLR],
    verbose=1,
)
ev = lstm2.evaluate(test_data, test_label)

<4-5> LSTM (last hidden state concatenated with the maximum of all hidden states)
Section 4-2 reached about 87% accuracy; let's compare which variant is better.

In [None]:
from keras.layers import GlobalMaxPooling1D

def create_new_lstm(word_vec_size=64,hidden_size=64,dense_activation='relu'):
    """Build the LSTM classifier fed by [maximum of all states, last state].

    NOTE: this reuses the name create_new_lstm, so from this cell on it
    replaces the average-pooling builder defined in an earlier cell.

    Embedding (index 0 masked) -> two bidirectional LSTM layers -> concatenate
    the time-maximum of the second layer's outputs with its final hidden
    states -> dropout/dense head -> 4-unit softmax.

    Args:
        word_vec_size: size of each embedding vector.
        hidden_size: units per direction of the first LSTM; the second LSTM
            uses int(hidden_size/2).
        dense_activation: activation of the three hidden Dense layers. Pass
            None for a purely linear head.

    Returns:
        The model, compiled with categorical cross-entropy, Adam and accuracy.
    """
    X = Input(shape=[sequence_len])
    H = Embedding(vocab_size,word_vec_size,input_length=sequence_len,mask_zero=True)(X)

    H = Bidirectional(LSTM(hidden_size,return_sequences=True))(H)
    # return_state=True additionally returns the final h and c state of the
    # forward and of the backward LSTM.
    all_hidden, forward_h, _, backward_h, _ = Bidirectional(
        LSTM(int(hidden_size/2),return_sequences=True,return_state=True)
    )(H)
    # Two ways to collapse the time axis: (1) pool over all time steps, as done
    # here with a maximum, or (2) set return_sequences=False on the last LSTM.
    pooled_max = GlobalMaxPooling1D()(all_hidden)
    # Use the final states instead of all_hidden[:, -1, :]: at the last time
    # step the backward LSTM has only just started (on post-padding), so that
    # slice is not the backward direction's last hidden state. The width is
    # the same as the slice: 2 * int(hidden_size/2).
    last_hidden = Concatenate()([forward_h,backward_h])

    H = Concatenate()([pooled_max,last_hidden])

    # Dense layers without an activation are linear, and stacked linear layers
    # collapse into a single linear map, so each hidden layer gets a
    # non-linearity. Dropout precedes every Dense layer, as before.
    for rate, units in ((0.2, 1024), (0.2, 256), (0.2, 32)):
        H = Dropout(rate)(H)
        H = Dense(units,activation=dense_activation)(H)
    H = Dropout(0.1)(H)

    Y = Dense(4,activation='softmax')(H)

    model = keras.models.Model(X,Y)
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model


# Train the max + last-state LSTM variant (embedding size 256, hidden size
# 256) with a 20% validation split, then score it on the test data.
lstm3 = create_new_lstm(256, 256)
hist = lstm3.fit(
    train_data,
    train_label,
    epochs=10,
    batch_size=256,
    validation_split=0.2,
    callbacks=[reduceLR],
    verbose=1,
)
ev = lstm3.evaluate(test_data, test_label)