    LinkedIn : https://www.linkedin.com/in/nikenamelia/
    Github   : https://github.com/nikenaml


# Info Dataset - AG News Classification Dataset
### Description
The AG's news topic classification dataset is constructed by choosing the 4 largest classes from the original corpus.

It consists of class ids 1-4, where:
- 1-World
- 2-Sports 
- 3-Business 
- 4-Sci/Tech

https://www.kaggle.com/amananandrai/ag-news-classification-dataset

In [None]:
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import string

# Notebook display settings: show every column, never truncate cell text,
# and allow up to 2000 rows to be printed.
pd.set_option('display.max_columns', None)
# None means "no limit". The previous value -1 was deprecated in pandas 1.0
# and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 2000)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# join data

from glob import glob

filename = 'dataset_news.csv'
output_path = os.path.abspath(filename)

# Merge every CSV of the dataset into one file, keeping a single header row.
# Mode 'w' (not 'a') so that re-running the cell rebuilds the file instead of
# appending a second copy of every row.
with open(filename, 'w') as singleFile:
    first_csv = True
    # sorted() makes the merge order deterministic across runs.
    for csv_path in sorted(glob('../input/ag-news-classification-dataset/*.csv')):
        # Never read the file we are currently writing (compare absolute
        # paths: glob returns paths with their directory prefix).
        if os.path.abspath(csv_path) == output_path:
            continue
        with open(csv_path, 'r') as part:
            for line_number, line in enumerate(part):
                if line_number == 0:
                    # Header row: keep it only for the first file.
                    if not first_csv:
                        continue
                    first_csv = False
                # Guard against a file without a trailing newline, which
                # would glue its last row to the next file's first row.
                if not line.endswith('\n'):
                    line += '\n'
                singleFile.write(line)

## Load Data

In [None]:
# Read the merged CSV written by the join step and preview the first rows.
data = pd.read_csv("./dataset_news.csv")
data.head()

# Data Understanding

## EDA and get insights

In [None]:
# total data: (number of rows, number of columns)
data.shape

In [None]:
# data info: column names, non-null counts and dtypes
data.info()

In [None]:
# categories: number of rows for each value of 'Class Index'
data['Class Index'].value_counts()

# Data Preparation



In [None]:
# Build the model input text ('news' = title + space + description) and
# rename the label column 'Class Index' to 'category'.
data = data.assign(news=data['Title'] + ' ' + data['Description']).rename(
    columns={'Class Index': 'category'}
)
data.head()

In [None]:
# Delete the columns that were merged into 'news'.
# Dropping by name instead of by position ([1, 2]) keeps the cell correct if
# the column order changes, and makes an accidental re-run fail loudly instead
# of silently dropping other columns.
data = data.drop(columns=['Title', 'Description'])
data.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# define labels (same order as class ids 1-4)
labels = ['World News', 'Sports News', 'Business News', 'Sci/Tech News']

# Pass the column explicitly as `x`: recent seaborn releases no longer accept
# the plotted vector as a positional argument.
ax = sns.countplot(x=data['category'])
plt.xlabel('category news')
# Fix the tick positions before relabelling them so matplotlib does not warn
# about set_xticklabels being used without set_xticks.
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels);

# Data Cleaning

In [None]:
# removing punctuation down character
def removePunctuationDown(strs):
    """Replace separator-like punctuation in ``strs`` with a single space.

    The characters replaced are ``! # $ % & ( ) + , - . / : ; < = > ? @ [ ] _
    { | } ~``. Quotes, apostrophes, ``*``, ``^``, the backtick and the
    backslash are left untouched.

    Args:
        strs: Text to clean.

    Returns:
        The text with each listed character replaced by one space.
    """
    # Raw string: "\(" and "\]" are regex escapes inside the character class.
    # In a non-raw literal "\(" is an invalid Python escape (SyntaxWarning on
    # Python 3.12+). ",-." is a range that covers ',', '-' and '.'.
    remove = r'!#$%&\()+,-./:;<=>?@[\]_{|}~'
    pattern = "[{}]".format(remove)
    return re.sub(pattern, " ", strs)

# removing punctuation up character
def removePunctuationUp(strs):
    """Delete the characters ``, " ^ ` *`` from ``strs`` without inserting a space.

    The apostrophe is deliberately not in the set, so contractions such as
    "don't" keep their apostrophe.

    Args:
        strs: Text to clean.

    Returns:
        The text with the listed characters removed.
    """
    # Raw string: "\^" is a regex escape for a literal caret; in a non-raw
    # literal it is an invalid Python escape sequence.
    remove = r',"\^`*'
    pattern = "[{}]".format(remove)
    return re.sub(pattern, "", strs)

# replace other special character
def replace(strs):
    """Strip escape residue and non-ASCII characters from ``strs``.

    Literal backslash sequences (``\\t``, ``\\n``, ``\\u``) and any remaining
    backslash become a space, a real newline becomes a space, a real tab is
    deleted, and every non-ASCII character is dropped.
    """
    # Applied strictly in this order: the lone-backslash rule must come after
    # the two-character backslash sequences.
    substitutions = (
        ('\\t', ' '),
        ('\\n', ' '),
        ('\\u', ' '),
        ('\\', ' '),
        ('\n', ' '),
        ('\t', ''),
    )
    cleaned = strs
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    # Round-trip through bytes to discard everything outside ASCII.
    return cleaned.encode('utf-8').decode('ascii', 'ignore')

# remove emoji
# Compiled once at import time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    # NOTE: very broad range; besides emoji it also covers CJK, Hangul and
    # most other non-Latin scripts between U+24C2 and U+1F251.
    "\U000024C2-\U0001F251"
    "\U0001F926-\U0001F937"
    "\U00010000-\U0010FFFF"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with every run of emoji/symbol characters removed.

    Matching characters are deleted without inserting a space, so words on
    either side of an emoji end up adjacent to whatever whitespace was there.

    Args:
        text: Input string.

    Returns:
        The string without the characters matched by ``_EMOJI_PATTERN``.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [None]:
# Apply Cleaning Data
# Each step reads the column produced by the previous step, so every cleaning
# rule ends up in the final 'text_clean' column.
data['news'] = [i.lower() for i in data.news] #lower case
data['remove_emoji'] = data['news'].apply(remove_emoji) #remove emoji
data['hastags'] = [re.findall(r'\B#\w*[a-zA-Z0-9]+\w*', i) for i in data.remove_emoji] #save hashtags
data['remove_email'] = [re.sub(r'\S*@\S*\s?','',i) for i in data.remove_emoji] #remove email
# Continue from 'remove_email' (previously this restarted from 'remove_emoji',
# which silently discarded the e-mail removal above).
data['remove_hashtag'] = [re.sub("#[A-Za-z0-9_]+",'',i) for i in data.remove_email] #remove hashtags
data['remove_special_character'] = data['remove_hashtag'].replace(r'http\s+|www.\s+','',regex=True).replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True) #remove link
data['remove_special_character'] = [re.sub(r'&lt;/?[a-z]+&gt;','',i) for i in data.remove_special_character] #remove HTML-escaped tags
data['remove_special_character'] = [re.sub('<.*?>+', '', i) for i in data.remove_special_character] #remove raw tags
data['text_clean'] = [removePunctuationDown(i) for i in data.remove_special_character] #remove punc down
data['text_clean'] = [removePunctuationUp(i) for i in data.text_clean] #remove punc up
data['text_clean'] = [replace(j) for j in data.text_clean] #remove \n \t

In [None]:
# preview the last rows, including the intermediate cleaning columns
data.tail()

## Custom Cleansing Data

Custom cleaning step that expands contractions and removes specific words and characters.

In [None]:
# Ordered (pattern, replacement) pairs applied by text_clean(). Order matters:
# the specific contractions must be expanded before the general suffix rules.
# Each rule exists for both the straight (') and the curly (’) apostrophe.
_DECONTRACTION_RULES = (
    # specific
    (r"won't", "will not"),
    (r"can't", "can not"),
    (r"won’t", "will not"),
    (r"can’t", "can not"),
    (r"'t've", " not have"),
    (r"'d've", " would have"),
    (r"'clock", "f the clock"),  # "o'clock" -> "of the clock"
    (r"'cause", " because"),
    # general
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'s", " is"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
    (r"n’t", " not"),
    (r"’re", " are"),
    (r"’s", " is"),
    (r"’ll", " will"),
    (r"’t", " not"),
    (r"’ve", " have"),
    (r"’m", " am"),
    (r"’", "'"),
)


def text_clean(text):
    """Expand contractions and reduce ``text`` to space-separated words.

    Steps, in order: expand English contractions, drop ``[...]`` spans, delete
    ASCII punctuation, delete every token containing a digit, turn remaining
    non-word characters into spaces, drop a leading standalone ``b``, and
    collapse runs of whitespace into one space. Leading/trailing whitespace is
    collapsed but not stripped.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    ## Decontract text
    for pattern, replacement in _DECONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # remove all punctuation (raw strings avoid invalid-escape warnings)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)  # remove any token containing a digit

    # remove all special characters; this also covers curly quotes and
    # ellipsis characters, so no separate pass is needed for them
    text = re.sub(r'\W', ' ', text)

    # remove prefixed 'b'
    text = re.sub(r'^b\s+', '', text)

    # substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)

    return text

# Run the custom cleaner over every row of the already pre-cleaned text.
data['text_clean'] = data['text_clean'].apply(text_clean)

In [None]:
# preview the first rows after the custom cleaning step
data.head()

# Data Preprocessing

In [None]:
# select columns: keep the original text, the cleaned text and the label;
# the intermediate cleaning columns are discarded here
data = data[['news', 'text_clean', 'category']]
data.head()

## Lemmatization

In [None]:
# import and download packages
import nltk
# Fetch the NLTK resources used below: tokenizer, POS tagger, WordNet, stopwords.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(resource)

In [None]:
# import library
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

# Lemmatizer used by lemmatize_sentence() below.
lemmatizer = nltk.stem.WordNetLemmatizer()
# NOTE(review): second instance of the same class; not referenced in the
# visible part of this notebook - confirm before removing.
wordnet_lemmatizer = WordNetLemmatizer()

# define tag and lemmatization function
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map a POS tag from ``nltk.pos_tag`` to the matching WordNet POS constant.

    Only the first letter of the tag is inspected (J, V, N, R). Returns
    ``None`` for any other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

def lemmatize_sentence(sentence):
    """Return ``sentence`` with each token replaced by its WordNet lemma.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag has
    no WordNet equivalent are kept unchanged. Tokens are re-joined with a
    single space.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        # Without a usable POS the token is passed through as is.
        lemmas.append(
            token if wordnet_pos is None else lemmatizer.lemmatize(token, wordnet_pos)
        )
    return " ".join(lemmas)



# Lemmatize every cleaned text, then keep only the columns needed downstream.
data['text_lemma'] = data['text_clean'].apply(lemmatize_sentence)

kept_columns = ['text_clean', 'text_lemma', 'category']
data = data[kept_columns]
data.head()

## Stopword Removal

In [None]:
stop = stopwords.words('english')
# Set copy for O(1) membership tests; `stop` itself stays a list.
stop_lookup = set(stop)

# Drop every whitespace-separated token that is an English stopword.
data['text_preprocess'] = data['text_lemma'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)
data.head()

# Data Process

In [None]:
# Convert the model inputs to NumPy arrays. Labels are shifted from 1-4 to
# 0-3, as expected by sparse categorical cross-entropy.
data_process = data['text_preprocess'].values
label = (data['category'] - 1).values

In [None]:
# view data array: first 10 preprocessed texts
data_process[0:10]

In [None]:
# view label array (class ids shifted by -1)
label

## Data Splitting

In [None]:
# split data into training and validation

from sklearn.model_selection import train_test_split

# 80/20 stratified split with a fixed seed so the split is reproducible.
text_train, text_test, label_train, label_test = train_test_split(
    data_process,
    label,
    test_size=0.2,
    shuffle=True,
    stratify=label,
    random_state=42,
)

tuple(part.shape for part in (text_train, text_test, label_train, label_test))

# Model

In [None]:
# tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
# Build the vocabulary from the training texts ONLY. Fitting on the held-out
# texts as well would leak their vocabulary into the model.
# Held-out words that are unknown are mapped to the OOV token 'x'.
tokenizer = Tokenizer(num_words=10000, oov_token='x')
tokenizer.fit_on_texts(text_train)

word_index = tokenizer.word_index
# NOTE(review): word_index holds every word seen, not just the num_words most
# frequent ones, so this is larger than the highest index actually emitted.
total_words = len(word_index)+1

# Texts -> integer sequences -> fixed-length (100) arrays.
sekuens_train = tokenizer.texts_to_sequences(text_train)
sekuens_test = tokenizer.texts_to_sequences(text_test)

padded_train = pad_sequences(sekuens_train, maxlen=100)
padded_test = pad_sequences(sekuens_test, maxlen=100)

In [None]:
# Shapes of the padded arrays, then the vocabulary size passed to the embedding layer.
print(f'train datashape : {padded_train.shape}')
print(f'test datashape : {padded_test.shape}')
total_words

In [None]:
# import packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers,Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

In [None]:
# model
import tensorflow as tf

# Embedding -> LSTM -> two dense blocks with dropout -> 4-way softmax.
keras_layers = tf.keras.layers
layer_stack = [
    keras_layers.Embedding(input_dim=total_words, output_dim=128, input_length=100),
    keras_layers.LSTM(128),
    keras_layers.Dense(128, activation='relu'),
    keras_layers.Dropout(0.25),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dropout(0.25),
    keras_layers.Dense(4, activation='softmax'),
]
model = tf.keras.Sequential(layer_stack)

# Sparse loss: labels are integer class ids, not one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# callback

class myCallback(tf.keras.callbacks.Callback):
    """Stop training once train and validation accuracy both exceed 91%."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch metrics and request a stop when both pass 0.91.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dict supplied by Keras; may be None.
        """
        # Avoid a shared mutable default and tolerate a missing dict.
        logs = logs or {}
        accuracy = logs.get('accuracy')
        val_accuracy = logs.get('val_accuracy')
        # A metric can be absent (e.g. no validation this epoch); comparing
        # None with a float would raise TypeError.
        if accuracy is None or val_accuracy is None:
            return
        if accuracy > 0.91 and val_accuracy > 0.91:
            self.model.stop_training = True
            print("\nThe accuracy of the training set and the validation set has reached > 91%!")


callbacks = myCallback()

In [None]:
import time
# Collects the wall-clock training time (in seconds) of each run of this cell.
time_array = []

start_time = time.time()

# model fit
# NOTE(review): steps_per_epoch=25 with batch_size=256 means 25 batches
# (6,400 samples) per epoch, and validation_steps=5 limits validation to
# 5 batches - presumably chosen to shorten training; confirm this is intended.
# NOTE(review): the held-out split is used as validation data and drives the
# early-stop callback, so there is no untouched test set.
history = model.fit(padded_train, label_train, 
                    batch_size=256, 
                    epochs=30, 
                    validation_data=(padded_test, label_test),
                    verbose=2, callbacks=[callbacks],
                    validation_steps=5, steps_per_epoch=25)

# Elapsed time is printed in minutes ("menit") and stored in seconds.
print("--- %.2f menit ---" % ((time.time() - start_time)/60))
time_array.append(time.time() - start_time)

# Plot Visualization Accuracy and Loss Each Epoch


In [None]:
# define function plot visualization training and validation process
def plot_history(history):
    """Print the final metrics and plot accuracy/loss curves per epoch.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    # 1-based epoch numbers for the x axis (previously computed but unused,
    # so the plots were drawn against a 0-based index).
    epochs = range(1, len(acc) + 1)

    print("Training Accuracy: %.2f" % acc[-1])
    print("Training Loss: %.2f" % loss[-1])
    print("Validation Accuracy: %.2f" % val_acc[-1])
    print("Validation Loss: %.2f" % val_loss[-1] + '\n\n')

    # one figure, two side-by-side panels: accuracy (left) and loss (right)
    plt.figure(figsize=(15, 5))

    # accuracy during training and validation
    plt.subplot(1, 2, 1)
    plt.plot(epochs, acc, 'b', label='Train acc')
    plt.plot(epochs, val_acc, 'r', label='Validation acc')
    plt.title('Train and validation accuracy Visualization')
    plt.xlabel("Jumlah Epochs")
    plt.legend()

    # loss during training and validation
    plt.subplot(1, 2, 2)
    plt.plot(epochs, loss, 'b', label='Train loss')
    plt.plot(epochs, val_loss, 'r', label='Validation loss')
    plt.title('Train and validation loss Visualization')
    plt.xlabel("Jumlah Epochs")
    plt.legend()

In [None]:
# print the final metrics and draw the accuracy/loss curves for the run above
plot_history(history)