In [1]:
#import pandas dan tensor flow
import tensorflow as tf
import pandas as pd
import numpy as np
import zipfile
import matplotlib.pyplot as plt
import datetime
import random
import os

import nltk, os, re, string

from keras.layers import Input, LSTM, Bidirectional, SpatialDropout1D, Dropout, Flatten, Dense, Embedding, BatchNormalization
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### Pre Processing Data

In [96]:
# Import Data set yang akan digunakan
df = pd.read_csv('bbc-news-data.csv',sep='\t')

df.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [97]:
# Cek jumlah data yang dimiliki masing masing sentiment
df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [98]:
df.isnull().sum()

category    0
filename    0
title       0
content     0
dtype: int64

In [99]:
df.shape

(2225, 4)

In [100]:
#delete unused column

df = df.drop(columns = 'filename')

##### Case Folding & Cleaning Data

In [101]:
# function to clean the tweets
import re
import string
def cleanTxt(text):
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub('#', '', text)
    text = re.sub('RT[\s]+', '', text)
    text = re.sub('\w+:\/\/\S+', '', text)
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)
    #remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.strip()
    text = re.sub('<.*?>', ' ', text)
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub("\n", " ", text)
    text = re.sub(r"\b\w\b", " ", text)
    text = ' '.join(text.split())
    return text


In [102]:
# removing functuation
def cleaner(data):
    return(data.translate(str.maketrans('','', string.punctuation)))
df['title'] = df['title'].apply(lambda x: cleaner(x))
df['content'] = df['content'].apply(lambda x: cleaner(x))

## lematization
lemmatizer = WordNetLemmatizer()

def lem(data):
    pos_dict = {'N': wn.NOUN, 'V': wn.VERB, 'J': wn.ADJ, 'R': wn.ADV}
    return(' '.join([lemmatizer.lemmatize(w,pos_dict.get(t, wn.NOUN)) for w,t in nltk.pos_tag(data.split())]))
df['title'] = df['title'].apply(lambda x: lem(x))
df['content'] = df['content'].apply(lambda x: lem(x))

# removing number
def rem_numbers(data):
    return re.sub('[0-9]+','',data)
df['title'].apply(rem_numbers)
df['content'].apply(rem_numbers)

# removing stopword
st_words = set(stopwords.words('english'))
def stopword(data):
    return(' '.join([w for w in data.split() if w not in st_words ]))
df['title'] = df['title'].apply(lambda x: stopword(x))
df['content'] = df['content'].apply(lambda x: stopword(x))


In [103]:
# MERGE JUDUL & ISI BERITA
df['merged_text'] = df['title'] + ' ' + df['content']

# DO PREPROCESSING
df['merged_text_pre'] = df['merged_text'].apply(cleanTxt)

In [104]:
# view data after cleansing
df.head(10)

Unnamed: 0,category,title,content,merged_text,merged_text_pre
0,business,Ad sale boost Time Warner profit,Quarterly profit US medium giant TimeWarner ju...,Ad sale boost Time Warner profit Quarterly pro...,ad sale boost time warner profit quarterly pro...
1,business,Dollar gain Greenspan speech,The dollar ha hit highest level euro almost th...,Dollar gain Greenspan speech The dollar ha hit...,dollar gain greenspan speech the dollar ha hit...
2,business,Yukos unit buyer face loan claim,The owner embattled Russian oil giant Yukos as...,Yukos unit buyer face loan claim The owner emb...,yukos unit buyer face loan claim the owner emb...
3,business,High fuel price hit BAs profit,British Airways ha blamed high fuel price 40 d...,High fuel price hit BAs profit British Airways...,high fuel price hit bas profit british airways...
4,business,Pernod takeover talk lift Domecq,Shares UK drink food firm Allied Domecq risen ...,Pernod takeover talk lift Domecq Shares UK dri...,pernod takeover talk lift domecq shares uk dri...
5,business,Japan narrowly escape recession,Japans economy teetered brink technical recess...,Japan narrowly escape recession Japans economy...,japan narrowly escape recession japans economy...
6,business,Jobs growth still slow US,The US created fewer job expected January fall...,Jobs growth still slow US The US created fewer...,jobs growth still slow us the us created fewer...
7,business,India call fair trade rule,India attends G7 meeting seven leading industr...,India call fair trade rule India attends G7 me...,india call fair trade rule india attends meeti...
8,business,Ethiopias crop production 24,Ethiopia produced 1427 million tonne crop 2004...,Ethiopias crop production 24 Ethiopia produced...,ethiopias crop production ethiopia produced mi...
9,business,Court reject 280bn tobacco case,A US government claim accusing country biggest...,Court reject 280bn tobacco case A US governmen...,court reject bn tobacco case us government cla...


##Balance the data

In [109]:
# # data category one-hot-encoding
# category = pd.get_dummies(dataframe.category)
# dataframe_cat = pd.concat([dataframe, category], axis=1)
# dataframe_cat = dataframe_cat.drop(columns='category')
# dataframe_cat.tail(10)


# data category one-hot-encoding
category = pd.get_dummies(df.category)
df_cat = pd.concat([df, category], axis=1)
df_cat = df_cat.drop(columns='category')
df_cat.tail(10)

Unnamed: 0,title,content,merged_text,merged_text_pre,business,entertainment,politics,sport,tech
2215,Broadband fuel online expression,Fast web access encouraging people express onl...,Broadband fuel online expression Fast web acce...,broadband fuel online expression fast web acce...,0,0,0,0,1
2216,Savvy searcher fail spot ad,Internet search engine user odd mix naive soph...,Savvy searcher fail spot ad Internet search en...,savvy searcher fail spot ad internet search en...,0,0,0,0,1
2217,TVs future phone line,Internet TV ha talked since start web know But...,TVs future phone line Internet TV ha talked si...,tvs future phone line internet tv ha talked si...,0,0,0,0,1
2218,Cebit fever take Hanover,Thousands product ten thousand visitor make Ce...,Cebit fever take Hanover Thousands product ten...,cebit fever take hanover thousands product ten...,0,0,0,0,1
2219,New console promise big problem,Making game future console require graphic art...,New console promise big problem Making game fu...,new console promise big problem making game fu...,0,0,0,0,1
2220,BT program beat dialler scam,BT introducing two initiative help beat rogue ...,BT program beat dialler scam BT introducing tw...,bt program beat dialler scam bt introducing tw...,0,0,0,0,1
2221,Spam email tempt net shopper,Computer user across world continue ignore sec...,Spam email tempt net shopper Computer user acr...,spam email tempt net shopper computer user acr...,0,0,0,0,1
2222,Be careful code,A new European directive could put software wr...,Be careful code A new European directive could...,be careful code new european directive could p...,0,0,0,0,1
2223,US cyber security chief resigns,The man making sure US computer network safe s...,US cyber security chief resigns The man making...,us cyber security chief resigns the man making...,0,0,0,0,1
2224,Losing online gaming,Online role playing game timeconsuming enthral...,Losing online gaming Online role playing game ...,losing online gaming online role playing game ...,0,0,0,0,1


In [110]:

# change dataframe value to numpy array
news = df_cat['merged_text_pre'].values
label = df_cat[['business', 'entertainment', 'politics', 'sport', 'tech']].values

In [188]:
from sklearn.model_selection import train_test_split
data_latih, data_validasi, label_latih, label_validasi = train_test_split(news, label, test_size = 0.2)

data_latih.shape, label_latih.shape, data_validasi.shape, label_validasi.shape

((1780,), (1780, 5), (445,), (445, 5))

In [189]:
from collections import Counter

Counter(np.argmax(label_latih, axis=1))

Counter({2: 327, 3: 406, 1: 309, 0: 411, 4: 327})

In [190]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=123)

In [191]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


tokenizer = Tokenizer(num_words=10000, oov_token='<oov>')
tokenizer.fit_on_texts(data_latih)
tokenizer.fit_on_texts(data_validasi)


sekuens_latih = tokenizer.texts_to_sequences(data_latih)
sekuens_validasi = tokenizer.texts_to_sequences(data_validasi)

padded_latih = pad_sequences(sekuens_latih, padding = 'post', maxlen = 15, truncating ='post')
padded_validasi = pad_sequences(sekuens_validasi, padding = 'post', maxlen = 15, truncating ='post')

In [192]:
# buat data menjadi balance
padded_latih, label_latih = sm.fit_resample(padded_latih, label_latih)

# cek label data latih
Counter(np.argmax(label_latih, axis=1))

Counter({2: 411, 3: 411, 1: 411, 0: 411, 4: 411})

In [193]:

# model
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=512),
    tf.keras.layers.LSTM(512),
    tf.keras.layers.Dense(2048, activation='relu'),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(5, activation='softmax')
])
model.compile(optimizer= Adam(learning_rate = 0.001) , metrics=['accuracy'], loss='categorical_crossentropy',)
model.summary()

Model: "sequential_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_23 (Embedding)    (None, None, 512)         5120000   
                                                                 
 lstm_23 (LSTM)              (None, 512)               2099200   
                                                                 
 dense_66 (Dense)            (None, 2048)              1050624   
                                                                 
 dense_67 (Dense)            (None, 1024)              2098176   
                                                                 
 dense_68 (Dense)            (None, 512)               524800    
                                                                 
 dense_69 (Dense)            (None, 256)               131328    
                                                                 
 dropout_28 (Dropout)        (None, 256)             

In [186]:

# callback
class myCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('accuracy')>0.9 and logs.get('val_accuracy')>0.90):
      self.model.stop_training = True
      print("\nThe accuracy of the training set and the validation set has reached > 90%!")
callbacks = myCallback()


In [187]:
# model fit
history = model.fit(padded_latih, label_latih, epochs=50,
                    validation_data=(padded_validasi, label_validasi), verbose=1,  callbacks=[callbacks])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
The accuracy of the training set and the validation set has reached > 90%!
