
# Title

### Auto Categorizing News Article based on the Content

# Introduction

### Name : Sonny Riadi


`Project Description :`

In this project we gonna make a text classification using Natural Language Processing to determine the category of the article. 


# External Link

1. Github [link](https://github.com/sonnyrd/Project/tree/main/7_Hacktiv8_DS_Talent_Fair/DSTF-2-DS-MMC-PORTAL)
2. Deployment [link](https://sonnyrd-ds-talent-fair.herokuapp.com/)


# Working Area
This is where you handle the task


###  I. Import Library

In [14]:
import pandas as pd
import numpy as np
import requests
import json
import ast
import re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup



# Stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# label encoder
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Tokenizer
from keras.preprocessing.text import Tokenizer

# Padding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# splitting Dataset
from sklearn.model_selection import train_test_split

#tf
import tensorflow as tf

# modelling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

# save model
import pickle


# metrics
from sklearn.metrics import classification_report

# disable warning
import warnings
warnings.filterwarnings("ignore")

### II. Loading Dataset

In [15]:
df = pd.read_csv('Dataset/dataset_final.csv')
df.head(10)

Unnamed: 0,category_id,category_name,content_date,content_id,content_name,content_title,summary,tagging,content
0,13,Humor,2022-11-29T03:09:20.728+00:00,be0eddd47b0c4994842293bdaa7eddcc,article,"Kembali Datangkan Murid Internasional, Kali In...","JAKARTA, celebrities.id - Dikenal sebagai pres...","[{'backgroundimage': '', 'created': '2022-02-1...","<p><strong>JAKARTA, celebrities.id - </strong>..."
1,13,Humor,2022-11-29T03:04:23.689+00:00,e2e2dd9d819044cbb43e2f7e4f4bce5a,article,"Kembali Datangkan Murid Internasional, Kali In...",JAKARTA - Dikenal sebagai presenter yang hits ...,"[{'backgroundimage': '', 'created': '2021-11-0...",<p><strong>JAKARTA - </strong>Dikenal sebagai ...
2,13,Humor,2022-11-28T17:42:06.887+00:00,ea653310f8fe446c9219a80176ecbd97,article,Putri Donald Trump Akrab dengan Perdana Menter...,Putri Donald Trump terlihat akrab dengan Per...,,<p><strong>GenPI.co - </strong> Ivanka Trump y...
3,13,Humor,2022-11-28T04:58:16.643+00:00,2a0ef77058224bcdb1735e7ea058cb09,article,Elon Musk: Kesalahan Besar Blokir Akun Twitter...,"Pemilik Twitter, Elon Musk menyebut, aksi mela...","[{'backgroundimage': '', 'created': '2022-11-2...","<p>Pemilik Twitter, Elon Musk menyebut, aksi m..."
4,13,Humor,2022-11-28T02:32:20.03+00:00,099f4c67f5454afe947ab2a8be9045aa,article,Elon Musk Sebut Aksi Larang Akun Twitter Trump...,"JAKARTA, NETRALNEWS.COM - Pemilik Twitter yang...","[{'backgroundimage': '', 'created': '2022-11-2...","<p><b style=""font-size: 0.875rem;"">JAKARTA, NE..."
5,13,Humor,2022-11-28T02:12:25.551+00:00,6245295d524a497398e8f492013640db,article,Aksi Larang Akun Twitter Trump saat Menjabat P...,"JAKARTA, NETRALNEWS.COM - Pemilik Twitter yang...","[{'backgroundimage': '', 'created': '2022-11-2...","<p><b style=""font-size: 0.875rem;"">JAKARTA, NE..."
6,13,Humor,2022-11-26T01:26:00.292+00:00,685bd06f53414cc1b7085e290aed6f86,article,Donald Trump Belum Juga Nge-tweet sejak Akunny...,"WASHINGTON, iNews.id - Miliarder Elon Musk kem...","[{'backgroundimage': '', 'created': '2022-11-2...","<p><strong>WASHINGTON, iNews.id</strong> - Mil..."
7,13,Humor,2022-11-25T12:05:25.823+00:00,b3d7ec50b4c34f68a860180da8b89241,article,"Ditemani Deng Ical, IAS Sowan ke Sejumlah Elit...","RAKYATKU.COM, KEPULAUAN SELAYAR - Kandidat gub...","[{'backgroundimage': '', 'created': '2022-04-0...","<p><strong>RAKYATKU.COM, KEPULAUAN SELAYAR</st..."
8,13,Humor,2022-11-25T04:00:59.58+00:00,cbc92db4f29c4d01ae3a6fb706a1f9cb,article,"Donald Trump Disebut Meninggal, Hoaks Lagi, Ho...",Hoaks berisi Mantan Presiden Amerika Serikat...,,<p><strong>GenPI.co - </strong> Hoaks berisi M...
9,13,Humor,2022-11-24T23:40:15.425+00:00,3235196a9efa4280b29ad119dbbe0c3e,article,Selamat! Bill Gates Bakal Jadi Kakek,"BUKAMATA - Putri sulung Bill Gates, Jennifer G...","[{'backgroundimage': '', 'created': '2022-05-2...","<p class=""p1""><span class=""s1""><strong>BUKAMAT..."


In [16]:
df.shape

(17976, 9)

### III. Data Cleaning & Preparation

##### 1. Remove Duplicates Content ID

In [17]:
df.drop_duplicates(subset='content_id',inplace=True)
df.head(10)

Unnamed: 0,category_id,category_name,content_date,content_id,content_name,content_title,summary,tagging,content
0,13,Humor,2022-11-29T03:09:20.728+00:00,be0eddd47b0c4994842293bdaa7eddcc,article,"Kembali Datangkan Murid Internasional, Kali In...","JAKARTA, celebrities.id - Dikenal sebagai pres...","[{'backgroundimage': '', 'created': '2022-02-1...","<p><strong>JAKARTA, celebrities.id - </strong>..."
1,13,Humor,2022-11-29T03:04:23.689+00:00,e2e2dd9d819044cbb43e2f7e4f4bce5a,article,"Kembali Datangkan Murid Internasional, Kali In...",JAKARTA - Dikenal sebagai presenter yang hits ...,"[{'backgroundimage': '', 'created': '2021-11-0...",<p><strong>JAKARTA - </strong>Dikenal sebagai ...
2,13,Humor,2022-11-28T17:42:06.887+00:00,ea653310f8fe446c9219a80176ecbd97,article,Putri Donald Trump Akrab dengan Perdana Menter...,Putri Donald Trump terlihat akrab dengan Per...,,<p><strong>GenPI.co - </strong> Ivanka Trump y...
3,13,Humor,2022-11-28T04:58:16.643+00:00,2a0ef77058224bcdb1735e7ea058cb09,article,Elon Musk: Kesalahan Besar Blokir Akun Twitter...,"Pemilik Twitter, Elon Musk menyebut, aksi mela...","[{'backgroundimage': '', 'created': '2022-11-2...","<p>Pemilik Twitter, Elon Musk menyebut, aksi m..."
4,13,Humor,2022-11-28T02:32:20.03+00:00,099f4c67f5454afe947ab2a8be9045aa,article,Elon Musk Sebut Aksi Larang Akun Twitter Trump...,"JAKARTA, NETRALNEWS.COM - Pemilik Twitter yang...","[{'backgroundimage': '', 'created': '2022-11-2...","<p><b style=""font-size: 0.875rem;"">JAKARTA, NE..."
5,13,Humor,2022-11-28T02:12:25.551+00:00,6245295d524a497398e8f492013640db,article,Aksi Larang Akun Twitter Trump saat Menjabat P...,"JAKARTA, NETRALNEWS.COM - Pemilik Twitter yang...","[{'backgroundimage': '', 'created': '2022-11-2...","<p><b style=""font-size: 0.875rem;"">JAKARTA, NE..."
6,13,Humor,2022-11-26T01:26:00.292+00:00,685bd06f53414cc1b7085e290aed6f86,article,Donald Trump Belum Juga Nge-tweet sejak Akunny...,"WASHINGTON, iNews.id - Miliarder Elon Musk kem...","[{'backgroundimage': '', 'created': '2022-11-2...","<p><strong>WASHINGTON, iNews.id</strong> - Mil..."
7,13,Humor,2022-11-25T12:05:25.823+00:00,b3d7ec50b4c34f68a860180da8b89241,article,"Ditemani Deng Ical, IAS Sowan ke Sejumlah Elit...","RAKYATKU.COM, KEPULAUAN SELAYAR - Kandidat gub...","[{'backgroundimage': '', 'created': '2022-04-0...","<p><strong>RAKYATKU.COM, KEPULAUAN SELAYAR</st..."
8,13,Humor,2022-11-25T04:00:59.58+00:00,cbc92db4f29c4d01ae3a6fb706a1f9cb,article,"Donald Trump Disebut Meninggal, Hoaks Lagi, Ho...",Hoaks berisi Mantan Presiden Amerika Serikat...,,<p><strong>GenPI.co - </strong> Hoaks berisi M...
9,13,Humor,2022-11-24T23:40:15.425+00:00,3235196a9efa4280b29ad119dbbe0c3e,article,Selamat! Bill Gates Bakal Jadi Kakek,"BUKAMATA - Putri sulung Bill Gates, Jennifer G...","[{'backgroundimage': '', 'created': '2022-05-2...","<p class=""p1""><span class=""s1""><strong>BUKAMAT..."


In [18]:
df.shape

(17974, 9)

##### 2. Drop Missing Values

In [19]:
df.dropna(subset=['content'],inplace=True)

In [20]:
df.reset_index(drop=True,inplace=True)
df.head(10)

Unnamed: 0,category_id,category_name,content_date,content_id,content_name,content_title,summary,tagging,content
0,13,Humor,2022-11-29T03:09:20.728+00:00,be0eddd47b0c4994842293bdaa7eddcc,article,"Kembali Datangkan Murid Internasional, Kali In...","JAKARTA, celebrities.id - Dikenal sebagai pres...","[{'backgroundimage': '', 'created': '2022-02-1...","<p><strong>JAKARTA, celebrities.id - </strong>..."
1,13,Humor,2022-11-29T03:04:23.689+00:00,e2e2dd9d819044cbb43e2f7e4f4bce5a,article,"Kembali Datangkan Murid Internasional, Kali In...",JAKARTA - Dikenal sebagai presenter yang hits ...,"[{'backgroundimage': '', 'created': '2021-11-0...",<p><strong>JAKARTA - </strong>Dikenal sebagai ...
2,13,Humor,2022-11-28T17:42:06.887+00:00,ea653310f8fe446c9219a80176ecbd97,article,Putri Donald Trump Akrab dengan Perdana Menter...,Putri Donald Trump terlihat akrab dengan Per...,,<p><strong>GenPI.co - </strong> Ivanka Trump y...
3,13,Humor,2022-11-28T04:58:16.643+00:00,2a0ef77058224bcdb1735e7ea058cb09,article,Elon Musk: Kesalahan Besar Blokir Akun Twitter...,"Pemilik Twitter, Elon Musk menyebut, aksi mela...","[{'backgroundimage': '', 'created': '2022-11-2...","<p>Pemilik Twitter, Elon Musk menyebut, aksi m..."
4,13,Humor,2022-11-28T02:32:20.03+00:00,099f4c67f5454afe947ab2a8be9045aa,article,Elon Musk Sebut Aksi Larang Akun Twitter Trump...,"JAKARTA, NETRALNEWS.COM - Pemilik Twitter yang...","[{'backgroundimage': '', 'created': '2022-11-2...","<p><b style=""font-size: 0.875rem;"">JAKARTA, NE..."
5,13,Humor,2022-11-28T02:12:25.551+00:00,6245295d524a497398e8f492013640db,article,Aksi Larang Akun Twitter Trump saat Menjabat P...,"JAKARTA, NETRALNEWS.COM - Pemilik Twitter yang...","[{'backgroundimage': '', 'created': '2022-11-2...","<p><b style=""font-size: 0.875rem;"">JAKARTA, NE..."
6,13,Humor,2022-11-26T01:26:00.292+00:00,685bd06f53414cc1b7085e290aed6f86,article,Donald Trump Belum Juga Nge-tweet sejak Akunny...,"WASHINGTON, iNews.id - Miliarder Elon Musk kem...","[{'backgroundimage': '', 'created': '2022-11-2...","<p><strong>WASHINGTON, iNews.id</strong> - Mil..."
7,13,Humor,2022-11-25T12:05:25.823+00:00,b3d7ec50b4c34f68a860180da8b89241,article,"Ditemani Deng Ical, IAS Sowan ke Sejumlah Elit...","RAKYATKU.COM, KEPULAUAN SELAYAR - Kandidat gub...","[{'backgroundimage': '', 'created': '2022-04-0...","<p><strong>RAKYATKU.COM, KEPULAUAN SELAYAR</st..."
8,13,Humor,2022-11-25T04:00:59.58+00:00,cbc92db4f29c4d01ae3a6fb706a1f9cb,article,"Donald Trump Disebut Meninggal, Hoaks Lagi, Ho...",Hoaks berisi Mantan Presiden Amerika Serikat...,,<p><strong>GenPI.co - </strong> Hoaks berisi M...
9,13,Humor,2022-11-24T23:40:15.425+00:00,3235196a9efa4280b29ad119dbbe0c3e,article,Selamat! Bill Gates Bakal Jadi Kakek,"BUKAMATA - Putri sulung Bill Gates, Jennifer G...","[{'backgroundimage': '', 'created': '2022-05-2...","<p class=""p1""><span class=""s1""><strong>BUKAMAT..."


##### Select Category

In [21]:
# df = df[
#         (df['category_name'].isin(['Politik & Peristiwa','Keuangan dan Bisnis','Hiburan','Olahraga','Ekonomi']))
        
#     ]
# df

df = df[
        (df['category_name'] != 'Humor')
        
    ]
df




Unnamed: 0,category_id,category_name,content_date,content_id,content_name,content_title,summary,tagging,content
109,21,Ekonomi,2022-11-02T01:21:34.696+00:00,ee599421a28442beb7a62d49bc5d288d,article,"Kemnaker Turun Tangan Panggil Bos Waroeng SS, ...","JAKARTA, iNewsSragen.id - Gaji Karyawan Peneri...","[{'backgroundimage': '', 'created': '2022-01-0...","<p><strong>JAKARTA, iNewsSragen.id</strong> - ..."
140,34,Politik & Peristiwa,2022-10-17T15:54:12.528+00:00,823a4dc6037241c9b479ec66382c4e5b,article,Miris Oknum ASN Takalar Aniaya hingga Rudapaks...,"PORTALMEDIA.ID, MAKASSAR- Sungguh malang nasib...","[{'backgroundimage': '', 'created': '2022-08-0...","<p>PORTALMEDIA.ID, MAKASSAR- Sungguh malang na..."
148,34,Politik & Peristiwa,2022-10-14T19:11:24.848+00:00,6f8dd91b66064ee1810184d95ee10afb,article,IRT Tewas dengan Mulut Tersumpal Celana Dalam ...,"BOYOLALI, INEWS.TTU.ID Seorang IRT Tewas deng...","[{'backgroundimage': '', 'created': '2022-10-1...","<p><br /><strong>BOYOLALI, INEWS.TTU.ID </stro..."
150,22,Gaya Hidup,2022-10-13T23:48:48.007+00:00,6769499dfd50413886b53d76085c818b,article,"Berwajah Ganteng dan Imut, 5 Aktor Korea Berak...","JAKARTA, INEWSTTU.ID- Drama Korea selalu menja...","[{'backgroundimage': '', 'created': '2021-09-1...","<div><strong>JAKARTA, INEWSTTU.ID</strong>- Dr..."
165,30,Olahraga,2022-10-11T09:40:43.327+00:00,e91CCA,article,"Jordan Henderson Diduga Berkata Rasis, FA Inve...",Arsenal berhasil menumbangkan Liverpool dengan...,"[{'backgroundimage': '', 'created': '2022-10-1...","<p><span style=""font-weight: 400;"">Arsenal ber..."
...,...,...,...,...,...,...,...,...,...
17957,36,Games,2022-11-05T08:27:12.332+00:00,b45b94df77c043f2b0dcbefe36af29ca,article,Trailer terbaru &#39;The Devil In Me&#39; mema...,Setelah mengumumkan cerita serta tanggal rili...,"[{'backgroundimage': '', 'created': '2022-08-2...",<p>Setelah mengumumkan cerita serta tanggal ri...
17958,36,Games,2022-11-05T08:03:04.032+00:00,42473356d30c4637adb1a396c39f8a4a,article,Google Play Games Versi Beta Sudah Bisa Diundu...,"Pada upacara Penghargaan Game tahun lalu, Goo...",,"<p>Pada upacara Penghargaan Game tahun lalu, G..."
17959,36,Games,2022-11-05T08:03:03.681+00:00,0ea2404d8e18433ab6d1b097152acce9,article,"Waduh, Perilisan Game Football Manager 2023 Un...",Sega dan Sports Interactive mengumumkan bahwa...,,<p>Sega dan Sports Interactive mengumumkan bah...
17960,36,Games,2022-11-05T07:23:12.662+00:00,4b78df539c4d4a699b9e04b7fcb47e8a,article,Kumpulan Kode Redeem FF Free Fire MAX Sabtu 5 ...,"Kode Redeem FF Free Fire MAX, Sabtu 5 November...","[{'backgroundimage': '', 'created': '2021-11-1...","<p><strong>JAKARTA </strong><a href=""https://w..."


In [22]:
# df = df.groupby('category_name', group_keys=False).apply(lambda x: x.sample(min(len(x), 500)).sample(frac=1))

In [23]:
df['category_name'].value_counts()

Politik & Peristiwa    1164
Keuangan dan Bisnis    1095
Hiburan                1092
Olahraga               1081
Wisata dan Kuliner     1074
Gaya Hidup             1072
Kebijakan Publik       1051
Ekonomi                1021
Otomotif               1011
Kesehatan              1009
Gawai                  1007
Games                  1007
Luxury                 1005
Musik dan Film         1002
Teknologi              1001
Budaya                  998
Kecantikan              993
Name: category_name, dtype: int64

##### 6. Data Cleansing

In [24]:
def clean_text(text):
    try:
        # remove html tag
        text = re.sub(r'<strong>.*?</strong>',' ',text)
        # remove html tag
        text = re.sub(r'<.*?>',' ',text)
        # remove first header jakarta, inews.id -
        text = re.sub(r'^.+? -',' ',text)
        # remove link url
        text = re.sub(r'https?:\/\/.*?[\s+]',' ',text)
        # remove symbol
        text = re.sub('[^a-zA-Z]',' ',text)
        text = ' '.join(text.split())
        text = text.lower()
        return text
    except: None

In [25]:
df['content_clean'] = df['content'].apply(lambda x: clean_text(x))
df.head(10)

Unnamed: 0,category_id,category_name,content_date,content_id,content_name,content_title,summary,tagging,content,content_clean
109,21,Ekonomi,2022-11-02T01:21:34.696+00:00,ee599421a28442beb7a62d49bc5d288d,article,"Kemnaker Turun Tangan Panggil Bos Waroeng SS, ...","JAKARTA, iNewsSragen.id - Gaji Karyawan Peneri...","[{'backgroundimage': '', 'created': '2022-01-0...","<p><strong>JAKARTA, iNewsSragen.id</strong> - ...",gaji karyawan penerima bsu dipotong rp ribu ke...
140,34,Politik & Peristiwa,2022-10-17T15:54:12.528+00:00,823a4dc6037241c9b479ec66382c4e5b,article,Miris Oknum ASN Takalar Aniaya hingga Rudapaks...,"PORTALMEDIA.ID, MAKASSAR- Sungguh malang nasib...","[{'backgroundimage': '', 'created': '2022-08-0...","<p>PORTALMEDIA.ID, MAKASSAR- Sungguh malang na...",portalmedia id makassar sungguh malang nasib s...
148,34,Politik & Peristiwa,2022-10-14T19:11:24.848+00:00,6f8dd91b66064ee1810184d95ee10afb,article,IRT Tewas dengan Mulut Tersumpal Celana Dalam ...,"BOYOLALI, INEWS.TTU.ID Seorang IRT Tewas deng...","[{'backgroundimage': '', 'created': '2022-10-1...","<p><br /><strong>BOYOLALI, INEWS.TTU.ID </stro...",seorang irt tewas dengan mulut tersumpal celan...
150,22,Gaya Hidup,2022-10-13T23:48:48.007+00:00,6769499dfd50413886b53d76085c818b,article,"Berwajah Ganteng dan Imut, 5 Aktor Korea Berak...","JAKARTA, INEWSTTU.ID- Drama Korea selalu menja...","[{'backgroundimage': '', 'created': '2021-09-1...","<div><strong>JAKARTA, INEWSTTU.ID</strong>- Dr...",drama korea selalu menjadi incaran kaum hawa j...
165,30,Olahraga,2022-10-11T09:40:43.327+00:00,e91CCA,article,"Jordan Henderson Diduga Berkata Rasis, FA Inve...",Arsenal berhasil menumbangkan Liverpool dengan...,"[{'backgroundimage': '', 'created': '2022-10-1...","<p><span style=""font-weight: 400;"">Arsenal ber...",arsenal berhasil menumbangkan liverpool dengan...
167,22,Gaya Hidup,2022-10-11T01:57:09.2+00:00,a33225ed92e34ccd99801366364f1ae8,article,5 Rekomendasi Bisnis Kerajinan Tangan Berawal ...,"BATU, iNews.id - Berawal dari sebuah hobi bisa...","[{'backgroundimage': '', 'created': '2022-10-1...","<p style=""text-align:justify""><strong>BATU, iN...",berawal dari sebuah hobi bisa berubah menjadi ...
169,31,Otomotif,2022-10-10T22:57:11.412+00:00,e9465d756cad452bbb7665a94cb2d3a0,article,"Tidak Hanya Soal Menilang, Satlantas Polres Ng...","NGAWI, iNewsNgawi.id -Masih dalam rangka gelar...","[{'backgroundimage': '', 'created': '2022-10-0...","<p><strong>NGAWI, iNewsNgawi.id -</strong>Masi...",mantingan km desa pengkol mantingan menurut ka...
173,23,Hiburan,2022-10-09T00:15:01.31+00:00,59441af18f0e409ebc4181c36f758e5b,article,Cara Menghilangkan Jin dan Sihir dari Tubuh Ma...,"JAKARTA, iNewsPandeglang.id Sering merasa tid...","[{'backgroundimage': '', 'created': '2022-09-0...","<p><strong>JAKARTA, iNewsPandeglang.id</strong...",sering merasa tidak enak tergangu dengan jin k...
175,22,Gaya Hidup,2022-10-08T07:55:23.467+00:00,074e0d5a2e4b45db86c46eb785804d23,article,"Terkenal Mental Baja, Weton Minggu Pon Punya D...","BONDOWOSO, iNewsBondowoso.id -Berdasarkan Prim...","[{'backgroundimage': '', 'created': '2022-10-0...","<p><strong>BONDOWOSO</strong>, <strong>iNewsBo...",berdasarkan primbon jawa setiap weton memiliki...
183,32,Gawai,2022-10-06T04:40:00.151+00:00,IO6lIh,article,"Cari Gadget Accessories di AladinMall Aja, Ad...","Kini, cari gadget accessories nggak perlu repo...",[{'backgroundimage': 'https://i.buddyku.id/ugc...,"<p>Kini, cari gadget accessories nggak perlu r...",kini cari gadget accessories nggak perlu repot...


### IV. Data Preprocessing

##### 1. Label Encoder

In [26]:
le = LabelEncoder()
y = le.fit_transform(df['category_name'])
y = to_categorical(y)


In [27]:
y

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

Save labelencoder.pkl

In [28]:
with open('labelencoder.pkl', 'wb') as f:
    pickle.dump(le, f, protocol=pickle.HIGHEST_PROTOCOL)


##### 2. Removing Stopwords

In [29]:
factory = StopWordRemoverFactory()
stopwords = factory.create_stop_word_remover()

In [30]:
df['content_clean'] = df['content_clean'].apply(lambda x : stopwords.remove(x))

In [31]:
df.head(5)

Unnamed: 0,category_id,category_name,content_date,content_id,content_name,content_title,summary,tagging,content,content_clean
109,21,Ekonomi,2022-11-02T01:21:34.696+00:00,ee599421a28442beb7a62d49bc5d288d,article,"Kemnaker Turun Tangan Panggil Bos Waroeng SS, ...","JAKARTA, iNewsSragen.id - Gaji Karyawan Peneri...","[{'backgroundimage': '', 'created': '2022-01-0...","<p><strong>JAKARTA, iNewsSragen.id</strong> - ...",gaji karyawan penerima bsu dipotong rp ribu ke...
140,34,Politik & Peristiwa,2022-10-17T15:54:12.528+00:00,823a4dc6037241c9b479ec66382c4e5b,article,Miris Oknum ASN Takalar Aniaya hingga Rudapaks...,"PORTALMEDIA.ID, MAKASSAR- Sungguh malang nasib...","[{'backgroundimage': '', 'created': '2022-08-0...","<p>PORTALMEDIA.ID, MAKASSAR- Sungguh malang na...",portalmedia id makassar sungguh malang nasib s...
148,34,Politik & Peristiwa,2022-10-14T19:11:24.848+00:00,6f8dd91b66064ee1810184d95ee10afb,article,IRT Tewas dengan Mulut Tersumpal Celana Dalam ...,"BOYOLALI, INEWS.TTU.ID Seorang IRT Tewas deng...","[{'backgroundimage': '', 'created': '2022-10-1...","<p><br /><strong>BOYOLALI, INEWS.TTU.ID </stro...",seorang irt tewas mulut tersumpal celana didug...
150,22,Gaya Hidup,2022-10-13T23:48:48.007+00:00,6769499dfd50413886b53d76085c818b,article,"Berwajah Ganteng dan Imut, 5 Aktor Korea Berak...","JAKARTA, INEWSTTU.ID- Drama Korea selalu menja...","[{'backgroundimage': '', 'created': '2021-09-1...","<div><strong>JAKARTA, INEWSTTU.ID</strong>- Dr...",drama korea selalu menjadi incaran kaum hawa j...
165,30,Olahraga,2022-10-11T09:40:43.327+00:00,e91CCA,article,"Jordan Henderson Diduga Berkata Rasis, FA Inve...",Arsenal berhasil menumbangkan Liverpool dengan...,"[{'backgroundimage': '', 'created': '2022-10-1...","<p><span style=""font-weight: 400;"">Arsenal ber...",arsenal berhasil menumbangkan liverpool skor s...


##### 3. Stemming

In [35]:
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [37]:
for i in df.index:
    print(i,end='\r')
    df['content_clean'][i] = stemmer.stem(df['content_clean'][i])

635

In [580]:
# df['content_clean'] = df['content_clean'].apply(lambda x : stemmer.stem(x))

#####  2. Tokenizer

In [581]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['content_clean'])

In [582]:
num_words = len(tokenizer.word_index)

In [583]:
len(tokenizer.word_index)

112988

In [584]:
token_seq = tokenizer.texts_to_sequences(df['content_clean'])

Save tokenizer.pkl

In [585]:
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)


##### 4. Padding

In [586]:
num_tokens = [len(tokens) for tokens in token_seq]
num_tokens = np.array(num_tokens)

In [587]:
np.mean(num_tokens)

251.44958434654754

In [588]:
np.max(num_tokens)

5222

In [589]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

596

In [590]:
np.sum(num_tokens < max_tokens) / len(num_tokens)

0.9650511790985693

In [591]:
padded_seq = pad_sequences(token_seq, maxlen=max_tokens,padding='post', truncating='post')

In [592]:
padded_seq.shape

(17683, 596)

##### 5. Splitting Dataset

In [593]:
X_train, X_val, y_train, y_val = train_test_split(padded_seq, y, 
                                                    test_size=0.2, 
                                                    random_state=12,stratify=y)

##### 6. Dataset Pipeline

In [594]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train,y_train)).batch(128)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val,y_val)).batch(128)

### V. Model Definition

In [595]:
model = Sequential()
model.add(Embedding(input_dim=num_words+1,
                    output_dim=64,
                    input_length=max_tokens,
                    name='layer_embedding'))
model.add(Bidirectional(LSTM(256)))
model.add(Dense(256, activation='relu')),
model.add(Dropout(rate=0.1)),
model.add(Dense(128, activation='relu')),
model.add(Dropout(rate=0.1)),
model.add(Dense(17, activation='sigmoid'))

In [596]:
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 layer_embedding (Embedding)  (None, 596, 64)          7231296   
                                                                 
 bidirectional_15 (Bidirecti  (None, 512)              657408    
 onal)                                                           
                                                                 
 dense_45 (Dense)            (None, 256)               131328    
                                                                 
 dropout_30 (Dropout)        (None, 256)               0         
                                                                 
 dense_46 (Dense)            (None, 128)               32896     
                                                                 
 dropout_31 (Dropout)        (None, 128)               0         
                                                     

In [597]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics= ['accuracy'])

### VI. Model Training

In [598]:
callbacks= [ 
    EarlyStopping(patience = 10), 
    ModelCheckpoint('model_best.hdf5', monitor='val_accuracy', save_best_only=True)
]

In [599]:
history = model.fit(train_dataset,epochs=100, validation_data=val_dataset,callbacks= callbacks)

Epoch 1/100
 10/111 [=>............................] - ETA: 13:16 - loss: 0.5020 - accuracy: 0.0516

KeyboardInterrupt: 

### VII. Model Evaluation

In [None]:
def plot_history(history_name):
    plt.figure(figsize=(16,8))
    plt.suptitle('Sequential API',fontsize=20)

    plt.subplot(1,2,1)
    plt.plot(history_name.history['loss'], label='Training Acc')
    plt.plot(history_name.history['val_loss'], label='Val Acc')
    plt.title('loss')
    plt.ylabel('Value') 
    plt.xlabel('Epoch')

    plt.subplot(1,2,2)
    plt.plot(history_name.history['accuracy'], label='Training Acc')
    plt.plot(history_name.history['val_accuracy'], label='Val Acc')
    plt.title('Accuracy')
    plt.ylabel('Value') 
    plt.xlabel('Epoch')


    plt.show()

plot_history(history)


##### Evaluate Train and Validation

In [None]:
train_eval = model.evaluate(train_dataset)

In [None]:
validation_eval = model.evaluate(val_dataset)

##### Classification Report

Train Set

In [None]:
train_pred = model.predict(train_dataset)

In [None]:
train_pred = np.argmax(train_pred,axis=1)
y = np.argmax(y_train,axis=1)

In [None]:
print(classification_report(le.inverse_transform(train_pred),le.inverse_transform(y)))

Validation Set

In [None]:
val_pred = model.predict(val_dataset)

In [None]:
val_pred = np.argmax(val_pred,axis=1)
y = np.argmax(y_val,axis=1)

In [None]:
print(classification_report(le.inverse_transform(val_pred),le.inverse_transform(y)))

In [None]:
url = f"https://buddyku.com/api/content/detail?content_id=741155bb3fd84e4485309730a295b7f5"
response = requests.get(url).content
content = json.loads(response)
data = content['values'][0]['content']

In [None]:
def inference(data):
    data = clean_text(data)
    data = tokenizer.texts_to_sequences([data])
    data = pad_sequences(data, maxlen=max_tokens)
    pred = model.predict(data)
    pred = np.argmax(pred,axis=1)
    
    return le.inverse_transform(pred)



In [None]:
inference(data)

# Conclusion / Overall Analysis

### Model Performance
Based on our model performance, the model is overfit to the training data so we need to evaluate the model.