# Library

In [1]:
# base library
import pandas as pd
import numpy as np
import tensorflow as tf

# text preprocessing purpose
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# model training and evaluation purpose
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, GRU, Dropout, Reshape
from keras.optimizers import Adam

# load model
from tensorflow.keras.models import load_model

import warnings
warnings.filterwarnings('ignore')




# Load Model

In [2]:
model = load_model('model.keras')






# Create Dummy Data

In [3]:
headlines = [
    "Tech Giants Unveil New Collaboration for Sustainable Energy Solutions",
    "Stock Market Surges to Record Highs Amidst Economic Recovery Optimism",
    "Health Experts Warn of Potential Surge in Flu Cases Amidst COVID-19 Pandemic",
    "Artificial Intelligence Innovations Revolutionize Healthcare Diagnostics",
    "Renowned Author Releases Highly Anticipated Novel on Climate Change"
]
summaries = [
    "Leading technology companies announced a joint initiative aimed at developing innovative solutions to address global energy challenges, emphasizing sustainability and renewable sources.",
    "Investor optimism drives stock market indices to new highs as positive economic indicators fuel hopes of a swift recovery from recent downturns.",
    "Public health officials express concerns over the possibility of an increase in influenza infections alongside the ongoing COVID-19 crisis, urging enhanced preventive measures and vaccination campaigns.",
    "Breakthroughs in AI technology reshape the landscape of medical diagnostics, offering faster and more accurate assessments while paving the way for personalized treatment approaches.",
    "Acclaimed author launches a thought-provoking literary work that explores the complex interplay between humanity and the environment, shedding light on pressing issues of climate change and sustainability."
]

dummy_df = pd.DataFrame({
    "headlines": headlines,
    "summaries": summaries
})

### Concat Dataframe

In [4]:
dummy_new = pd.DataFrame()
dummy_new['text'] = dummy_df['headlines'] + ' ' + dummy_df['summaries']
dummy_new

Unnamed: 0,text
0,Tech Giants Unveil New Collaboration for Susta...
1,Stock Market Surges to Record Highs Amidst Eco...
2,Health Experts Warn of Potential Surge in Flu ...
3,Artificial Intelligence Innovations Revolution...
4,Renowned Author Releases Highly Anticipated No...


# Preprocessing

In [5]:
# initialize necessary packages
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# set stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# define function to preprocess text
def preprocess_text(text):
    # Case folding
    words = text.lower()
    # Mention removal
    words = re.sub("@[A-Za-z0-9_]+", " ", text)
    # Hashtags removal
    words = re.sub("#[A-Za-z0-9_]+", " ", text)
    # Newline removal (\n)
    words = re.sub(r"\\n", " ",text)
    # Whitespace removal
    words = text.strip()
    # URL removal
    words = re.sub(r"http\S+", " ", text)
    words = re.sub(r"www.\S+", " ", text)
    # Non-letter removal (such as emoticon, symbol (like μ, $, 兀), etc
    words = re.sub("[^A-Za-z\s']", " ", text)

    # tokenize text
    tokens = word_tokenize(text)
    # remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # lemmatize words
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # join the words back into a single string
    return ' '.join(tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CIEL_W\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CIEL_W\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CIEL_W\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
dummy_new['text'] = dummy_new['text'].apply(preprocess_text)

# Inference

WELLNESS:0, POLITICS:1, ENTERTAINMENT:2, TRAVEL:3, STYLE & BEAUTY:4, PARENTING:5, FOOD & DRINK:6, WORLD NEWS:7, BUSINESS:8, SPORTS:9

In [7]:
# Perform prediction
predictions = model.predict(dummy_new)

# Get the index of the class with the highest probability for each prediction
predicted_classes = np.argmax(predictions, axis=1)

categories_dict = {0:"WELLNESS", 1:"POLITICS", 2:"ENTERTAINMENT", 3:"TRAVEL", 4:"STYLE & BEAUTY", 5:"PARENTING", 6:"FOOD & DRINK", 7:"WORLD NEWS", 8:"BUSINESS", 9:"SPORTS"}

predicted_categories = [categories_dict[idx] for idx in predicted_classes]
print(predicted_categories)

['BUSINESS', 'BUSINESS', 'WELLNESS', 'WELLNESS', 'ENTERTAINMENT']


### Display Result

In [8]:
dummy_new['predicted_category'] = predicted_categories
dummy_new

Unnamed: 0,text,predicted_category
0,Tech Giants Unveil New Collaboration Sustainab...,BUSINESS
1,Stock Market Surges Record Highs Amidst Econom...,BUSINESS
2,Health Experts Warn Potential Surge Flu Cases ...,WELLNESS
3,Artificial Intelligence Innovations Revolution...,WELLNESS
4,Renowned Author Releases Highly Anticipated No...,ENTERTAINMENT


We can see our model can categorize unseen data just fine. news 0-2 and 4 are given suitable label while news 3 seems a bit off but let's check what it says

In [9]:
dummy_new['text'][3]

'Artificial Intelligence Innovations Revolutionize Healthcare Diagnostics Breakthroughs AI technology reshape landscape medical diagnostics , offering faster accurate assessment paving way personalized treatment approach .'

wow turns out text 3 also talks about health (wellness) instead of just AI, so the predictions isn't actually that bad