# **News Category Classification using LSTM**
**The news categories in this dataset are business, science and technology, entertainment, and health.**

**Different news articles that refer to the same news item (e.g., several articles about recently released employment statistics) are also categorized together.**

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import re
from tensorflow import keras
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.models import Sequential
from keras.utils.np_utils import to_categorical

In [None]:
#showing every column whenever a dataframe is displayed
pd.set_option('display.max_columns', None)

#reading the news aggregator csv and previewing its first rows
dir = pd.read_csv('../input/news-aggregator-dataset/uci-news-aggregator.csv')
dir.head()

**WE HAVE ONLY TWO FEATURES OF USE**

1. **TITLE**
2. **CATEGORY**

In [None]:
#keeping only the two columns used by the model: the headline and its category
relevant_columns = ['TITLE','CATEGORY']
ds = dir[relevant_columns]
ds.head()

**THE ROWS ARE GROUPED BY CATEGORY (ALL B's TOGETHER AND SO ON), SO WE SHUFFLE THEM BEFORE GOING FURTHER**

In [None]:
#shuffling rows with the help of sample, here (frac = 1) means return all rows
#random_state fixes the shuffle so that re-running the notebook gives the same row order
ds = ds.sample(frac=1,random_state=42).reset_index(drop=True)
ds.head()

**DATASET IS NOW SHUFFLED**

In [None]:
#counting the missing values in each column
ds.isna().sum()


**NO NULL VALUES FOUND**

In [None]:
#bar chart with the number of titles in each category
sns.countplot(data = ds,x = 'CATEGORY')

**THERE ARE FOUR TYPES OF CATEGORIES-**
1. **b : business (~115000)**
2. **t : science and technology (~110000)**
3. **e : entertainment (~150000)**
4. **m : health (~40000)**


**NOW MOVING ONTO CLEANING AND PREPROCESSING OF THE TEXT DATA**

In [None]:
#cleaning and preprocessing the text

#the stopword set, the stemmer and the pattern are the same for every title, so they are
#built once here; rebuilding the stopword set for every single word is very slow
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
non_letters = re.compile('[^a-zA-Z]')

cleaned = []
for title in ds['TITLE']:

    #replacing every character other than (a-z) and (A-Z) with a space and lower-casing
    msg = non_letters.sub(' ',title).lower()

    #tokenizing on whitespace, dropping the stopwords and stemming the remaining words
    msg = [ps.stem(word) for word in msg.split() if word not in stop_words]

    #joining the stems back into one string per title
    cleaned.append(' '.join(msg))

In [None]:
#first five cleaned titles: lower-cased, letters only, stopwords removed, words stemmed
cleaned[0:5]

In [None]:
#vocabulary size handed to one_hot and to the embedding layer
dict_size = 5000

#despite its name, one_hot turns every word of a title into an integer index by hashing,
#so each title becomes a list of integers (different words may share an index)
one_hot_mat = [one_hot(title,dict_size) for title in cleaned]

#the embedding layer needs rows of equal length, so every row is padded with 0 at the
#start (padding = 'pre') up to a fixed length of 150 (maxlen), not the longest row
embedded_layer = pad_sequences(one_hot_mat,maxlen = 150,padding = 'pre')
embedded_layer

In [None]:
#dependent feature: the category letter of every title as a numpy array
y = np.array(ds['CATEGORY'])

#independent feature: the padded sequences of word indices
x = embedded_layer

In [None]:
#LabelEncoder turns the category letters into integer class indices (kept in le.classes_),
#to_categorical then expands every index into a one-hot row with 4 columns
le = LabelEncoder()
y = to_categorical(le.fit_transform(y),num_classes = 4)

In [None]:
#first ten one-hot encoded labels
y[0:10]

In [None]:
#holding out 20% of the rows as the test set, random_state makes the split repeatable
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=42,test_size=0.2)

#printing the shapes of the train set first and of the test set second
for features,labels in ((x_train,y_train),(x_test,y_test)):
    print(features.shape,labels.shape)

In [None]:
#LSTM classifier: embedding -> dropout -> LSTM -> softmax
model = Sequential([
    #every word index is mapped to a vector of 50 values
    Embedding(dict_size,50,input_length = len(x[0])),
    Dropout(0.2),
    #one LSTM layer with 100 units
    LSTM(100),
    #output layer with one probability per category
    Dense(4,activation="softmax"),
])

#categorical cross-entropy matches the one-hot encoded labels
model.compile(optimizer='adam',loss="categorical_crossentropy",metrics=["accuracy"])

#summary of model
model.summary()

#training for 10 epochs, x_test/y_test are passed as the validation data
rnn = model.fit(x_train,y_train,epochs = 10,batch_size = 256,validation_data = (x_test,y_test))

In [None]:
#loss and accuracy of the trained model on the test set
model.evaluate(x = x_test,y = y_test)

In [None]:
#class probabilities for every row of the test set
pred = model.predict(x_test)

#predicted class = position of the highest probability in each row of pred
preds = [probs.argmax() for probs in pred]

#actual class = position of the highest value in each one-hot row of y_test
actual = [row.argmax() for row in y_test]


In [None]:
#classification report
from sklearn import metrics

#LabelEncoder numbers the classes in sorted label order, which is not the order b,t,e,m;
#taking the names from le.classes_ keeps every row of the report next to its real category
report = metrics.classification_report(actual, preds, target_names = list(le.classes_))
print(report)

In [None]:
#checking category of a text
txt = ["An apple a day keeps doctor away."]

#note: this cell reuses the names cleaned, one_hot_mat, embedded_layer and pred of the training cells

#cleaning and preprocessing the text in the same way as the training titles
#the stopword set and the stemmer are built once instead of once per word
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
cleaned = []
for sentence in txt:
    msg = re.sub('[^a-zA-Z]',' ',sentence).lower().split()
    msg = ' '.join(ps.stem(word) for word in msg if word not in stop_words)
    cleaned.append(msg)

#one hot encoding and padding to the same length (150) as the training data
one_hot_mat = [one_hot(words,dict_size) for words in cleaned]
embedded_layer = pad_sequences(one_hot_mat,padding = 'pre',maxlen = 150)

#prediction
pred = model.predict(embedded_layer)

#LabelEncoder numbers the classes in sorted label order (le.classes_), not in the order b,t,e,m,
#so the display names are arranged by looking up every encoded label
cat_names = {'b':'Business','t':'Science','e':'Entertainment','m':'Health'}
cat = [cat_names[label] for label in le.classes_]
print(pred, cat[np.argmax(pred)])