In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from textblob import TextBlob




In [2]:
# b = TextBlob("I havv goood speling!")
# print(b.correct())

In [3]:
# from textblob import TextBlob
# from textblob.sentiments import NaiveBayesAnalyzer

# texts = "I like this moviess on bgmi" 
# blob = TextBlob(texts, analyzer=NaiveBayesAnalyzer())

# if list(blob.sentiment)[0] =='neg':
#     print("Negative")
# else:
#     print("Positive")


In [4]:
df = pd.read_csv(r"C:\Users\Rohan\Pictures\rohan\NLP\News.csv")
df.shape

(101527, 4)

In [5]:
df.head()

Unnamed: 0,ID,News Category,Title,Summary
0,N88753,lifestyle,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N45436,news,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...
2,N23144,health,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
3,N86255,health,Dispose of unwanted prescription drugs during ...,
4,N93187,news,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...


In [6]:
df.drop(columns=['ID','Summary'],inplace=True)

In [7]:
df.rename(columns={'Title': 'Summary'},inplace=True)

In [8]:
df.rename(columns={'News Category': 'Category'},inplace=True)

In [9]:
df.head(1)

Unnamed: 0,Category,Summary
0,lifestyle,"The Brands Queen Elizabeth, Prince Charles, an..."


In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [11]:
categories_to_select = ['finance', 'travel', 'video', 'lifestyle', 'foodanddrink', 'weather', 'autos', 'health']

df= df[df['Category'].isin(categories_to_select)]

df.shape

(34683, 2)

In [12]:
texts = df['Summary'].astype(str)
labels = df['Category'].astype(str)

In [13]:
df.isnull().sum()

Category    0
Summary     0
dtype: int64

In [14]:
print(df.duplicated().sum())
df.drop_duplicates(inplace=True)

699


In [15]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Tokenize
    words = word_tokenize(text)
    
    # Convert to lower case
    words = [word.lower() for word in words]
    
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words]
    
    return ' '.join(words)

# Apply text cleaning
texts_cleaned = texts.apply(clean_text)


In [16]:
# Initialize Label Encoder
label_encoder = LabelEncoder()

# Fit and transform labels
labels_encoded = label_encoder.fit_transform(labels)

# Initialize Tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on the cleaned texts
tokenizer.fit_on_texts(texts_cleaned)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts_cleaned)

# Pad sequences to have the same length
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')


In [17]:
max_len

31

In [32]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels_encoded, test_size=0.2, random_state=42)

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 256
rnn_units = 128

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    SimpleRNN(rnn_units, return_sequences=False),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 31, 256)           5482752   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               49280     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               16512     
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 8)                 1032      
                                                                 
Total params: 5549576 (21.17 MB)
Trainable params: 554

In [27]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    epochs=20,
    validation_data=(X_test, y_test),
    batch_size=32,
    callbacks=[early_stopping]
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [28]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

Test Accuracy: 0.57


In [29]:
def predict_category(text):
    # Clean the input text
    cleaned_text = clean_text(text)
    
    # Convert to sequence and pad
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    
    # Predict category
    prediction = model.predict(padded_sequence)
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])[0]
    
    return predicted_label




In [30]:
model.save('text_classification_rnn.h5')

import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)


  saving_api.save_model(


In [31]:
user_input = input("Enter text for prediction: ")
predicted_category = predict_category(user_input)
print(f"Predicted Category: {predicted_category}")

Predicted Category: weather


In [36]:
df['Category'].iloc[2686]

'lifestyle'

In [37]:
df['Summary'].iloc[2686]

'Can You Spot the One Image That Is Not Like the Others?'

In [26]:
print("Encoded Labels:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{i}: {label}")

Encoded Labels:
0: autos
1: finance
2: foodanddrink
3: health
4: lifestyle
5: travel
6: video
7: weather
