In [1]:
import pandas as pd

In [2]:
# The class ids are numbered 1-4 where 1 represents World, 2 represents Sports, 
# 3 represents Business and 4 represents Sci/Tech.
# Dataset Source : https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset/
df = pd.read_csv("test.csv")
df.head(5)

Unnamed: 0,Class Index,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [3]:
df.shape

(7600, 3)

In [4]:
df.dropna(inplace=True)

In [5]:
df.shape

(7600, 3)

In [6]:
df.isnull().sum()

Class Index    0
Title          0
Description    0
dtype: int64

In [7]:
df["final_text"] = df["Title"] + " " + df["Description"]
df.head()

Unnamed: 0,Class Index,Title,Description,final_text
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...,Fears for T N pension after talks Unions repre...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o...",The Race is On: Second Private Team Sets Launc...
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...,Ky. Company Wins Grant to Study Peptides (AP) ...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...,Prediction Unit Helps Forecast Wildfires (AP) ...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...,Calif. Aims to Limit Farm-Related Smog (AP) AP...


In [8]:
new_df = df.drop(['Title', 'Description'], axis=1)

In [9]:
new_df

Unnamed: 0,Class Index,final_text
0,3,Fears for T N pension after talks Unions repre...
1,4,The Race is On: Second Private Team Sets Launc...
2,4,Ky. Company Wins Grant to Study Peptides (AP) ...
3,4,Prediction Unit Helps Forecast Wildfires (AP) ...
4,4,Calif. Aims to Limit Farm-Related Smog (AP) AP...
...,...,...
7595,1,Around the world Ukrainian presidential candid...
7596,2,Void is filled with Clement With the supply of...
7597,2,Martinez leaves bitter Like Roger Clemens did ...
7598,3,5 of arthritis patients in Singapore take Bext...


# Preprocessing

In [10]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

nltk.download('punkt')
nltk.download('stopwords')


def preprocess_text(text):
     # Lowercasing
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove multiple spaces
    text = re.sub(r'\s+',' ',text)
    
    # Remove punctuation and numbers
    text = re.sub('[^a-zA-Z]',' ',text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing Punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    
    # Removing Stop Words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(stemmed_tokens)
    
    return preprocessed_text


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
new_df["preprocessed_text"] = new_df["final_text"].apply(preprocess_text)

In [12]:
new_df

Unnamed: 0,Class Index,final_text,preprocessed_text
0,3,Fears for T N pension after talks Unions repre...,fear n pension talk union repres worker turner...
1,4,The Race is On: Second Private Team Sets Launc...,race second privat team set launch date human ...
2,4,Ky. Company Wins Grant to Study Peptides (AP) ...,ky compani win grant studi peptid ap ap compan...
3,4,Prediction Unit Helps Forecast Wildfires (AP) ...,predict unit help forecast wildfir ap ap bare ...
4,4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,calif aim limit farm relat smog ap ap southern...
...,...,...,...
7595,1,Around the world Ukrainian presidential candid...,around world ukrainian presidenti candid vikto...
7596,2,Void is filled with Clement With the supply of...,void fill clement suppli attract pitch option ...
7597,2,Martinez leaves bitter Like Roger Clemens did ...,martinez leav bitter like roger clemen almost ...
7598,3,5 of arthritis patients in Singapore take Bext...,arthriti patient singapor take bextra celebrex...


In [15]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load your text data as a DataFrame, where 'text' is the column containing text and 'label' is the target class.
# Example:
# data = pd.read_csv('your_data.csv')

# Tokenize and train a Word2Vec model
sentences = new_df['preprocessed_text'].apply(lambda x: x.split())
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=0)

# Preprocess text data
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(new_df['preprocessed_text'])
X = tokenizer.texts_to_sequences(new_df['preprocessed_text'])
X = pad_sequences(X, maxlen=max_sequence_length)
X

array([[    0,     0,     0, ...,   248,   139,  4465],
       [    0,     0,     0, ...,   938,   261,   780],
       [    0,     0,     0, ...,   558,   781,  5795],
       ...,
       [    0,     0,     0, ...,   207,  2267,   104],
       [    0,     0,     0, ...,  5593,   600, 14564],
       [    0,     0,     0, ...,  3728, 14565,   141]])

In [16]:
X.shape

(7600, 100)

In [17]:
# Encode labels
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(new_df['Class Index'])
Y

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([2, 3, 3, ..., 1, 2, 2], dtype=int64)

In [21]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(6080, 100) (6080,)
(1520, 100) (1520,)


In [28]:
print(len(label_encoder.classes_))

4


In [29]:
# Build a simple neural network model
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(keras.layers.LSTM(100))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(len(label_encoder.classes_), activation='softmax'))

In [30]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          1456600   
                                                                 
 lstm_2 (LSTM)               (None, 100)               80400     
                                                                 
 dense_4 (Dense)             (None, 64)                6464      
                                                                 
 dense_5 (Dense)             (None, 4)                 260       
                                                                 
Total params: 1543724 (5.89 MB)
Trainable params: 1543724 (5.89 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [33]:
# Train the model
from keras.callbacks import EarlyStopping
history = model.fit(X_train, Y_train, batch_size=128, epochs=10, verbose=1, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
# model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [34]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, Y_test)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.8657894730567932


In [35]:
text_data = "Scientists Discover Breakthrough Method for Efficiently Storing Renewable Energy Tech Giant Unveils Cutting-Edge Quantum Computing Solution for Complex Problems"

In [45]:
sequences = tokenizer.texts_to_sequences(text_data)
X = pad_sequences(sequences, maxlen=model.input_shape[1])

# Make predictions
predictions = model.predict(X)

# Decode label-encoded predictions
decoded_predictions = label_encoder.inverse_transform(predictions.argmax(axis=1))
type(decoded_predictions)
print(decoded_predictions)
mode = np.argmax(np.bincount(decoded_predictions))
if mode == 1:
    print("World")
elif mode == 2:
    print("Sports")
elif mode == 3:
    print("Business")
else:
    print("Sci/Tech")

[4 1 4 4 1 4 4 4 4 4 4 4 4 4 1 4 1 4 4 4 4 4 4 4 4 4 4 4 4 1 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 1 4 4 1 4 4 4 4 4 4 4 4 4 1 4 4 4 4 1 4 1 4 4 4 4 4
 4 1 4 4 4 4 4 4 4 1 4 4 4 4 4 1 4 4 1 1 1 4 4 4 4 4 1 1 4 4 4 1 4 4 4 4 4
 4 4 4 1 4 1 4 1 4 4 1 4 4 1 1 4 4 1 4 4 4 4 4 1 4 4 4 1 4 4 4 4 4 1 4 4 1
 4 4 4 4 1 4 4 4 4 4 4 4]
Sci/Tech
