In [2]:
import pandas as pd 

# CNNmodel code 


In [12]:
df = pd.read_csv('/kaggle/input/political-headline/Political_Headline.csv') 


In [13]:
df = df[['headline','category']]

In [14]:
df.columns 

Index(['headline', 'category'], dtype='object')

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical




In [16]:
# Convert to string and handle missing values
df['headline'] = df['headline'].astype(str)  # Convert all values in the 'headline' column to string
df['headline'] = df['headline'].fillna('') 

In [17]:
# Preprocessing
X = df['headline'].values
y = df['category'].values

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded, num_classes=3)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)



In [18]:
# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)



In [19]:
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Build CNN model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))





In [20]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




In [21]:
# Train the model
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))



Epoch 1/10
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 25ms/step - accuracy: 0.7182 - loss: 0.6564 - val_accuracy: 0.8723 - val_loss: 0.3473
Epoch 2/10
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 25ms/step - accuracy: 0.9110 - loss: 0.2564 - val_accuracy: 0.8703 - val_loss: 0.3624
Epoch 3/10
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 25ms/step - accuracy: 0.9397 - loss: 0.1823 - val_accuracy: 0.8671 - val_loss: 0.3936
Epoch 4/10
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 25ms/step - accuracy: 0.9545 - loss: 0.1291 - val_accuracy: 0.8638 - val_loss: 0.4702
Epoch 5/10
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 25ms/step - accuracy: 0.9694 - loss: 0.0921 - val_accuracy: 0.8617 - val_loss: 0.5678
Epoch 6/10
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 26ms/step - accuracy: 0.9752 - loss: 0.0710 - val_accuracy: 0.8607 - val_loss: 0.6480
Epoch 7/10
[1m8

In [22]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8482 - loss: 0.9172
Test Accuracy: 0.8542


In [31]:
import numpy as np

# Function to predict the category of a random Bangla sentence
def predict_category(sentence, model, tokenizer, label_encoder, max_len=100):
    # Preprocess the input sentence
    sentence_seq = tokenizer.texts_to_sequences([sentence])  # Convert sentence to sequence
    sentence_pad = pad_sequences(sentence_seq, maxlen=max_len)  # Pad the sequence

    # Make prediction
    prediction = model.predict(sentence_pad)

    # Get the index of the category with the highest probability
    predicted_class_index = np.argmax(prediction, axis=1)

    # Convert the index back to the original category label
    predicted_category = label_encoder.inverse_transform(predicted_class_index)

    return predicted_category[0]

# Example usage:
random_sentence = "পুরুষদের আত্মহত্যা বাড়ার জন্য নারীরা দায়ী: দক্ষিণ কোরীয় রাজনীতিবিদ।"  # A random Bangla sentence
predicted_category = predict_category(random_sentence, model, tokenizer, label_encoder)
print(f"The predicted category for the sentence is: {predicted_category}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
The predicted category for the sentence is: international_politics


# CNN+RNN hybrid model 

In [32]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, GRU, Dense, Dropout, Flatten
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical





In [33]:
# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000)  # Limit vocabulary size
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=100)  # Padding the sequences to the same length


In [34]:

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)  # Convert to one-hot encoding

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)



In [35]:
# CNN + RNN (LSTM or GRU) Model
model = Sequential()

# Embedding Layer
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))

# Convolutional Layer
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# RNN Layer (LSTM or GRU)
model.add(LSTM(128))  # Use GRU(128) for GRU model instead of LSTM

# Fully connected Layer
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))

# Output Layer
model.add(Dense(y_categorical.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
model.summary()





In [36]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))



Epoch 1/10
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 118ms/step - accuracy: 0.7045 - loss: 0.6692 - val_accuracy: 0.8680 - val_loss: 0.3587
Epoch 2/10
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 116ms/step - accuracy: 0.9088 - loss: 0.2672 - val_accuracy: 0.8742 - val_loss: 0.3474
Epoch 3/10
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 118ms/step - accuracy: 0.9364 - loss: 0.1893 - val_accuracy: 0.8701 - val_loss: 0.3698
Epoch 4/10
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 117ms/step - accuracy: 0.9580 - loss: 0.1323 - val_accuracy: 0.8609 - val_loss: 0.5556
Epoch 5/10
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 118ms/step - accuracy: 0.9725 - loss: 0.0854 - val_accuracy: 0.8563 - val_loss: 0.5904
Epoch 6/10
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 116ms/step - accuracy: 0.9816 - loss: 0.0552 - val_accuracy: 0.8620 - val_loss: 0.6940
Epoch 7/10

In [37]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_accuracy*100:.2f}%')



[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.8466 - loss: 0.9513
Test accuracy: 85.38%


In [38]:
# Predicting on a random input sentence
def predict_category(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=100)
    prediction = model.predict(padded_sequence)
    category = label_encoder.inverse_transform([np.argmax(prediction)])
    return category[0]

# Example prediction
random_bangla_sentence = "বাংলাদেশের রাজনীতি অনেক জটিল।"
predicted_category = predict_category(random_bangla_sentence)
print(f'Predicted category: {predicted_category}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step
Predicted category: bd_politics
