In [7]:
import pandas as pd 
import numpy as np 
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [2]:
# Load the labelled corpus. Rows with a missing text or label are dropped up
# front: a NaN entry cannot be tokenized and cannot be stratified on, so it
# would otherwise surface as an error several cells later.
text_df = pd.read_csv("./df_file.csv").dropna(subset=['Text', 'Label'])

# Documents as plain Python strings (astype(str) guards against cells that
# pandas parsed as non-string values) and their class labels.
X = text_df['Text'].astype(str).tolist()
y = text_df['Label'].tolist()

# Hold out 20% for testing; stratify so both splits keep the class balance,
# and fix random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [4]:
# Learn the vocabulary from the training split only, so that nothing about
# the test documents influences the word index.
max_words = 10000  # passed to the tokenizer as num_words (vocabulary cap)
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Replace each document, in both splits, by its sequence of integer word ids.
X_train, X_test = (
    tokenizer.texts_to_sequences(docs) for docs in (X_train, X_test)
)

In [8]:
# Bring every sequence to one fixed length so the splits become rectangular
# arrays the network can consume.
max_len = 100  # tune to the typical document length of the dataset
X_train, X_test = (
    pad_sequences(X_train, maxlen=max_len),
    pad_sequences(X_test, maxlen=max_len),
)

In [14]:
# Keras and scikit-learn both work on arrays, so turn the label lists of
# each split into NumPy arrays.
y_train, y_test = map(np.array, (y_train, y_test))

In [21]:
# Build the CNN text classifier:
# embedding -> 1D convolution -> global max pooling -> dense -> softmax.

# Seed the Python, NumPy and TensorFlow RNGs before any weights are created
# so that weight initialisation and batch shuffling are repeatable across
# Restart & Run All (42 matches the random_state used for the data split).
tf.keras.utils.set_random_seed(42)

embedding_dim = 50  # You can adjust this based on your dataset

# The tokenizer never emits ids >= max_words; word_index is 1-based (0 is the
# padding value), hence the + 1 when the real vocabulary is smaller than the cap.
vocab_size = min(max_words, len(tokenizer.word_index) + 1)

# Size the output layer from the labels instead of hard-coding it. With
# sparse_categorical_crossentropy the labels are integer class ids, so the
# layer needs max(id) + 1 units.
num_classes = int(np.max(y_train)) + 1

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Integer labels -> sparse categorical cross-entropy; track accuracy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 50)           500000    
                                                                 
 conv1d_1 (Conv1D)           (None, 96, 128)           32128     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 5)                 325       
                                                                 
Total params: 540,709
Trainable params: 540,709
Non-trainable params: 0
________________________________________________

In [22]:
# Train the model, holding out 20% of the training rows for validation
# (Keras takes the validation rows from the end of the arrays, before
# shuffling).
# Keep the returned History object: it carries the per-epoch loss/accuracy
# for later inspection, and binding it stops its memory-address repr from
# being emitted as the cell output.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=128,
    validation_split=0.2,
)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x198ceba1c60>

In [23]:
# Score the held-out test split; evaluate() yields the loss followed by the
# metric requested at compile time (accuracy).
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}')

# Class probabilities for every test document, reduced to the id of the most
# probable class.
predictions = model.predict(X_test)
predicted_labels = predictions.argmax(axis=1)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, predicted_labels)
print(report)

Test accuracy: 0.9033707976341248
              precision    recall  f1-score   support

           0       0.90      0.92      0.91        84
           1       0.88      0.99      0.93       102
           2       0.91      0.85      0.88        80
           3       0.96      0.88      0.92        77
           4       0.90      0.86      0.88       102

    accuracy                           0.90       445
   macro avg       0.91      0.90      0.90       445
weighted avg       0.90      0.90      0.90       445

