In [2]:
import tensorflow as tf
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('/content/Bhaav-Dataset.csv')
X = data['Sentences'].values
y = data['Annotation'].values

# Split the dataset into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define a tokenizer and convert the text data to sequences
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences
maxlen = 100
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen, padding='post', truncating='post')
X_val = tf.keras.preprocessing.sequence.pad_sequences(X_val, maxlen=maxlen, padding='post', truncating='post')
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen, padding='post', truncating='post')

# Define the model
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(10000, 32, input_length=maxlen),
  tf.keras.layers.Conv1D(64, 5, activation='relu'),
  tf.keras.layers.GlobalMaxPooling1D(),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dense(5, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred.argmax(axis=1))
print(cm)

# Visualize misclassified examples
misclassified_idxs = np.where(y_pred.argmax(axis=1) != y_test)[0]
misclassified_texts = X_test[misclassified_idxs]
misclassified_labels = y_test[misclassified_idxs]
misclassified_preds = y_pred[misclassified_idxs]

for text, true_label, pred in zip(misclassified_texts, misclassified_labels, misclassified_preds):
    print('Text:', tokenizer.sequences_to_texts([text])[0])
    print('True label:', true_label)
    print('Predicted label:', np.argmax(pred))
    print()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Text: माता पिता की बातों की उपेक्षा कर के सवेरे ही घर से निकल जाता और अपनी ही तरह के आवारा लड़कों के साथ दिन भर खेलता रहता <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
True label: 0
Predicted label: 4

Text: गेंद उसे रास्ता <OOV> जा रही थी <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <O

In [None]:
num_misclassified = len(misclassified_idxs)

print('Total number of misclassified labels:', num_misclassified)

Total number of misclassified labels: 1978


In [4]:

# Print confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[  62   24   45   14  176]
 [  29  133   70   25  251]
 [  39   29  216   37  247]
 [  16   24   46   59  164]
 [ 110  189  315  142 1599]]


In [None]:
misclassified_per_label = {}
for i in range(cm.shape[0]):
    total = sum(cm[i,:])
    correct = cm[i,i]
    misclassified = total - correct
    misclassified_per_label[i] = misclassified

# Print the number of misclassified instances for each label
print('Misclassified instances per label:', misclassified_per_label)

Misclassified instances per label: {0: 272, 1: 354, 2: 344, 3: 252, 4: 756}


In [None]:
print('Test loss:', loss)
print('Test accuracy:', accuracy)

Test loss: 2.6618638038635254
Test accuracy: 0.5129278302192688


In [None]:
from sklearn.metrics import classification_report

# Make predictions on the test set
y_pred = model.predict(X_test)

# Print the classification report
report = classification_report(y_test, y_pred.argmax(axis=1))
print(report)

              precision    recall  f1-score   support

           0       0.23      0.15      0.18       321
           1       0.32      0.30      0.31       508
           2       0.29      0.39      0.34       568
           3       0.24      0.18      0.21       309
           4       0.67      0.68      0.68      2355

    accuracy                           0.51      4061
   macro avg       0.35      0.34      0.34      4061
weighted avg       0.51      0.51      0.51      4061



In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.callbacks import LearningRateScheduler
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the data
data = pd.read_csv("/content/Bhaav-Dataset.csv", encoding="utf-8")
texts = data["Sentences"].values
labels = data["Annotation"].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenize the texts
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Convert the labels to one-hot encoding
num_classes = len(np.unique(labels))
y_train_onehot = np.eye(num_classes)[y_train]
y_test_onehot = np.eye(num_classes)[y_test]

# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=100, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define the learning rate schedule
def lr_schedule(epoch):
    lr = 0.001
    if epoch > 5:
        lr *= 0.1
    return lr

# Define the callbacks
lr_scheduler = LearningRateScheduler(lr_schedule)

# Train the model with dropout regularization and learning rate scheduling
history = model.fit(X_train_pad, y_train_onehot, epochs=10, batch_size=32, validation_split=0.2, callbacks=[lr_scheduler])

# Evaluate the model on the test set
score = model.evaluate(X_test_pad, y_test_onehot)
print("Test score: %f" % score[1])

# Print the classification report
y_pred = model.predict(X_test_pad)
y_pred_class = np.argmax(y_pred, axis=1)
y_true_class = np.argmax(y_test_onehot, axis=1)
print(classification_report(y_true_class, y_pred_class))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.531150
              precision    recall  f1-score   support

           0       0.23      0.07      0.11       321
           1       0.33      0.23      0.27       508
           2       0.27      0.31      0.29       568
           3       0.23      0.15      0.18       309
           4       0.65      0.76      0.70      2355

    accuracy                           0.53      4061
   macro avg       0.34      0.30      0.31      4061
weighted avg       0.49      0.53      0.50      4061



In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preprocess text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['text'])
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Encode labels
y_train = pd.get_dummies(train_df['label']).values
y_test = pd.get_dummies(test_df['label']).values

# Define model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=maxlen))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(8, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
es = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
history = model.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32, callbacks=[es])

# Evaluate model
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_test = np.argmax(y_test, axis=1)
print(classification_report(y_test, y_pred))

# Fine-tune model
embedding_matrix = np.random.rand(5000, 32)
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = True
history = model.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32, callbacks=[es])

# Evaluate fine-tuned model
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_test = np.argmax(y_test, axis=1)
print(classification_report(y_test, y_pred))
