# Meme Persuation Technique Classifier

# Model Training Method 2: LSTM

In [30]:
import numpy as np
import pandas as pd
import json
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

# Your MultilabelBalancedRandomSampler class definition goes here

# Function to load data from JSON file
def load_data(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Load train, validation, and dev data
train_data = load_data("/kaggle/input/dataset/train.json")
val_data = load_data("/kaggle/input/dataset/validation.json")
dev_data = load_data("/kaggle/input/dataset/dev_unlabeled.json")

# Create DataFrames
train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)
dev_df = pd.DataFrame(dev_data)

# Process labels using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
binary_labels_train = mlb.fit_transform(train_df["labels"])
binary_labels_val = mlb.transform(val_df["labels"])

# Preprocess text data
max_words = 10000
max_len = 100



In [31]:
import random
import numpy as np

from torch.utils.data.sampler import Sampler


class MultilabelBalancedRandomSampler(Sampler):
    """
    MultilabelBalancedRandomSampler: This sampler operates on a multilabel dataset
    comprising n_samples and n_classes. It selects samples from the data with equal
    probability per class, thereby simultaneously oversampling minority classes and
    undersampling majority classes. It is important to note that while using this 
    sampler does not ensure a uniform distribution of classes in the output samples, 
    it does guarantee that each class will have at least batch_size / n_classes samples
    as batch_size approaches infinity.
    """

    def __init__(self, labels, indices=None, class_choice="least_sampled"):
        self.labels = labels
        self.indices = indices
        if self.indices is None:
            self.indices = range(len(labels))

        self.num_classes = self.labels.shape[1]

        # List of lists of example indices per class
        self.class_indices = []
        for class_ in range(self.num_classes):
            lst = np.where(self.labels[:, class_] == 1)[0]
            lst = lst[np.isin(lst, self.indices)]
            self.class_indices.append(lst)

        self.counts = [0] * self.num_classes

        assert class_choice in ["least_sampled", "random", "cycle"]
        self.class_choice = class_choice
        self.current_class = 0

    def __iter__(self):
        self.count = 0
        return self

    def __next__(self):
        if self.count >= len(self.indices):
            raise StopIteration
        self.count += 1
        return self.sample()

    def sample(self):
        class_ = self.get_class()
        class_indices = self.class_indices[class_]
        chosen_index = np.random.choice(class_indices)
        if self.class_choice == "least_sampled":
            for class_, indicator in enumerate(self.labels[chosen_index]):
                if indicator == 1:
                    self.counts[class_] += 1
        return chosen_index

    def get_class(self):
        if self.class_choice == "random":
            class_ = random.randint(0, self.labels.shape[1] - 1)
        elif self.class_choice == "cycle":
            class_ = self.current_class
            self.current_class = (self.current_class + 1) % self.labels.shape[1]
        elif self.class_choice == "least_sampled":
            min_count = self.counts[0]
            min_classes = [0]
            for class_ in range(1, self.num_classes):
                if self.counts[class_] < min_count:
                    min_count = self.counts[class_]
                    min_classes = [class_]
                if self.counts[class_] == min_count:
                    min_classes.append(class_)
            class_ = np.random.choice(min_classes)
        return class_

    def __len__(self):
        return len(self.indices)

In [32]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df["text"])
X_train = tokenizer.texts_to_sequences(train_df["text"])
X_val = tokenizer.texts_to_sequences(val_df["text"])

X_train = pad_sequences(X_train, maxlen=max_len)
X_val = pad_sequences(X_val, maxlen=max_len)

# Create a tf.data.Dataset using MultilabelBalancedRandomSampler
def generator():
    for sample in zip(X_train, binary_labels_train):
        yield sample

batch_size = 50
labels_train = binary_labels_train
sampler = MultilabelBalancedRandomSampler(labels_train)

train_dataset = tf.data.Dataset.from_generator(generator, output_signature=(
    tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
    tf.TensorSpec(shape=(len(mlb.classes_),), dtype=tf.float32)
))

train_dataset = train_dataset.shuffle(len(X_train)).batch(batch_size, drop_remainder=True)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)


from tensorflow.keras.layers import Dropout

# Build LSTM model with modifications
embedding_dim = 50

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(Bidirectional(LSTM(128, return_sequences=True)))  # Increase the number of LSTM units
model.add(Dropout(0.5))  # Add dropout for regularization
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))  # Add dropout for regularization
model.add(Dense(len(mlb.classes_), activation='sigmoid'))

# Compile the model
optimizer = Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using the balanced dataset
epochs = 20

history = model.fit(train_dataset, validation_data=(X_val, binary_labels_val), epochs=epochs, batch_size=batch_size)
# Evaluate the model on the validation set
val_preds = model.predict(X_val)
val_preds_binary = (val_preds > 0.5).astype(int)

val_accuracy = accuracy_score(binary_labels_val, val_preds_binary)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [35]:
print("Classification Report on Validation Set:")
print(classification_report(binary_labels_val, val_preds_binary, target_names=mlb.classes_))

Classification Report on Validation Set:
                                                     precision    recall  f1-score   support

                                Appeal to authority       0.50      0.29      0.36        63
                           Appeal to fear/prejudice       0.00      0.00      0.00        27
                                          Bandwagon       0.00      0.00      0.00         7
               Black-and-white Fallacy/Dictatorship       0.12      0.04      0.06        53
                          Causal Oversimplification       0.00      0.00      0.00        21
                                              Doubt       0.10      0.04      0.06        24
                          Exaggeration/Minimisation       0.00      0.00      0.00        27
                                        Flag-waving       0.56      0.12      0.20        42
                   Glittering generalities (Virtue)       0.33      0.11      0.17        36
                            

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
