In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Embedding
from tensorflow.keras.callbacks import EarlyStopping
import os
import zipfile

In [8]:
# Load the dataset
file_path = "/content/AMMUSED (1).csv"
dataset = pd.read_csv(file_path)



In [9]:
# Focus on relevant columns
comments = dataset['comment'].astype(str)
labels = dataset['label_y']

# Preprocessing parameters
max_vocab_size = 20000
max_sequence_length = 100

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Embedding
from tensorflow.keras.callbacks import EarlyStopping
# Import Tokenizer from tensorflow.keras.preprocessing.text and pad_sequences from tensorflow.keras.utils
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences # Import pad_sequences from tensorflow.keras.utils
import os
import zipfile

In [10]:
# Tokenize and pad the text data
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, encoded_labels, test_size=0.2, random_state=42
)


In [11]:
# Build the CNN+LSTM model
embedding_dim = 100

model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),

    # Convolutional layer to extract local features
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),

    # LSTM layer to capture sequential dependencies
    LSTM(units=128, return_sequences=False),
    Dropout(0.3),

    # Fully connected layers
    Dense(units=64, activation='relu'),
    Dropout(0.3),
    Dense(units=len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])




In [12]:
# Train the model
epochs = 10
batch_size = 64
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1
)

Epoch 1/10
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 132ms/step - accuracy: 0.4843 - loss: 1.1743 - val_accuracy: 0.5809 - val_loss: 1.0157
Epoch 2/10
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 127ms/step - accuracy: 0.6552 - loss: 0.8792 - val_accuracy: 0.6219 - val_loss: 0.9250
Epoch 3/10
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 129ms/step - accuracy: 0.7754 - loss: 0.6134 - val_accuracy: 0.6007 - val_loss: 1.1044
Epoch 4/10
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 139ms/step - accuracy: 0.8543 - loss: 0.4088 - val_accuracy: 0.5971 - val_loss: 1.1542
Epoch 5/10
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 129ms/step - accuracy: 0.9001 - loss: 0.2880 - val_accuracy: 0.5832 - val_loss: 1.4907
Epoch 6/10
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 128ms/step - accuracy: 0.9179 - loss: 0.2312 - val_accuracy: 0.5750 - val_loss: 1.6035
Epoch 7/10

In [13]:
# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.5752 - loss: 2.4322
Validation Loss: 2.4346
Validation Accuracy: 0.5760


In [15]:
# Save the trained model for later use
model_dir = "stance_detection_cnn_lstm"
# Add the .keras extension to the file path
model.save(model_dir + ".keras")

# Save the model in a zip file
zip_file_name = f"{model_dir}.zip"
with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(model_dir + ".keras"): # Update the path here too
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, model_dir + ".keras") # Update the path here too
            zipf.write(file_path, arcname)

print(f"Model successfully saved and zipped as {zip_file_name}")

Model successfully saved and zipped as stance_detection_cnn_lstm.zip


In [16]:
# Additional epochs with the same model for fine-tuning
fine_tuning_epochs = 5
fine_tune_history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=fine_tuning_epochs,
    verbose=1
)



Epoch 1/5
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 124ms/step - accuracy: 0.9481 - loss: 0.1245 - val_accuracy: 0.5664 - val_loss: 2.5803
Epoch 2/5
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 127ms/step - accuracy: 0.9495 - loss: 0.1215 - val_accuracy: 0.5809 - val_loss: 2.6077
Epoch 3/5
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 125ms/step - accuracy: 0.9482 - loss: 0.1140 - val_accuracy: 0.5713 - val_loss: 2.6909
Epoch 4/5
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 132ms/step - accuracy: 0.9465 - loss: 0.1201 - val_accuracy: 0.5733 - val_loss: 2.9276
Epoch 5/5
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 128ms/step - accuracy: 0.9508 - loss: 0.1108 - val_accuracy: 0.5752 - val_loss: 3.3944


ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=stance_detection_cnn_lstm_finetuned.

In [17]:
# Save the fine-tuned model
fine_tuned_model_dir = "stance_detection_cnn_lstm_finetuned"
model.save(fine_tuned_model_dir + ".keras")  # Add the .keras extension here

fine_tuned_zip_file = f"{fine_tuned_model_dir}.zip"
with zipfile.ZipFile(fine_tuned_zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(fine_tuned_model_dir):
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, fine_tuned_model_dir)
            zipf.write(file_path, arcname)

print(f"Fine-tuned model successfully saved and zipped as {fine_tuned_zip_file}")

Fine-tuned model successfully saved and zipped as stance_detection_cnn_lstm_finetuned.zip


In [18]:
# Make predictions on new data
new_comments = ["I completely agree!", "I don't think this is correct.", "Can you clarify more?"]
new_sequences = tokenizer.texts_to_sequences(new_comments)
new_padded = pad_sequences(new_sequences, maxlen=max_sequence_length)

predictions = model.predict(new_padded)
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))

for comment, label in zip(new_comments, predicted_labels):
    print(f"Comment: {comment} --> Predicted Label: {label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
Comment: I completely agree! --> Predicted Label: agree
Comment: I don't think this is correct. --> Predicted Label: query
Comment: Can you clarify more? --> Predicted Label: comment
