In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import os
import zipfile

In [2]:
# Load the dataset
file_path = "/content/AMMUSED (1).csv"
dataset = pd.read_csv(file_path)

In [3]:
# Focus on relevant columns
comments = dataset['comment'].astype(str)
labels = dataset['label_y']

In [4]:
# Preprocessing parameters
max_vocab_size = 20000
max_sequence_length = 100

In [5]:
# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)


In [6]:
# Tokenize and pad the text data
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [7]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, encoded_labels, test_size=0.2, random_state=42
)

In [12]:
# Build the LSTM model
embedding_dim = 100

model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(units=128, return_sequences=False),
    Dropout(0.3),
    Dense(units=64, activation='relu'),
    Dropout(0.3),
    Dense(units=len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
epochs = 3
batch_size = 64
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1
)


Epoch 1/3




[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 265ms/step - accuracy: 0.4891 - loss: 1.1551 - val_accuracy: 0.6016 - val_loss: 0.9467
Epoch 2/3
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 276ms/step - accuracy: 0.6712 - loss: 0.8144 - val_accuracy: 0.6158 - val_loss: 0.8963
Epoch 3/3
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 269ms/step - accuracy: 0.7739 - loss: 0.5816 - val_accuracy: 0.6046 - val_loss: 0.9971


In [13]:
# Train the model
epochs = 2
batch_size = 32
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1
)

Epoch 1/2
[1m795/795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 200ms/step - accuracy: 0.8150 - loss: 0.4894 - val_accuracy: 0.5986 - val_loss: 1.0926
Epoch 2/2
[1m795/795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 197ms/step - accuracy: 0.8548 - loss: 0.3756 - val_accuracy: 0.6004 - val_loss: 1.3894


In [14]:
# Train the model
epochs = 1
batch_size = 8
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1
)

[1m3178/3178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 102ms/step - accuracy: 0.8422 - loss: 0.4121 - val_accuracy: 0.5895 - val_loss: 1.2963


In [15]:
# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 60ms/step - accuracy: 0.5902 - loss: 1.3090
Validation Loss: 1.2963
Validation Accuracy: 0.5895


In [17]:
# Save the trained model for later use
model_dir = "stance_detection_lstm.keras"  # Add .keras extension
model.save(model_dir)

In [18]:
# Save the model in a zip file
zip_file_name = f"{model_dir}.zip"
with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(model_dir):
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, model_dir)
            zipf.write(file_path, arcname)

print(f"Model successfully saved and zipped as {zip_file_name}")

Model successfully saved and zipped as stance_detection_lstm.keras.zip


In [19]:
# Additional epochs with the same model for fine-tuning
fine_tuning_epochs = 5
fine_tune_history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=fine_tuning_epochs,
    verbose=1
)

Epoch 1/5
[1m3178/3178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 105ms/step - accuracy: 0.8793 - loss: 0.3088 - val_accuracy: 0.5935 - val_loss: 1.5001
Epoch 2/5
[1m3178/3178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 102ms/step - accuracy: 0.8970 - loss: 0.2563 - val_accuracy: 0.5848 - val_loss: 1.7145
Epoch 3/5
[1m3178/3178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 104ms/step - accuracy: 0.9072 - loss: 0.2227 - val_accuracy: 0.5895 - val_loss: 2.1487
Epoch 4/5
[1m3178/3178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 101ms/step - accuracy: 0.9204 - loss: 0.1883 - val_accuracy: 0.5758 - val_loss: 2.1180
Epoch 5/5
[1m3178/3178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 101ms/step - accuracy: 0.9271 - loss: 0.1718 - val_accuracy: 0.5772 - val_loss: 2.4117


In [21]:
# Save the fine-tuned model
fine_tuned_model_dir = "stance_detection_lstm_finetuned.keras" # Add .keras extension
model.save(fine_tuned_model_dir)

fine_tuned_zip_file = f"{fine_tuned_model_dir}.zip"
with zipfile.ZipFile(fine_tuned_zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(fine_tuned_model_dir):
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, fine_tuned_model_dir)
            zipf.write(file_path, arcname)

print(f"Fine-tuned model successfully saved and zipped as {fine_tuned_zip_file}")

Fine-tuned model successfully saved and zipped as stance_detection_lstm_finetuned.keras.zip


In [22]:
# Make predictions on new data
new_comments = ["I completely agree!", "I don't think this is correct.", "Can you clarify more?"]
new_sequences = tokenizer.texts_to_sequences(new_comments)
new_padded = pad_sequences(new_sequences, maxlen=max_sequence_length)

predictions = model.predict(new_padded)
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))

for comment, label in zip(new_comments, predicted_labels):
    print(f"Comment: {comment} --> Predicted Label: {label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step
Comment: I completely agree! --> Predicted Label: agree
Comment: I don't think this is correct. --> Predicted Label: query
Comment: Can you clarify more? --> Predicted Label: query
