In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import accuracy_score

In [2]:
# Load the datasets
train_data = pd.read_csv('tweets-ext.csv')
test_data = pd.read_csv('tweets-test.csv')
val_data = pd.read_csv('tweets-valid.csv')

In [3]:
# Preprocess the data
X_train = train_data['tweet']
y_train = train_data['label']
X_test = test_data['tweet']
y_test = test_data['label']
X_val = val_data['tweet']
y_val = val_data['label']

In [4]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [5]:
# Convert text sequences to numerical sequences
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
val_sequences = tokenizer.texts_to_sequences(X_val)

In [6]:
# Pad the sequences to a fixed length
max_seq_length = 280
train_X = pad_sequences(train_sequences, maxlen=max_seq_length)
test_X = pad_sequences(test_sequences, maxlen=max_seq_length)
val_X = pad_sequences(val_sequences, maxlen=max_seq_length)

In [7]:
# Define the ensemble models
num_models = 5
model_list = []

In [8]:
for _ in range(num_models):
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_seq_length))
    model.add(LSTM(units=128))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(train_X, y_train, validation_data=(val_X, y_val), epochs=10, batch_size=32)
    model_list.append(model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Generate predictions for validation data
val_predictions = []
for model in model_list:
    predictions = model.predict(val_X)
    predicted_classes = np.argmax(predictions, axis=1)
    val_predictions.append(predicted_classes)



In [11]:
# Combine predictions using majority voting
ensemble_val_predictions = np.round(np.mean(val_predictions, axis=0))

In [12]:
# Evaluate the ensemble predictions
ensemble_val_accuracy = accuracy_score(y_val, ensemble_val_predictions)

In [13]:
print("Ensemble Validation Accuracy:", ensemble_val_accuracy)

Ensemble Validation Accuracy: 0.3333333333333333


In [14]:
# Generate predictions for test data
test_predictions = []
for model in model_list:
    predictions = model.predict(test_X)
    predicted_classes = np.argmax(predictions, axis=1)
    test_predictions.append(predicted_classes)



In [15]:
# Combine predictions using majority voting
ensemble_test_predictions = np.round(np.mean(test_predictions, axis=0))

In [16]:
# Evaluate the ensemble predictions
ensemble_test_accuracy = accuracy_score(y_test, ensemble_test_predictions)

In [17]:
print("Ensemble Test Accuracy:", ensemble_test_accuracy)

Ensemble Test Accuracy: 0.3333333333333333
