In [3]:
!pip install fasttext



In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
from tensorflow.keras.utils import to_categorical
from sklearn.utils import shuffle
import fasttext

In [2]:
# Load the datasets
train_data = pd.read_csv('tweets-ext.csv')
test_data = pd.read_csv('tweets-test.csv')
val_data = pd.read_csv('tweets-valid.csv')

In [3]:
# Preprocess the data
X_train = train_data['tweet']
y_train = train_data['label']
X_test = test_data['tweet']
y_test = test_data['label']
X_val = val_data['tweet']
y_val = val_data['label']

In [4]:
# Preprocess the data
train_text = train_data['tweet'].tolist()
train_labels = train_data['label'].tolist()
test_text = test_data['tweet'].tolist()
test_labels = test_data['label'].tolist()
val_text = val_data['tweet'].tolist()
val_labels = val_data['label'].tolist()

In [5]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text + test_text + val_text)

In [6]:
# Convert text sequences to numerical sequences
train_sequences = tokenizer.texts_to_sequences(train_text)
test_sequences = tokenizer.texts_to_sequences(test_text)
val_sequences = tokenizer.texts_to_sequences(val_text)

In [7]:
# Pad the sequences to a fixed length
max_seq_length = 280
train_X = pad_sequences(train_sequences, maxlen=max_seq_length)
test_X = pad_sequences(test_sequences, maxlen=max_seq_length)
val_X = pad_sequences(val_sequences, maxlen=max_seq_length)

In [8]:
# Convert labels to categorical format
num_classes = len(set(train_labels))
train_y = to_categorical(train_labels, num_classes=num_classes)
test_y = to_categorical(test_labels, num_classes=num_classes)
val_y = to_categorical(val_labels, num_classes=num_classes)

In [9]:
# Load FastText word embeddings for Marathi
embedding_path = 'wiki.mr.bin'
embedding_model = fasttext.load_model(embedding_path)



In [10]:
# Create the embedding matrix
word_index = tokenizer.word_index
num_words = len(word_index) + 1
embedding_dim = embedding_model.get_dimension()
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if word in embedding_model:
        embedding_matrix[i] = embedding_model[word]

In [11]:
# Define the ensemble models
num_models = 5
model_list = []

In [12]:
# Train individual models
for _ in range(num_models):
    model = Sequential()
    model.add(Embedding(num_words, embedding_dim, input_length=max_seq_length, weights=[embedding_matrix], trainable=False))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model
    model.fit(train_X, train_y, validation_data=(val_X, val_y), epochs=10, batch_size=32)
    model_list.append(model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
# Generate predictions for validation data
val_predictions = []
for model in model_list:
    predictions = model.predict(val_X)
    predicted_classes = np.argmax(predictions, axis=1)
    val_predictions.append(predicted_classes)



In [30]:
# Combine predictions using majority voting
ensemble_val_predictions = np.round(np.mean(val_predictions, axis=0))

In [31]:
# Evaluate the ensemble predictions
ensemble_val_accuracy = accuracy_score(np.argmax(val_y, axis=1), ensemble_val_predictions)

In [32]:
print("Ensemble Validation Accuracy:", ensemble_val_accuracy)

Ensemble Validation Accuracy: 0.7826666666666666


In [25]:
# Generate predictions for test data
test_predictions = []
for model in model_list:
    predictions = model.predict(test_X)
    predicted_classes = np.argmax(predictions, axis=1)
    test_predictions.append(predicted_classes)



In [26]:
# Combine predictions using majority voting
ensemble_test_predictions = np.round(np.mean(test_predictions, axis=0))

In [27]:
# Evaluate the ensemble predictions
ensemble_test_accuracy = accuracy_score(np.argmax(test_y, axis=1), ensemble_test_predictions)

In [28]:
print("Ensemble Test Accuracy:", ensemble_test_accuracy)


Ensemble Test Accuracy: 0.7924444444444444
