In [None]:
import re
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import TFRobertaModel, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
import subprocess
import os

# Location where downloaded NLTK resources are stored
nltk_data_dir = '/kaggle/working/nltk_data/'

# Register the location on NLTK's search path, then make sure it exists on
# disk (exist_ok=True makes re-running the cell harmless)
nltk.data.path.append(nltk_data_dir)
os.makedirs(nltk_data_dir, exist_ok=True)

# Function to download and unzip NLTK resources
def download_and_unzip(resource):
    """Download an NLTK resource and unpack its corpus archive if one exists.

    The resource is fetched into ``nltk_data_dir``. If a
    ``corpora/<resource>.zip`` archive is present there afterwards, it is
    extracted in place with the external ``unzip`` tool, overwriting files
    left by an earlier extraction (``-o``). Resources without an archive at
    that location are only downloaded.

    Args:
        resource: NLTK package identifier, e.g. ``'wordnet'``.
    """
    nltk.download(resource, download_dir=nltk_data_dir)
    corpora_dir = os.path.join(nltk_data_dir, 'corpora')
    zip_path = os.path.join(corpora_dir, f'{resource}.zip')
    if os.path.exists(zip_path):
        # Pass an argument list instead of splitting a formatted string, so
        # that paths containing whitespace are not broken into several args.
        subprocess.run(["unzip", "-o", zip_path, "-d", corpora_dir])

# Download and unzip the necessary resources.
# 'stopwords' is listed explicitly because stopwords.words("english") is
# called below; without it the script only works when that corpus happens
# to be pre-installed on the machine.
resources = ['wordnet', 'stopwords', 'averaged_perceptron_tagger', 'punkt']
for resource in resources:
    download_and_unzip(resource)

# Imported after the downloads so the corpora are in place before first use
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize stopword set and lemmatizer (a set gives O(1) membership tests)
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Load RoBERTa tokenizer and model
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
roberta_model = TFRobertaModel.from_pretrained(model_name)

# Function to clean and preprocess text
def clean_text(text):
    """Normalize one raw text string before tokenization.

    Steps, in order: decode HTML entities, lowercase, remove URLs and
    @mentions, remove every character that is neither a word character nor
    whitespace, collapse whitespace, drop words found in ``stop_words``, and
    lemmatize the remaining words with ``lemmatizer``.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    import html  # local import keeps this function self-contained

    # Decode entities such as "&amp;" first; otherwise the punctuation filter
    # below strips only "&" and ";" and leaves a junk "amp" token behind.
    text = html.unescape(text)
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions (@username)
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces

    # Whitespace tokenization, stopword removal and lemmatization in one pass
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return " ".join(words)

# Function to preprocess text (cleaning + tokenization)
def preprocess_text(texts, max_length=128):
    """Clean every text and encode the batch with the module-level tokenizer.

    Args:
        texts: Iterable of raw text strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A dict holding the tokenizer's ``'input_ids'`` and
        ``'attention_mask'`` outputs.
    """
    encoded = tokenizer(
        [clean_text(raw) for raw in texts],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="tf",
    )
    return {key: encoded[key] for key in ('input_ids', 'attention_mask')}

# Read the tweets CSV from the Kaggle input directory
df = pd.read_csv("/kaggle/input/twitter-us-airline/Twitter_US_Airline/Tweets.csv")

# Turn the sentiment strings into integer class ids
# (expected mapping: Negative=0, Neutral=1, Positive=2 -- TODO confirm via
# label_encoder.classes_)
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['airline_sentiment'])

# Hold out 20% of the rows for testing; fixed seed for reproducibility
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['sentiment_encoded'],
    test_size=0.2,
    random_state=42,
)

# Clean and tokenize both splits (train first, then test)
train_inputs, test_inputs = (
    preprocess_text(list(split)) for split in (train_texts, test_texts)
)

# Labels as plain numpy arrays
train_labels, test_labels = (
    np.array(split) for split in (train_labels, test_labels)
)

# Define Attention Layer
class Attention(tf.keras.layers.Layer):
    """Additive attention pooling that collapses axis 1 of its input.

    A scalar score is computed for every position along axis 1, the scores
    are softmax-normalized along that axis, and the input is summed along it
    using those weights.

    Args:
        units: Width of the hidden projections used to compute the scores.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.W = tf.keras.layers.Dense(units)
        self.b = tf.keras.layers.Dense(units)
        self.u = tf.keras.layers.Dense(1)

    def call(self, inputs, mask=None):
        """Return the attention-weighted sum of ``inputs`` over axis 1.

        Args:
            inputs: Sequence tensor; assumed (batch, time, features) --
                TODO confirm against callers.
            mask: Optional (batch, time) tensor; truthy entries mark real
                positions. Masked positions get ~zero attention weight.
        """
        score = tf.nn.tanh(self.W(inputs) + self.b(inputs))
        logits = self.u(score)
        if mask is not None:
            # Push padded positions to the dtype minimum so softmax assigns
            # them a negligible weight instead of letting padding leak into
            # the pooled vector.
            keep = tf.cast(mask, tf.bool)[:, :, tf.newaxis]
            floor = tf.constant(logits.dtype.min, dtype=logits.dtype)
            logits = tf.where(keep, logits, floor)
        attention_weights = tf.nn.softmax(logits, axis=1)
        context_vector = tf.reduce_sum(attention_weights * inputs, axis=1)
        return context_vector

    def compute_mask(self, inputs, mask=None):
        # Axis 1 is reduced away, so there is no mask to hand downstream.
        return None

    def get_config(self):
        """Return the constructor arguments needed to re-create the layer."""
        config = super().get_config()
        config.update({"units": self.units})
        return config


# Define Ensemble Model: RoBERTa + LSTM + BiLSTM + GRU + Attention
class RobertaEnsembleModel(tf.keras.Model):
    """RoBERTa encoder feeding three parallel recurrent + attention heads.

    The encoder's first output is fed to an LSTM, a bidirectional LSTM and a
    GRU. Each branch is pooled with its own ``Attention`` layer; the pooled
    vectors are concatenated, passed through dropout and classified with a
    softmax dense layer.

    Args:
        roberta_model: Encoder model; its first output is used as the
            token-level sequence.
        lstm_units: Number of units in each recurrent layer.
        num_classes: Number of output classes.
    """

    def __init__(self, roberta_model, lstm_units=128, num_classes=3):
        super().__init__()
        self.roberta = roberta_model

        # Recurrent layers
        self.lstm = tf.keras.layers.LSTM(lstm_units, return_sequences=True)
        self.bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=True))
        self.gru = tf.keras.layers.GRU(lstm_units, return_sequences=True)

        # Attention layers
        self.lstm_attention = Attention(lstm_units)
        self.bilstm_attention = Attention(lstm_units * 2)  # Bidirectional doubles the units
        self.gru_attention = Attention(lstm_units)

        # Fully connected layers
        self.concat = tf.keras.layers.Concatenate()
        self.dropout = tf.keras.layers.Dropout(0.3)
        self.dense = tf.keras.layers.Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Return class probabilities for a batch of tokenized inputs.

        Args:
            inputs: Encoder inputs. When this is a dict with an
                ``'attention_mask'`` entry, that mask is also applied to the
                recurrent and attention layers so padding is ignored.
        """
        # The sequences are padded to a fixed length, so without a mask the
        # recurrent layers and the attention pooling would treat padding
        # tokens as real content.
        attention_mask = inputs.get('attention_mask') if isinstance(inputs, dict) else None
        mask = None if attention_mask is None else tf.cast(attention_mask, tf.bool)

        roberta_outputs = self.roberta(inputs)[0]
        lstm_out = self.lstm(roberta_outputs, mask=mask)
        bilstm_out = self.bilstm(roberta_outputs, mask=mask)
        gru_out = self.gru(roberta_outputs, mask=mask)

        # Apply Attention to each recurrent layer
        lstm_attn = self.lstm_attention(lstm_out, mask=mask)
        bilstm_attn = self.bilstm_attention(bilstm_out, mask=mask)
        gru_attn = self.gru_attention(gru_out, mask=mask)

        # Merge outputs
        merged = self.concat([lstm_attn, bilstm_attn, gru_attn])
        dropout_out = self.dropout(merged)
        return self.dense(dropout_out)

# Build the classifier with one output unit per class seen by the encoder
num_classes = len(label_encoder.classes_)
model = RobertaEnsembleModel(roberta_model, lstm_units=128, num_classes=num_classes)

# Integer labels -> sparse categorical cross-entropy
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train.
# NOTE(review): the test split doubles as validation data here, so the
# final evaluation below is not on data unseen during monitoring.
model.fit(
    train_inputs,
    train_labels,
    epochs=50,
    batch_size=8,
    validation_data=(test_inputs, test_labels),
)

# Report accuracy on the held-out split
loss, accuracy = model.evaluate(test_inputs, test_labels)
print(f"Test Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data/...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /kaggle/working/nltk_data/...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /kaggle/working/nltk_data/...
[nltk_data]   Unzipping tokenizers/punkt.zip.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50

In [None]:
import re
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import TFRobertaModel, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
import subprocess
import os

# Location where downloaded NLTK resources are stored
nltk_data_dir = '/kaggle/working/nltk_data/'

# Register the location on NLTK's search path, then make sure it exists on
# disk (exist_ok=True makes re-running the cell harmless)
nltk.data.path.append(nltk_data_dir)
os.makedirs(nltk_data_dir, exist_ok=True)

# Function to download and unzip NLTK resources
def download_and_unzip(resource):
    """Download an NLTK resource and unpack its corpus archive if one exists.

    The resource is fetched into ``nltk_data_dir``. If a
    ``corpora/<resource>.zip`` archive is present there afterwards, it is
    extracted in place with the external ``unzip`` tool, overwriting files
    left by an earlier extraction (``-o``). Resources without an archive at
    that location are only downloaded.

    Args:
        resource: NLTK package identifier, e.g. ``'wordnet'``.
    """
    nltk.download(resource, download_dir=nltk_data_dir)
    corpora_dir = os.path.join(nltk_data_dir, 'corpora')
    zip_path = os.path.join(corpora_dir, f'{resource}.zip')
    if os.path.exists(zip_path):
        # Pass an argument list instead of splitting a formatted string, so
        # that paths containing whitespace are not broken into several args.
        subprocess.run(["unzip", "-o", zip_path, "-d", corpora_dir])

# Download and unzip the necessary resources.
# 'stopwords' is listed explicitly because stopwords.words("english") is
# called below; without it the script only works when that corpus happens
# to be pre-installed on the machine.
resources = ['wordnet', 'stopwords', 'averaged_perceptron_tagger', 'punkt']
for resource in resources:
    download_and_unzip(resource)

# Imported after the downloads so the corpora are in place before first use
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize stopword set and lemmatizer (a set gives O(1) membership tests)
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Load RoBERTa tokenizer and model
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
roberta_model = TFRobertaModel.from_pretrained(model_name)

# Function to clean and preprocess text
def clean_text(text):
    """Normalize one raw review string before tokenization.

    Steps, in order: strip HTML tags, decode HTML entities, lowercase, remove
    URLs and @mentions, remove every character that is neither a word
    character nor whitespace, collapse whitespace, drop words found in
    ``stop_words``, and lemmatize the remaining words with ``lemmatizer``.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    import html  # local import keeps this function self-contained

    # Replace markup such as "<br />" with a space. Otherwise the punctuation
    # filter below keeps the tag name as a junk "br" token and can glue the
    # neighbouring words together.
    text = re.sub(r"</?[a-zA-Z][^<>]*>", " ", text)
    # Decode entities such as "&amp;" so they do not survive as "amp".
    text = html.unescape(text)
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions (@username)
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces

    # Whitespace tokenization, stopword removal and lemmatization in one pass
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return " ".join(words)

# Function to preprocess text (cleaning + tokenization)
def preprocess_text(texts, max_length=128):
    """Clean every text and encode the batch with the module-level tokenizer.

    Args:
        texts: Iterable of raw text strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A dict holding the tokenizer's ``'input_ids'`` and
        ``'attention_mask'`` outputs.
    """
    encoded = tokenizer(
        [clean_text(raw) for raw in texts],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="tf",
    )
    return {key: encoded[key] for key in ('input_ids', 'attention_mask')}

# Read the movie-review CSV from the Kaggle input directory
df = pd.read_csv("/kaggle/input/imdb-50k-movies/IMDb_50K_movies/IMDB Dataset.csv")

# Turn the sentiment strings into integer class ids
# (presumably two classes for this dataset, negative/positive -- TODO
# confirm via label_encoder.classes_)
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])

# Hold out 20% of the rows for testing; fixed seed for reproducibility
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review'],
    df['sentiment_encoded'],
    test_size=0.2,
    random_state=42,
)

# Clean and tokenize both splits (train first, then test)
train_inputs, test_inputs = (
    preprocess_text(list(split)) for split in (train_texts, test_texts)
)

# Labels as plain numpy arrays
train_labels, test_labels = (
    np.array(split) for split in (train_labels, test_labels)
)

# Define Attention Layer
class Attention(tf.keras.layers.Layer):
    """Additive attention pooling that collapses axis 1 of its input.

    A scalar score is computed for every position along axis 1, the scores
    are softmax-normalized along that axis, and the input is summed along it
    using those weights.

    Args:
        units: Width of the hidden projections used to compute the scores.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.W = tf.keras.layers.Dense(units)
        self.b = tf.keras.layers.Dense(units)
        self.u = tf.keras.layers.Dense(1)

    def call(self, inputs, mask=None):
        """Return the attention-weighted sum of ``inputs`` over axis 1.

        Args:
            inputs: Sequence tensor; assumed (batch, time, features) --
                TODO confirm against callers.
            mask: Optional (batch, time) tensor; truthy entries mark real
                positions. Masked positions get ~zero attention weight.
        """
        score = tf.nn.tanh(self.W(inputs) + self.b(inputs))
        logits = self.u(score)
        if mask is not None:
            # Push padded positions to the dtype minimum so softmax assigns
            # them a negligible weight instead of letting padding leak into
            # the pooled vector.
            keep = tf.cast(mask, tf.bool)[:, :, tf.newaxis]
            floor = tf.constant(logits.dtype.min, dtype=logits.dtype)
            logits = tf.where(keep, logits, floor)
        attention_weights = tf.nn.softmax(logits, axis=1)
        context_vector = tf.reduce_sum(attention_weights * inputs, axis=1)
        return context_vector

    def compute_mask(self, inputs, mask=None):
        # Axis 1 is reduced away, so there is no mask to hand downstream.
        return None

    def get_config(self):
        """Return the constructor arguments needed to re-create the layer."""
        config = super().get_config()
        config.update({"units": self.units})
        return config


# Define Ensemble Model: RoBERTa + LSTM + BiLSTM + GRU + Attention
class RobertaEnsembleModel(tf.keras.Model):
    """RoBERTa encoder feeding three parallel recurrent + attention heads.

    The encoder's first output is fed to an LSTM, a bidirectional LSTM and a
    GRU. Each branch is pooled with its own ``Attention`` layer; the pooled
    vectors are concatenated, passed through dropout and classified with a
    softmax dense layer.

    Args:
        roberta_model: Encoder model; its first output is used as the
            token-level sequence.
        lstm_units: Number of units in each recurrent layer.
        num_classes: Number of output classes.
    """

    def __init__(self, roberta_model, lstm_units=128, num_classes=3):
        super().__init__()
        self.roberta = roberta_model

        # Recurrent layers
        self.lstm = tf.keras.layers.LSTM(lstm_units, return_sequences=True)
        self.bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=True))
        self.gru = tf.keras.layers.GRU(lstm_units, return_sequences=True)

        # Attention layers
        self.lstm_attention = Attention(lstm_units)
        self.bilstm_attention = Attention(lstm_units * 2)  # Bidirectional doubles the units
        self.gru_attention = Attention(lstm_units)

        # Fully connected layers
        self.concat = tf.keras.layers.Concatenate()
        self.dropout = tf.keras.layers.Dropout(0.3)
        self.dense = tf.keras.layers.Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Return class probabilities for a batch of tokenized inputs.

        Args:
            inputs: Encoder inputs. When this is a dict with an
                ``'attention_mask'`` entry, that mask is also applied to the
                recurrent and attention layers so padding is ignored.
        """
        # The sequences are padded to a fixed length, so without a mask the
        # recurrent layers and the attention pooling would treat padding
        # tokens as real content.
        attention_mask = inputs.get('attention_mask') if isinstance(inputs, dict) else None
        mask = None if attention_mask is None else tf.cast(attention_mask, tf.bool)

        roberta_outputs = self.roberta(inputs)[0]
        lstm_out = self.lstm(roberta_outputs, mask=mask)
        bilstm_out = self.bilstm(roberta_outputs, mask=mask)
        gru_out = self.gru(roberta_outputs, mask=mask)

        # Apply Attention to each recurrent layer
        lstm_attn = self.lstm_attention(lstm_out, mask=mask)
        bilstm_attn = self.bilstm_attention(bilstm_out, mask=mask)
        gru_attn = self.gru_attention(gru_out, mask=mask)

        # Merge outputs
        merged = self.concat([lstm_attn, bilstm_attn, gru_attn])
        dropout_out = self.dropout(merged)
        return self.dense(dropout_out)

# Build the classifier with one output unit per class seen by the encoder
num_classes = len(label_encoder.classes_)
model = RobertaEnsembleModel(roberta_model, lstm_units=128, num_classes=num_classes)

# Integer labels -> sparse categorical cross-entropy
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train.
# NOTE(review): the test split doubles as validation data here, so the
# final evaluation below is not on data unseen during monitoring.
model.fit(
    train_inputs,
    train_labels,
    epochs=50,
    batch_size=8,
    validation_data=(test_inputs, test_labels),
)

# Report accuracy on the held-out split
loss, accuracy = model.evaluate(test_inputs, test_labels)
print(f"Test Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data/...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /kaggle/working/nltk_data/...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /kaggle/working/nltk_data/...
[nltk_data]   Unzipping tokenizers/punkt.zip.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50