In [1]:
!git clone https://github.com/cardiffnlp/tweeteval

Cloning into 'tweeteval'...
remote: Enumerating objects: 370, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 370 (delta 13), reused 3 (delta 1), pack-reused 354 (from 1)[K
Receiving objects: 100% (370/370), 8.49 MiB | 27.43 MiB/s, done.
Resolving deltas: 100% (122/122), done.


In [2]:
!pip install keras tensorflow

!pip install nltk



In [3]:
import numpy as np
import pandas as pd
import os

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer # Changed import statement to use tensorflow.keras
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [4]:
# Define the dataset folder path
dataset_path = "//kaggle/working/tweeteval/datasets/emoji"  # Replace this with the actual path if different

# Function to load texts and labels from a file
def load_texts_and_labels(text_file, label_file):
    # Read texts
    with open(text_file, "r", encoding="utf-8") as f:
        texts = f.read().splitlines()

    # Read labels
    with open(label_file, "r", encoding="utf-8") as f:
        labels = f.read().splitlines()

    # Return a list of tuples (text, label)
    return list(zip(texts, labels))

# Dictionary to hold the dataset
data = {}

# Load train, validation, and test sets
data["train"] = load_texts_and_labels(
    os.path.join(dataset_path, "train_text.txt"),
    os.path.join(dataset_path, "train_labels.txt")
)
data["val"] = load_texts_and_labels(
    os.path.join(dataset_path, "val_text.txt"),
    os.path.join(dataset_path, "val_labels.txt")
)
data["test"] = load_texts_and_labels(
    os.path.join(dataset_path, "test_text.txt"),
    os.path.join(dataset_path, "val_labels.txt")  # Assuming test labels are in `val_labels.txt`
)

In [5]:
# Convert data into DataFrames for train, validation, and test sets
def convert_to_dataframe(data_split):
    return pd.DataFrame(data_split, columns=["TEXT", "Label"])

# Create DataFrames for train, validation, and test sets
train_data = convert_to_dataframe(data["train"])
val_data = convert_to_dataframe(data["val"])
test_data = convert_to_dataframe(data["test"])

# Display the first few rows of each DataFrame to confirm the format
print("Training Data:")
print(train_data.head())

print("\nValidation Data:")
print(val_data.head())

print("\nTest Data:")
print(test_data.head())


Training Data:
                                                TEXT Label
0  Sunday afternoon walking through Venice in the...    12
1  Time for some BBQ and whiskey libations. Chomp...    19
2  Love love love all these people ️ ️ ️ #friends...     0
3                               ️ ️ ️ ️ @ Toys"R"Us      0
4  Man these are the funniest kids ever!! That fa...     2

Validation Data:
                                                TEXT Label
0  A little throwback with my favourite person @ ...     0
1  glam on @user yesterday for #kcon makeup using...     7
2  Democracy Plaza in the wake of a stunning outc...    11
3   Then &amp; Now. VILO @ Walt Disney Magic Kingdom     0
4               Who never... @ A Galaxy Far Far Away     2

Test Data:
                                                TEXT Label
0                                  en Pelham Parkway     0
1  The calm before...... | w/ sofarsounds @user |...     7
2  Just witnessed the great solar eclipse @ Tampa...    11
3  This lit

In [6]:
mapping_path = "/kaggle/working/tweeteval/datasets/emoji/mapping.txt"

# Load the mapping file into a DataFrame
# Assuming mapping.txt is formatted as tab-separated, with each line as: Index Emoji Description
mapping_df = pd.read_csv(mapping_path, sep="\t", header=None, names=["index", "emoticons", "description"])

# Add a numeric "number" column that matches the row index
mapping_df["number"] = mapping_df.index

# Rename columns to match the required format
mapping_df = mapping_df.rename(columns={"index": "Unnamed: 0"})

# Drop the "description" column if it's not needed
mappings= mapping_df[["Unnamed: 0", "emoticons", "number"]]

# Display the first few rows to verify
print(mappings.head())

  Unnamed: 0                      emoticons  number
0          ❤                    _red_heart_       0
1          😍  _smiling_face_with_hearteyes_       1
2          😂       _face_with_tears_of_joy_       2
3          💕                   _two_hearts_       3
4          🔥                         _fire_       4


In [7]:
train_data.shape, test_data.shape, mappings.shape

((45000, 2), (5000, 2), (20, 3))

In [8]:
train_length = train_data.shape[0]
test_length = test_data.shape[0]
train_length, test_length

(45000, 5000)

In [9]:
import nltk

# Download the 'stopwords' dataset
nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words = stopwords.words("english")
stop_words[:5]

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we']

In [10]:
# tokenize the sentences
def tokenize(tweets):
    stop_words = stopwords.words("english")
    tokenized_tweets = []
    for tweet in tweets:
        # split all words in the tweet
        words = tweet.split(" ")
        tokenized_string = ""
        for word in words:
            # remove @handles -> useless -> no information
            if word and word[0] != '@' and word not in stop_words:
                # if a hashtag, remove # -> adds no new information
                if word[0] == "#":
                    word = word[1:]
                tokenized_string += word + " "
        tokenized_tweets.append(tokenized_string)
    return tokenized_tweets

In [11]:
# translate tweets to a sequence of numbers
def encod_tweets(tweets):
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=" ", lower=True)
    tokenizer.fit_on_texts(tweets)
    return tokenizer, tokenizer.texts_to_sequences(tweets)

In [12]:
# apply padding to dataset and convert labels to bitmaps
def format_data(encoded_tweets, max_length, labels):
    x = pad_sequences(encoded_tweets, maxlen= max_length, padding='post', truncating='post')
    y = []
    for emoji in labels:
        emoji_index = int(emoji)
        bit_vec = np.zeros(20)
        bit_vec[emoji_index] = 1
        y.append(bit_vec)
    y = np.asarray(y)
    return x, y

In [13]:
# create weight matrix from pre trained embeddings
def create_weight_matrix(vocab, raw_embeddings):
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, 300))
    for word, idx in vocab.items():
        if word in raw_embeddings:
            weight_matrix[idx] = raw_embeddings[word]
    return weight_matrix

In [14]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

def final_model(weight_matrix, vocab_size, max_length, x_train, y_train, x_val, y_val, embedding_dim=300, lstm_units=128, epochs=5, learning_rate=0.001):
    # Embedding layer with provided weight matrix
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[weight_matrix],
        input_length=max_length,
        trainable=True,
        mask_zero=False
    )

    # Model architecture
    model = Sequential()
    model.add(embedding_layer)

    model.add(Dropout(0.3))
    model.add(Dense(32,activation='relu'))
    model.add((LSTM(64,return_sequences= True)))
    model.add((LSTM(64,return_sequences= True)))
    model.add((LSTM(64,return_sequences= True)))
    model.add((LSTM(64,return_sequences= True)))
    model.add((LSTM(32)))
    model.add(Dense(32,activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(20, activation='softmax'))



    # Compile model with a customizable learning rate
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Callback to reduce learning rate on plateau
    lr_reduction = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

    # Train model with validation data
    model.fit(
        x_train, y_train,
        epochs=epochs,
        validation_data=(x_val, y_val),
        callbacks=[lr_reduction]
    )

    # Evaluate model on validation set
    score, acc = model.evaluate(x_val, y_val)

    return model, score, acc


In [16]:
import math

In [17]:
tokenized_tweets = tokenize(train_data['TEXT'])
tokenized_tweets += tokenize(test_data['TEXT'])
max_length = math.ceil(sum([len(s.split(" ")) for s in tokenized_tweets])/len(tokenized_tweets))
tokenizer, encoded_tweets = encod_tweets(tokenized_tweets)
max_length, len(tokenized_tweets)

(10, 50000)

In [18]:
x, y = format_data(encoded_tweets[:train_length], max_length, train_data['Label'])
len(x), len(y)

(45000, 45000)

In [19]:
x_test, y_test = format_data(encoded_tweets[train_length:], max_length, test_data['Label'])
len(x_test), len(y_test)

(5000, 5000)

In [None]:
vocab = tokenizer.word_index
# vocab, len(vocab)

In [20]:
from gensim.models.keyedvectors import KeyedVectors

In [None]:
word2vec_model = KeyedVectors.load_word2vec_format('/kaggle/input/swm-wordembed/model_swm_300-6-10-low.w2v', binary=False)

# Create the weight matrix
weight_matrix = create_weight_matrix(vocab, word2vec_model)

In [None]:
model, score= final_model(weight_matrix, len(vocab)+1, max_length, x, y, x_test, y_test, epochs = 50)
model, score

In [None]:
model.summary()