In [6]:
pip install -U transformers



In [1]:
# Create a simpleRNN or LSTM based classifiers to classify tweets into the four classes

import pandas as pd

In [3]:
df = pd.read_csv('/content/judge-1377884607_tweet_product_company.csv', encoding='latin-1')
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [14]:
x = df['tweet_text']
y = df['is_there_an_emotion_directed_at_a_brand_or_product']

In [17]:
# split train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

# Fill missing values with empty strings
X_train = X_train.fillna('')
X_test = X_test.fillna('')

# Function to clean text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Replace newlines with spaces
    text = text.replace('\n', ' ')
    # Remove characters that are not letters or spaces
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Apply cleaning to the text data
X_train_cleaned = X_train.apply(clean_text)
X_test_cleaned = X_test.apply(clean_text)


# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_cleaned)

# Convert text to sequences of integers
X_train_sequences = tokenizer.texts_to_sequences(X_train_cleaned)
X_test_sequences = tokenizer.texts_to_sequences(X_test_cleaned)

# Pad sequences to a maximum length of 300
max_length = 300
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')

print("Shape of X_train_padded:", X_train_padded.shape)
print("Shape of X_test_padded:", X_test_padded.shape)

Shape of X_train_padded: (7274, 300)
Shape of X_test_padded: (1819, 300)


In [21]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert to one-hot encoding
y_train_categorical = to_categorical(y_train_encoded)
y_test_categorical = to_categorical(y_test_encoded)

print("Shape of y_train_categorical:", y_train_categorical.shape)
print("Shape of y_test_categorical:", y_test_categorical.shape)

Shape of y_train_categorical: (7274, 4)
Shape of y_test_categorical: (1819, 4)


In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding

# Get vocabulary size from tokenizer
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(vocab_size, 10, input_shape=(max_length,)))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(60))
model.add(Dense(50, activation='relu'))
model.add(Dense(y_train_categorical.shape[1], activation='softmax')) # Number of units equals number of classes

  super().__init__(**kwargs)


In [24]:
model.compile(loss = 'categorical_crossentropy',optimizer = 'adam',metrics=['accuracy'])

In [26]:
model.fit(X_train_padded, y_train_categorical, epochs=20, batch_size=128)

Epoch 1/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 977ms/step - accuracy: 0.5467 - loss: 1.1216
Epoch 2/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 990ms/step - accuracy: 0.5919 - loss: 0.9121
Epoch 3/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 994ms/step - accuracy: 0.5951 - loss: 0.9209
Epoch 4/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 1s/step - accuracy: 0.5879 - loss: 0.9300
Epoch 5/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 1s/step - accuracy: 0.6031 - loss: 0.9131
Epoch 6/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 1s/step - accuracy: 0.5863 - loss: 0.9267
Epoch 7/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 1s/step - accuracy: 0.5974 - loss: 0.9125
Epoch 8/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 1s/step - accuracy: 0.5916 - loss: 0.9165
Epoch 9/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7dd0ca26a990>