In [1]:
# Import basic libraries
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import pandas as pd
import pandas as pd

In [2]:
# Load data
df = pd.read_csv('/home/hari/Documents/MLAI/Datasets/tweet_product_company.csv',encoding = 'latin-1')

In [3]:
# Display a part of the data
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
# Select the necessary columns.
messages=df[['tweet_text','is_there_an_emotion_directed_at_a_brand_or_product']].copy()
messages.columns=['text','response']

#### Data Preprocessing

In [5]:
messages.dropna(axis=0,inplace=True)

In [None]:
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from gensim.models import KeyedVectors
import gensim.downloader

# Download and load GloVe embeddings (100-dimensional vectors)
word_embeddings = gensim.downloader.load('glove-wiki-gigaword-100')

In [11]:
import re
import nltk
import numpy as np
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove mentions and hashtags
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub(r'#', '', text)
    
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    return text

In [12]:
preprocessed_texts = [preprocess_text(text) for text in messages['text']]

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_texts)

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(preprocessed_texts)

# Pad sequences to a fixed length
max_sequence_length = 20
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Create an embedding matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in word_embeddings:
        embedding_matrix[i] = word_embeddings[word]

# Build the Embedding layer using the embedding matrix
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False
)

In [14]:
# Label encoding
class_mapping = {
    'Positive emotion': 2,
    'Negative emotion': 0,
    'No emotion toward brand or product': 1,
    "I can't tell": 3
}
messages['response'] = messages['response'].map(class_mapping)

In [15]:
# One hot encoding
from tensorflow.keras.utils import to_categorical
y_one_hot = to_categorical(messages['response'], num_classes=4)

In [16]:
# Splitting the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y_one_hot , test_size=0.2, random_state=42)

In [17]:
# Converting train and test to tensors
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

# Create a model
model = Sequential()
model.add(embedding_layer)
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(4, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 100)           1032500   
                                                                 
 bidirectional (Bidirectiona  (None, 20, 256)          234496    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 20, 256)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              164352    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 32)                4

In [19]:
# Define early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor="val_loss", patience=5)

In [20]:
# Train the model
model.fit(train_dataset, epochs=10, validation_data=test_dataset,callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdd4e966050>

In [21]:
# Display the accuracy score
_, accuracy = model.evaluate(X_test, y_test)
print("Final accuracy: ", accuracy)

Final accuracy:  0.6663001775741577


The final accuracy score is **66.6%**.