In [1]:
# Import basic libraries
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import pandas as pd
import pandas as pd

In [2]:
# Load data
df = pd.read_csv('/home/hari/Documents/MLAI/Datasets/tweet_product_company.csv',encoding = 'latin-1')

In [3]:
# Display a part of the data
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
# Select the necessary columns
messages=df[['tweet_text','is_there_an_emotion_directed_at_a_brand_or_product']].copy()
messages.columns=['text','response']

### Data Preprocessing

In [5]:
messages.dropna(axis=0,inplace=True)

In [6]:
import re
def remove_hashtags_mentions(text):
    cleaned_text = re.sub(r'#\w+', '', text)
    cleaned_text = re.sub(r'@[\w]*', '', cleaned_text)

    return cleaned_text

def remove_urls(text):
    url_pattern = re.compile(r'http?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
    
import string
def remove_punc(text):
    punc_free = ''.join([i for i in text if i not in string.punctuation])
    return punc_free

import nltk
def tokenization(text):
    words = nltk.word_tokenize(text)
    return words

stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]
    return output

from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()
def lemm(text):
    lemm_text = [wordnet_lem.lemmatize(word) for word in text]
    return lemm_text

In [7]:
def preprocess(df_col):
    corpus = []
    for item in df_col:
        new_item = remove_hashtags_mentions(item)
        new_item = remove_urls(new_item)
        new_item = remove_punc(item)
        new_item = new_item.lower()
        new_item = tokenization(new_item)
        new_item = remove_stopwords(new_item)
        new_item = lemm(new_item)
        corpus.append(' '.join(str(x) for x in new_item))
    return corpus

In [8]:
# Clean up the text
corpus = preprocess(messages['text'])

In [9]:
# Label encoding
class_mapping = {
    'Positive emotion': 2,
    'Negative emotion': 0,
    'No emotion toward brand or product': 1,
    "I can't tell": 3
}
messages['response'] = messages['response'].map(class_mapping)

In [10]:
# One hot encoding
from tensorflow.keras.utils import to_categorical
y_one_hot = to_categorical(messages['response'], num_classes=4)

In [11]:
# Split the data into train and test.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y_one_hot , test_size=0.2, random_state=42)

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Tokenize text and convert to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [14]:
# Pad sequences to have the same length
max_sequence_length = max(len(seq) for seq in X_train_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

In [15]:
# Convert the training data to tensors
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_padded, y_train)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_padded, y_test)).batch(32)

### LSTM

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Flatten

In [17]:
# Create a mdel and display the summary
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(Dropout(0.2))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Flatten())
model.add(Dense(4, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 26, 100)           918300    
                                                                 
 dropout (Dropout)           (None, 26, 100)           0         
                                                                 
 lstm (LSTM)                 (None, 26, 128)           117248    
                                                                 
 dropout_1 (Dropout)         (None, 26, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 flatten (Flatten)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 4)                 2

In [18]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [19]:
# Define early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor="val_loss", patience=5)

In [20]:
# Train the model
model.fit(train_dataset, epochs=10, validation_data=test_dataset,callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.callbacks.History at 0x7f708398f710>

In [21]:
# Display the accuray score
_, accuracy = model.evaluate(X_test_padded, y_test)
print("Final accuracy: ", accuracy)

Final accuracy:  0.6470588445663452


The final accuracy score is **64.7%**.