In [43]:
# %tensorflow_version 2.x

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tweepy as tw
import re
import time

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from tensorflow.keras.layers import TextVectorization, Normalization, Flatten, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /home/rf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Set dataframe column width: raise the pandas column-width display option to 1500 characters
# (presumably so long tweet text is shown in full when frames are displayed).
pd.set_option('max_colwidth', 1500)

In [5]:
# Load the emotions dataset from a CSV located next to the notebook;
# header=0 takes the column names from the first row of the file.
df = pd.read_csv("./Emotions Dataset.csv", header=0)

In [6]:
df.head()

Unnamed: 0,tweet,sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


# Dataset Preprocessing

In [7]:
# Function encoding each label

def label_encode(data,label):
    """Translate the emotion names held in one column into integer class ids.

    Parameters
    ----------
    data : DataFrame (or any mapping of column name -> pandas Series)
        Container holding the column to encode.
    label : str
        Name of the column containing the emotion names.

    Returns
    -------
    pandas.Series
        Same index as the input column; joy=0, sadness=1, anger=2, fear=3,
        love=4, surprise=5. Any value outside these six names maps to NaN.
    """
    emotion_to_id = {
        "joy": 0,
        "sadness": 1,
        "anger": 2,
        "fear": 3,
        "love": 4,
        "surprise": 5,
    }
    emotion_column = data[label]
    return emotion_column.map(emotion_to_id)

In [8]:
# Add an integer "label" column derived from the text in the "sentiment" column.
df["label"] = label_encode(df, "sentiment")

In [9]:
df

Unnamed: 0,tweet,sentiment,label
0,i didnt feel humiliated,sadness,1
1,i can go from feeling so hopeless to so damned...,sadness,1
2,im grabbing a minute to post i feel greedy wrong,anger,2
3,i am ever feeling nostalgic about the fireplac...,love,4
4,i am feeling grouchy,anger,2
...,...,...,...
19995,im having ssa examination tomorrow in the morn...,sadness,1
19996,i constantly worry about their fight against n...,joy,0
19997,i feel its important to share this info for th...,joy,0
19998,i truly feel that if you are passionate enough...,joy,0


In [28]:
# Set vocab size and maximum sentence length
# vocab_size: passed as `n` to one_hot() in data_preparation and as the Embedding input_dim in the model builders.
# max_length: passed as `maxlen` to pad_sequences() and as the Embedding input_length.
#             NOTE(review): 280 presumably mirrors Twitter's character limit, but here it is applied to
#             sequences of word indices, not characters — confirm this is intended.
vocab_size = 10000
max_length = 280

In [13]:
# Function for tokenisation, one_hot encoding and padding

def data_preparation(data,description):
    """Clean, stem, hash-encode and pad the text found in one column.

    For every entry of ``data[description]`` the text is stripped of all
    non-letter characters, lower-cased, split on whitespace, filtered of
    English stop words and Porter-stemmed. Each cleaned document is then
    encoded with ``one_hot`` into ``vocab_size`` buckets and the sequences are
    left-padded ("pre") to ``max_length``.

    Parameters
    ----------
    data : DataFrame (or mapping of column name -> iterable of str)
        Container holding the text column.
    description : str
        Name of the text column.

    Returns
    -------
    The array produced by ``pad_sequences`` for all documents, in input order.

    Notes
    -----
    Relies on the notebook-level ``vocab_size`` and ``max_length`` settings.
    NOTE(review): ``one_hot`` hashes words rather than using a fitted
    vocabulary; the Keras docs describe its default hash as not stable across
    runs, so a saved model may not match encodings from a new kernel — confirm.
    """
    stemmer = PorterStemmer()

    # Build the stop-word set once. The original re-read the full stop-word
    # list from the corpus for every single token, which dominated run time.
    english_stopwords = set(stopwords.words("english"))

    corpus = []
    for text in data[description]:
        # Keep letters only, normalise case, then tokenise on whitespace.
        tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
        stemmed = [stemmer.stem(token)
                   for token in tokens
                   if token not in english_stopwords]
        corpus.append(" ".join(stemmed))

    encoded_docs = [one_hot(input_text=document, n=vocab_size)
                    for document in corpus]

    padded_doc = pad_sequences(sequences=encoded_docs,
                               maxlen=max_length,
                               padding="pre")
    return padded_doc

In [14]:
X = data_preparation(df, "tweet")

In [15]:
y = df["label"]

# Train Test Split

In [16]:
# Hold out 20% of the data as the test set (seeded for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Carve 20% of the remaining data off as the validation set. This split is seeded as well:
# without a random_state the train/validation partition changed on every run, so tuning
# results and reported metrics could not be reproduced.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Check train, test, val set size
print(len(x_train), 'train examples')
print(len(x_test), 'test examples')
print(len(x_val), 'validation examples')

12800 train examples
4000 test examples
3200 validation examples


# Build Model and Hyperparameter Tune

In [None]:
# Model builder function for hyperparameter tuning

def model_builder(hp):
    """Build and compile the Embedding -> LSTM -> Dense classifier for Keras Tuner.

    Tuned hyperparameters (names as registered on ``hp``):
      - "output_dim:"      embedding size, 40..120 in steps of 10
      - "Dense Layers"     number of hidden Dense blocks, 1..5
      - "units_<j>"        units of hidden block j, 32..256 in steps of 32
      - "kernel_init<j>"   "he_uniform" or "he_normal" for hidden block j
      - "drop_rate<j>"     dropout rate after hidden block j, 0.1..0.5 in steps of 0.1
      - "learnRate"        Adam learning rate, one of 0.01 / 0.001 / 0.0001

    Parameters
    ----------
    hp : the hyperparameter container handed in by the tuner.

    Returns
    -------
    A compiled ``keras.Sequential`` model with a 6-way softmax output, trained
    with sparse categorical cross-entropy and reporting accuracy.

    Relies on the notebook-level ``vocab_size`` and ``max_length`` settings.
    """
    # Initial Model
    model = keras.Sequential()

    # Embedding layer
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=hp.Int("output_dim:", min_value=40, max_value=120, step=10),
                        input_length=max_length))

    model.add(LSTM(units=128))

    # Tune number of dense layers, layer units and dropout rate.
    # The Dropout layer is added inside the loop so every hidden Dense block gets
    # its own tuned dropout. Previously it sat after the loop and reused the
    # leaked loop variable, so only one dropout layer (for the last block) was
    # ever added.
    for j in range(hp.Int("Dense Layers", min_value=1, max_value=5, step=1)):
        model.add(Dense(units=hp.Int("units_"+str(j), min_value=32, max_value=256, step=32),
                        activation="relu",
                        kernel_initializer=hp.Choice("kernel_init"+str(j), values=["he_uniform","he_normal"])))
        model.add(Dropout(rate=hp.Float("drop_rate"+str(j), min_value=0.1, max_value=0.5, step=0.1)))

    # Output layer: one unit per emotion class
    model.add(Dense(6, activation="softmax"))

    # Tune the learning rate for the optimizer
    model.compile(optimizer=Adam(learning_rate=hp.Choice("learnRate", values=[0.01,0.001,0.0001])),
                  loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    return model

In [None]:
# Instantiate the tuner: random search over the space declared in model_builder,
# with trials ranked by validation accuracy.
tuner=kt.tuners.RandomSearch(
    model_builder,
    objective="val_accuracy",
    max_trials=2,  # only two hyperparameter combinations are tried
    executions_per_trial=2,  # each combination is trained twice
    directory="twitter_NLP2",  # directory/project_name: where the tuner keeps its results
    project_name="hypertuningNLP"
    )

In [None]:
# Early Stopping: stop training once val_loss has gone 5 epochs without improving.
# NOTE(review): the tuner search in this notebook runs for only 5 epochs, so with patience=5
# this callback looks unable to trigger there; it only matters for the longer 50-epoch fit — confirm intent.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
# Perform hyperparameter tuning: every trial is trained for up to 5 epochs on the
# training split and scored on the validation split.
tuner.search(x_train, y_train, epochs=5, validation_data=(x_val, y_val), callbacks=[stop_early])

In [None]:
# Get the optimal hyperparameters (the single best set found by the search)
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# Report a subset of the tuned values. Only the entries suffixed 0 are printed, so when
# "Dense Layers" is greater than 1 the settings of the later layers are not shown here.
print("Dense Layers: ", best_hps.get("Dense Layers"))
print("drop_rate0: ", best_hps.get("drop_rate0"))
print("learnRate: ", best_hps.get("learnRate"))
print("units_0: ", best_hps.get("units_0"))

# Optimal Model

**Using Tuner Settings**

In [None]:
# Build a fresh, untrained model with the optimal hyperparameters
# (this cell only builds it; the 50-epoch training run is in the next cell).
model = tuner.hypermodel.build(best_hps)

In [None]:
# Train for up to 50 epochs; stop_early may end the run sooner.
history = model.fit(x_train, y_train, epochs=50, validation_data=(x_val, y_val), callbacks=[stop_early])

# Pick the epoch with the highest validation accuracy (+1 because list positions start at 0
# while epochs are counted from 1).
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

In [None]:
# Start again from a freshly built model so the weights of the 50-epoch run are not reused.
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model with best number of epochs
hypermodel.fit(x_train, y_train, epochs=best_epoch, validation_data=(x_val, y_val), callbacks=[stop_early])

In [None]:
# Evaluate the retrained tuned model on the held-out test set.
# The result is indexed below as [0] = loss and [1] = accuracy.
eval_result = hypermodel.evaluate(x_test, y_test)

In [None]:
# Report the test-set loss and accuracy of the tuned model, to four decimal places.
print(f"test loss: {eval_result[0]:.4f}")
print(f"test accuracy: {eval_result[1]:.4f}")

**Manual Settings**

In [34]:
# Optimal Model Function
def optimal_model_builder():
    """Assemble and compile the hand-configured emotion classifier.

    Architecture: 8-dimensional Embedding over ``vocab_size`` ids (input length
    ``max_length``), a 128-unit LSTM, and a 6-way softmax output. Compiled with
    Adam (learning rate 0.001), sparse categorical cross-entropy loss and the
    accuracy metric.

    Returns
    -------
    The compiled ``keras.Sequential`` model (untrained).
    """
    layer_stack = [
        Embedding(vocab_size, 8, input_length=max_length),
        LSTM(units=128),
        Dense(6, activation="softmax"),
    ]
    network = keras.Sequential(layer_stack)

    optimizer = Adam(learning_rate=0.001)
    network.compile(optimizer=optimizer,
                    loss="sparse_categorical_crossentropy",
                    metrics=["accuracy"])
    return network

In [36]:
# Build the manually configured model and train it for 2 epochs, validating on the validation split.
# Note: this rebinds `model`, replacing the tuner-built model from the earlier section.
model = optimal_model_builder()
model.fit(x_train, y_train, epochs=2, validation_data=(x_val, y_val))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fede829d8d0>

In [37]:
# Score the manually configured model on the held-out test set;
# accuracy is printed as a percentage, loss as returned.
loss, accuracy = model.evaluate(x_test, y_test)
print('Accuracy: ',accuracy*100)
print('Loss: ',loss)

Accuracy:  70.09999752044678
Loss:  0.750033974647522


# Predictions on new tweets

In [38]:
# Set up tweepy authorisation
# Credentials are read from environment variables so real secrets are never saved in
# (or committed with) the notebook. The "X" placeholders remain only as fallbacks so the
# cell still runs when the variables are not set.
api_key = os.environ.get("TWITTER_API_KEY", "X")
api_key_secret = os.environ.get("TWITTER_API_KEY_SECRET", "X")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "X")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "X")

auth = tw.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True lets tweepy wait instead of failing when the rate limit is hit.
api = tw.API(auth,wait_on_rate_limit=True)

In [57]:
# Function for different twitter queries
def twitter_search(query_term, count=10, date_since="2021-01-01"):
    """Search Twitter for a term and return the matching tweets as a DataFrame.

    Retweets and replies are excluded from the query and only English tweets
    are requested.

    Parameters
    ----------
    query_term : str
        Search term to query for.
    count : int, default 10
        Maximum number of tweets to collect.
    date_since : str, default "2021-01-01"
        Value placed in the query's ``since:`` filter.

    Returns
    -------
    pandas.DataFrame
        Columns 'tweet', 'user', 'location', 'timestamp' (timestamp formatted
        as 'dd-mm-YYYY HH:MM:SS'). If the search fails, the error is printed
        and an empty frame with the same columns is returned.
    """
    columns = ['tweet', 'user', 'location', 'timestamp']

    # Exclude retweets and replies
    search_words = (query_term+f" -filter:retweets -filter:replies since:{date_since}")

    try:
        # Collect tweets
        tweets = tw.Cursor(api.search_tweets,
                           q=search_words,
                           lang="en").items(count)

        # Collect tweet text, username, location and timestamp
        tweet_info = [[tweet.text, tweet.user.screen_name, tweet.user.location, tweet.created_at]
                      for tweet in tweets]
    except Exception as e:
        # Catch Exception rather than BaseException so Ctrl-C / kernel interrupts still
        # propagate. Return an empty, correctly shaped frame; the original fell through to
        # `return tweet_df` with the name unbound and raised UnboundLocalError.
        print('failed on_status,',str(e))
        return pd.DataFrame(columns=columns)

    # Assign collected information to a dataframe
    tweet_df = pd.DataFrame(data=tweet_info, columns=columns)

    # The .dt accessor needs datetime values; an empty result has an object column, so skip it.
    if not tweet_df.empty:
        tweet_df['timestamp'] = tweet_df['timestamp'].dt.strftime('%d-%m-%Y %H:%M:%S')

    return tweet_df

# Sentiment analysis using a single, user inputted sentence/phrase

In [52]:
def predict_new_sentence(model, sentence=None):
    """Predict the emotion expressed by a single sentence.

    Parameters
    ----------
    model : trained classifier exposing ``predict``; its output is reduced with
        argmax over the class axis.
    sentence : str, optional
        Text to classify. When omitted, the user is prompted with
        "Enter a sentence: " (the original behaviour), which keeps interactive
        use working while allowing scripted, reproducible calls.

    Returns
    -------
    pandas.DataFrame
        One row with columns "tweet" (the input text) and "Predicted Label".
    """
    if sentence is None:
        new_sentence = input("Enter a sentence: ")
    else:
        new_sentence = sentence

    tweet_df = pd.DataFrame([new_sentence], columns=['tweet'])

    # Same cleaning / encoding / padding pipeline as used for the training data.
    X = data_preparation(tweet_df, "tweet")

    prediction = model.predict(X)

    # Index of the highest-scoring class for each row.
    y_pred = np.argmax(prediction, axis=1)

    predict=pd.DataFrame(y_pred, columns=["Predicted"])

    predict["Predicted Label"]=predict["Predicted"].map(
          {0:"joy", 1:"sadness", 2:"anger", 3:"fear", 4:"love", 5:"surprise"})

    predict_df=pd.concat([tweet_df["tweet"], predict["Predicted Label"]],
                        axis=1)

    return predict_df

In [53]:
predict_new_sentence(model)

Enter a sentence:  Hello I am very happy today


Unnamed: 0,tweet,Predicted Label
0,Hello I am very happy today,joy


# Sentiment analysis for a dataframe of sentences/phrases (i.e. tweets), thus combining the twitter api search function and the NLP model

Firstly using a predefined dataframe

In [58]:
def predict_tweets(model, dataframe):
    """Predict an emotion label for every tweet in a DataFrame.

    Parameters
    ----------
    model : trained classifier exposing ``predict``; its output is reduced with
        argmax over the class axis.
    dataframe : pandas.DataFrame
        Must contain a "tweet" column of text.

    Returns
    -------
    pandas.DataFrame
        Columns "tweet" and "Predicted Label", one row per input tweet, indexed
        0..n-1. An empty input yields an empty frame with the same columns.
    """
    # Use the frame that was passed in (the original ignored the argument and read a
    # global `tweet_df`). Reset the index so the column-wise concat below pairs each
    # tweet with its prediction by position.
    tweets = dataframe["tweet"].reset_index(drop=True)

    if tweets.empty:
        return pd.DataFrame(columns=["tweet", "Predicted Label"])

    X = data_preparation(dataframe, "tweet")

    # One batched call predicts every row; the original repeated this identical
    # full-batch call once per tweet.
    prediction = model.predict(X)

    y_pred = np.argmax(prediction, axis=1)

    predict=pd.DataFrame(y_pred, columns=["Predicted"])

    predict["Predicted Label"] =  predict["Predicted"].map(
        {0:"joy", 1:"sadness", 2:"anger", 3:"fear", 4:"love", 5:"surprise"})

    predict_df=pd.concat([tweets, predict["Predicted Label"]],
                      axis=1)

    return predict_df

In [None]:
# No notebook-level `tweet_df` is defined in the cells shown, so on a fresh kernel this call
# raised NameError. Fetch a sample frame first so the cell survives Restart & Run All.
tweet_df = twitter_search('Wolves')
predict_tweets(model, tweet_df)

Secondly, combining previous functions to allow sentiment analysis for a query term passed to the function

In [85]:
def search_and_predict_tweets(query, model):
    """Search Twitter for a term and predict the emotion of each returned tweet.

    Parameters
    ----------
    query : str
        Search term handed to ``twitter_search``.
    model : trained classifier exposing ``predict``; its output is reduced with
        argmax over the class axis.

    Returns
    -------
    pandas.DataFrame
        Columns "tweet" and "Predicted Label", one row per tweet found. Also
        prints the most common predicted emotion(s). When the search returns
        nothing, a notice is printed and an empty frame is returned.
    """
    tweet_df = twitter_search(query)

    # Nothing to classify: avoid predicting on an empty batch and indexing an empty mode().
    if tweet_df.empty:
        print("No tweets returned for query:", query)
        return pd.DataFrame(columns=["tweet", "Predicted Label"])

    X = data_preparation(tweet_df, "tweet")

    # One batched call predicts every row; the original repeated this identical
    # full-batch call once per tweet.
    prediction = model.predict(X)

    y_pred = np.argmax(prediction, axis=1)

    predict=pd.DataFrame(y_pred, columns=["Predicted"])

    predict["Predicted Label"] =  predict["Predicted"].map(
        {0:"joy", 1:"sadness", 2:"anger", 3:"fear", 4:"love", 5:"surprise"})

    # Pair tweets and predictions by position.
    predict_df=pd.concat([tweet_df["tweet"].reset_index(drop=True), predict["Predicted Label"]],
                      axis=1)

    # mode() returns every tied label; show them all, as the "(s)" in the message implies.
    # With a single winner the output is unchanged.
    most_common = ", ".join(predict_df["Predicted Label"].mode())
    print("Most Common Emotion(s) expressed:", most_common)
    print("")

    return predict_df

In [86]:
search_and_predict_tweets('Wolves', model)

Most Common Emotion(s) expressed: anger



Unnamed: 0,tweet,Predicted Label
0,"Got 2 days off, gonna start new design of my w...",anger
1,"""Everyone dies one day. Everyone. Even wolves....",anger
2,Oregon police seek clues in poisoning of eight...,sadness
3,Wolves https://t.co/CljMDtAj4R,anger
4,"A total of 8 wolves, including an entire pack,...",sadness
5,"Up next, a trip to @Wolves 👊 #WOLLIV ❤️🥰 https...",love
6,The Wolf telling the Sheep how to avoid Wolves...,anger
7,Oregon officials ask public help to find kille...,sadness
8,Inside you there are two wolves https://t.co/G...,joy
9,Hey folks\nI've made the decision to keep writ...,joy
