In [None]:
%tensorflow_version 2.x

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tweepy as tw
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from tensorflow.keras.layers import TextVectorization, Normalization, Flatten, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
!pip install -q -U keras-tuner
import keras_tuner as kt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[K     |████████████████████████████████| 97 kB 4.1 MB/s 
[?25h

In [None]:
# Show (almost) full tweet text in rendered frames.
# The fully qualified option name is used because abbreviated names are resolved by
# pattern matching and may become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 1500)

In [None]:
# Load the labelled emotions dataset; the first row of the CSV is used as the header.
# NOTE(review): absolute Colab path — presumably requires Google Drive to be mounted at
# /content/drive, but no mount call is visible in this notebook. Confirm before re-running.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Twitter NLP Project/Emotions Dataset.csv", header=0)

In [None]:
# Preview the first rows to check the "tweet" and "sentiment" columns loaded as expected.
df.head()

Unnamed: 0,tweet,sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


Dataset Preprocessing

In [None]:
# Encoding each label
def label_encode(data,label):
    """Translate emotion names in one column into integer class ids.

    Args:
        data: DataFrame (or mapping of Series) containing the emotion column.
        label: Name of the column holding the emotion strings.

    Returns:
        A Series of class ids (joy=0, sadness=1, anger=2, fear=3, love=4,
        surprise=5). Values outside this vocabulary come back as NaN.
    """
    emotion_to_id = dict(joy=0, sadness=1, anger=2, fear=3, love=4, surprise=5)
    return data[label].map(emotion_to_id)

In [None]:
# Add an integer "label" column derived from the "sentiment" strings (mutates df in place).
df["label"] = label_encode(df, "sentiment")

In [None]:
# Display the frame to confirm the new "label" column sits alongside "sentiment".
df

Unnamed: 0,tweet,sentiment,label
0,i didnt feel humiliated,sadness,1
1,i can go from feeling so hopeless to so damned...,sadness,1
2,im grabbing a minute to post i feel greedy wrong,anger,2
3,i am ever feeling nostalgic about the fireplac...,love,4
4,i am feeling grouchy,anger,2
...,...,...,...
19995,im having ssa examination tomorrow in the morn...,sadness,1
19996,i constantly worry about their fight against n...,joy,0
19997,i feel its important to share this info for th...,joy,0
19998,i truly feel that if you are passionate enough...,joy,0


In [None]:
# Number of index buckets passed to one_hot(); also the Embedding layer's input_dim.
vocab_size = 10000
# Length (in tokens) every encoded tweet is padded to; also the Embedding layer's input_length.
# NOTE(review): 280 presumably mirrors Twitter's character limit — confirm, since the
# sequences here are counted in words rather than characters.
max_length = 280

In [None]:
# Function for text cleaning, stop-word removal, stemming, hashed word indexing and padding

def data_preparation(data,description):
    """Convert a text column into fixed-length integer sequences for the model.

    Each text has non-letter characters replaced by spaces, is lower-cased and
    split on whitespace; English stop words are dropped and the remaining
    words are Porter-stemmed. Every cleaned text is then encoded with
    ``one_hot`` (``vocab_size`` buckets) and the resulting sequences are
    padded at the front to ``max_length``.

    Args:
        data: DataFrame (or mapping) that holds the texts.
        description: Name of the column in ``data`` containing the raw text.

    Returns:
        The array produced by ``pad_sequences``: one row per text,
        ``max_length`` columns.

    NOTE(review): per the Keras docs ``one_hot`` hashes words with Python's
    built-in ``hash``, which is not stable between interpreter sessions —
    confirm before reusing a trained model in a fresh kernel.
    """
    stemmer=PorterStemmer()
    # Load the stop-word list once instead of once per word (the original
    # re-evaluated stopwords.words("english") inside the comprehension).
    # A set answers the same membership question in constant time.
    english_stopwords=set(stopwords.words("english"))

    corpus=[]

    for text in data[description]:
        text=re.sub("[^a-zA-Z]"," ",text)
        text=text.lower()
        text=text.split()

        text=[stemmer.stem(words)
             for words in text
              if words not in english_stopwords
             ]
        text=" ".join(text)
        corpus.append(text)

    oneHot_doc=[one_hot(input_text=words,n=vocab_size)
               for words in corpus
               ]

    embedded_doc=pad_sequences(sequences=oneHot_doc,
                              maxlen=max_length,
                              padding="pre")
    return embedded_doc

In [None]:
# Encode every tweet in the dataset into a padded integer sequence.
X = data_preparation(df, "tweet")

In [None]:
# Target vector: the integer emotion ids stored in the "label" column.
y = df["label"]

Train Test Split

In [None]:
# Hold out 20% of the data for testing, then 20% of the remainder for validation.
# Both splits are seeded: the original left the second split unseeded, so the
# train/validation partition changed on every run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

print(len(x_train), 'train examples')
print(len(x_test), 'test examples')
print(len(x_val), 'validation examples')

12800 train examples
4000 test examples
3200 validation examples


Build Model and Hyperparameter Tune

In [None]:
# Model builder function for hyperparameter tuning
def model_builder(hp):
  """Build and compile one candidate model from a set of tuner hyperparameters.

  Architecture: Embedding -> LSTM(128) -> 1-5 blocks of (Dense relu + Dropout)
  -> Dense(6, softmax). The six outputs correspond to the six emotion ids.

  Tuned hyperparameters (names as registered with the tuner):
    "output_dim:"    embedding width, 40-120 in steps of 10
    "Dense Layers"   number of Dense/Dropout blocks, 1-5
    "units_<j>"      units in block j, 32-256 in steps of 32
    "kernel_init<j>" initializer of block j, he_uniform or he_normal
    "drop_rate<j>"   dropout rate of block j, 0.1-0.5 in steps of 0.1
    "learnRate"      Adam learning rate, one of 0.01 / 0.001 / 0.0001

  Args:
    hp: Hyperparameter container supplied by the keras-tuner tuner.

  Returns:
    The compiled Sequential model (sparse categorical cross-entropy loss,
    accuracy metric).
  """
  model = keras.Sequential()
  # Map each word index (0..vocab_size-1) to a dense vector of tunable width.
  model.add(Embedding(input_dim=vocab_size, 
                      output_dim=hp.Int("output_dim:", min_value=40, max_value=120, step=10), input_length=max_length))

  # Fixed-size recurrent layer; its width is not tuned.
  model.add(LSTM(units=128))

  # Tune the number of Dense Layers
  # Tune the number of units in each layer - between 32-256
  # Tune the Dropout rate
  # Each block registers its own hyperparameters, suffixed with the block index j.

  for j in range(hp.Int("Dense Layers", min_value=1, max_value=5, step=1)):
        model.add(Dense(units=hp.Int("units_"+str(j), min_value=32, max_value=256, step=32),
            activation="relu", kernel_initializer=hp.Choice("kernel_init"+str(j), values=["he_uniform","he_normal"]))   
        )
        model.add(Dropout(rate=hp.Float("drop_rate"+str(j), min_value=0.1, max_value=0.5, step=0.1))
        )
  
  # Output layer: one probability per emotion class
  model.add(Dense(6, activation="softmax"))

  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  # The sparse loss is used because the targets are integer class ids, not one-hot vectors.
  model.compile(optimizer=Adam(learning_rate=hp.Choice("learnRate", values=[0.01,0.001,0.0001])),
        loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

  return model

In [None]:
# Instantiate a Hyperband tuner that ranks candidates by validation accuracy.
# NOTE(review): the next cell rebinds `tuner` to a RandomSearch instance, so this
# Hyperband tuner is never used for the search — confirm which tuner is intended.
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='twitter_NLP')

In [None]:
# Random-search tuner: tries 2 hyperparameter configurations (max_trials), builds and
# fits each one twice (executions_per_trial) and ranks them by validation accuracy.
# Results are written under twitter_NLP2/hypertuningNLP.
tuner=kt.tuners.RandomSearch(
    model_builder,
    objective="val_accuracy",
    max_trials=2,
    executions_per_trial=2,
    directory="twitter_NLP2",
    project_name="hypertuningNLP"
    )

In [None]:
# Early Stopping: end training once val_loss has not improved for 5 consecutive epochs.
# NOTE(review): the tuner search runs only 5 epochs per trial, so patience=5 cannot
# trigger there; it only takes effect in the longer 50-epoch fit.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
# Perform hyperparameter tuning: each trial trains for 5 epochs and is scored on the validation split
tuner.search(x_train, y_train, epochs=5, validation_data=(x_val, y_val), callbacks=[stop_early])

Trial 2 Complete [00h 20m 06s]
val_accuracy: 0.8690625131130219

Best val_accuracy So Far: 0.8770312368869781
Total elapsed time: 00h 42m 55s
INFO:tensorflow:Oracle triggered exit


In [None]:
# Fetch the top-ranked hyperparameter set from the tuner and report selected values
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

for heading, hp_name in (
    ("Dense Layers: ", "Dense Layers"),
    ("drop_rate0: ", "drop_rate0"),
    ("learnRate: ", "learnRate"),
    ("units_0: ", "units_0"),
):
  print(heading, best_hps.get(hp_name))

Dense Layers:  3
drop_rate0:  0.2
learnRate:  0.001
units_0:  32


Optimal Model

In [None]:
# Rebuild the network from the best hyperparameters and fit it for up to 50 epochs
# (the early-stopping callback may end training sooner)
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_data=(x_val, y_val),
    callbacks=[stop_early],
)

# 1-based epoch with the highest validation accuracy; ties resolve to the earliest epoch
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = 1 + val_acc_per_epoch.index(max(val_acc_per_epoch))
print('Best epoch: %d' % (best_epoch,))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Best epoch: 4


In [None]:
# Fresh, untrained model built from the same best hyperparameters
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model with best number of epochs
# (stop_early is still attached, so training could in principle end before best_epoch)
hypermodel.fit(x_train, y_train, epochs=best_epoch, validation_data=(x_val, y_val), callbacks=[stop_early])

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fb42d710090>

In [None]:
# Score the retrained model on the held-out test split.
# Given the compile() settings, the result is read downstream as [loss, accuracy].
eval_result = hypermodel.evaluate(x_test, y_test)



In [None]:
# Report the test metrics to four decimal places (index 0 = loss, index 1 = accuracy).
print(f"test loss: {eval_result[0]:.4f}")
print(f"test accuracy: {eval_result[1]:.4f}")

test loss: 0.4495
test accuracy: 0.8802


Predictions on new tweets

In [None]:
# Set up tweepy authorisation
# Credentials are read from environment variables so real secrets never have to be
# written into (or committed with) the notebook. The "X" placeholders are kept as
# fallbacks, so the cell behaves as before when the variables are not set.

api_key = os.environ.get("TWITTER_API_KEY", "X")
api_key_secret = os.environ.get("TWITTER_API_KEY_SECRET", "X")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "X")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "X")

auth = tw.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit makes tweepy pause instead of failing when the rate limit is hit
api = tw.API(auth,wait_on_rate_limit=True)

In [None]:
# Define the search term and the date_since date as variables
# Excluding retweets
search_words = "#WOLBRE -filter:retweets"
date_since = "2021-09-06"
count = 10

try:
  # Collect tweets (the cursor is lazy; the API is only queried while iterating below)
  tweets = tw.Cursor(api.search,
                q=search_words,
                lang="en",
                since=date_since).items(count)

  # Collect tweets, username, location and timestamp
  tweet_info = [[tweet.text, tweet.user.screen_name, tweet.user.location, tweet.created_at] for tweet in tweets]

  tweet_df = pd.DataFrame(data=tweet_info,
                      columns=['tweet', 'user', 'location', 'timestamp'])

  # An empty result has no datetime values, so the .dt accessor would fail; only format when rows exist
  if not tweet_df.empty:
    tweet_df['timestamp'] = tweet_df['timestamp'].dt.strftime('%d-%m-%Y %H:%M:%S')

except Exception as e:
  # Exception (not BaseException) so that kernel interrupts still propagate.
  print('failed on_status,',str(e))
  # Fall back to an empty frame so later cells that read tweet_df still find it defined.
  # The original slept for 3 seconds here, but `time` is never imported and nothing is
  # retried afterwards, so the pause only turned the failure into a NameError.
  tweet_df = pd.DataFrame(columns=['tweet', 'user', 'location', 'timestamp'])

In [None]:
# Function for different twitter queries
def twitter_search(query_term, date_since="2021-09-06", count=10):
  """Search recent English tweets for a term, excluding retweets and replies.

  Args:
    query_term: Search text; the retweet/reply filters are appended to it.
    date_since: Earliest date (YYYY-MM-DD string) passed to the search API.
    count: Maximum number of tweets to fetch.

  Returns:
    DataFrame with columns 'tweet', 'user', 'location' and 'timestamp'
    (timestamp formatted as 'dd-mm-YYYY HH:MM:SS'). If the request fails the
    error is printed and an empty frame with the same columns is returned.
  """
  search_words = (query_term+" -filter:retweets -filter:replies")
  result_columns = ['tweet', 'user', 'location', 'timestamp']

  try:
    # Collect tweets (the cursor is lazy; the API is only queried while iterating below)
    tweets = tw.Cursor(api.search,
                  q=search_words,
                  lang="en",
                  since=date_since).items(count)

    # Collect tweets, username, location and timestamp
    tweet_info = [[tweet.text, tweet.user.screen_name, tweet.user.location, tweet.created_at] for tweet in tweets]

  except Exception as e:
    # Exception (not BaseException) so that kernel interrupts still propagate.
    # The original fell through to `return tweet_df` with the name unbound, and called
    # time.sleep() although `time` is never imported; return an empty frame instead.
    print('failed on_status,',str(e))
    return pd.DataFrame(columns=result_columns)

  tweet_df = pd.DataFrame(data=tweet_info, columns=result_columns)

  # An empty result has no datetime values, so the .dt accessor would fail; only format when rows exist
  if not tweet_df.empty:
    tweet_df['timestamp'] = tweet_df['timestamp'].dt.strftime('%d-%m-%Y %H:%M:%S')

  return tweet_df

In [None]:
# Example query: fetch recent tweets mentioning "Benrahma" and display them.
twitter_search('Benrahma')

Unnamed: 0,tweet,user,location,timestamp
0,"Hi there,\nDo you need modern and unique minimalist Logo design?\nPlease order here. https://t.co/ea6ayWzRyp… https://t.co/EbyZhnJq91",bishajikumar50,"Dhaka, Bangladesh",18-09-2021 12:29:45
1,Also Brentford brilliant to watch. Changed the system to a back 3 after being 433 for so long and let’s not forget… https://t.co/vlZ1lD314R,Higginbotham05,andy@tripleamedia.com,18-09-2021 12:28:36
2,Brentford sold Watkins and Benrahma for 60M and bought Ivan Toney who has already outscored both former Brentford f… https://t.co/6Vhpb3YYIu,CazorlaRoba,"Nairobi, Kenya",18-09-2021 12:20:28
3,8 players playing in FPL this week:\n\nTsimikas won't start\nGrealish may be rested\nBenrahma injured\nSemedo Wolves https://t.co/h5TPOF1awU,akshayt19nayak,Mumbai,18-09-2021 12:18:25
4,"West Ham take on Manchester United, Benrahma Injury and that night in Zagreb: https://t.co/3dgeLHtR0l",WHUNewsApp,Boleyn Ground,18-09-2021 12:17:34
5,Yeah and I benched him hopefully Benrahma can’t go https://t.co/lcm3NDPCSw,krishdatwani,🇮🇳🇬🇭,18-09-2021 12:15:02
6,Please be injured Benrahma. Please.,FPL_Bono,"England, United Kingdom",18-09-2021 12:11:42
7,My team tomorrow if Benrahma out:\nAreola \nDiop\nOgbonna \nZouma\nCoufal \nMasuaku\nSoucek\nRice\nBowen \nVlasic\nFornals,westhamonline5,,18-09-2021 12:11:41
8,"Hi there,\nDo you need modern and unique minimalist Logo design?\nPlease order here. https://t.co/ea6ayWzRyp… https://t.co/HODJrRK4gA",bishajikumar50,"Dhaka, Bangladesh",18-09-2021 12:11:10
9,"Sell Benrahma for a profit, buy back Toney at a discount?\n\nGaining 0.2 (maybe) while missing out on over 30 points.… https://t.co/H2PUg4adPF",WaltSaysStuff,,18-09-2021 12:09:51


Function which takes a single sentence and predicts the sentiment

In [None]:
def predict_new_sentence(model, new_sentence=None):
  """Predict the emotion label of a single sentence.

  Args:
    model: Trained classifier whose predict() returns one row of six class
      scores per input sequence.
    new_sentence: Sentence to classify. When omitted the user is prompted
      for one, which is the original interactive behaviour.

  Returns:
    One-row DataFrame with columns "tweet" (the sentence) and
    "Predicted Label" (the emotion name).
  """
  if new_sentence is None:
    new_sentence = input("Enter a sentence: ")

  # The unused "prediction" placeholder column of the original is no longer created.
  tweet_df = pd.DataFrame([new_sentence], columns=['tweet'])

  X = data_preparation(tweet_df, "tweet")

  # The highest-scoring column of each prediction row is the class id
  y_pred = np.argmax(model.predict(X), axis=1)

  id_to_emotion = {0:"joy", 1:"sadness", 2:"anger", 3:"fear", 4:"love", 5:"surprise"}
  predicted_label = pd.Series(y_pred).map(id_to_emotion).rename("Predicted Label")

  predict_df=pd.concat([tweet_df["tweet"], predicted_label],
                    axis=1)

  return predict_df

In [None]:
# Interactive demo: prompts for a sentence and shows its predicted emotion.
# NOTE(review): this passes `model` (the 50-epoch fit), not `hypermodel` (the retrained
# model whose test accuracy is reported earlier) — confirm which one is intended.
predict_new_sentence(model)

Enter a sentence: He feels like green and blue are just so happy and kind of brilliant.


Unnamed: 0,tweet,Predicted Label
0,He feels like green and blue are just so happy and kind of brilliant.,joy


Function which takes a dataframe of tweets and predicts the sentiment for each tweet

In [None]:
def predict_tweets(model, dataframe):
  """Predict an emotion label for every tweet in a DataFrame.

  Args:
    model: Trained classifier whose predict() returns one row of six class
      scores per input sequence.
    dataframe: DataFrame with a "tweet" column of raw tweet text.

  Returns:
    DataFrame with the "tweet" column of ``dataframe`` plus a
    "Predicted Label" column holding the emotion name for each row. An
    empty input yields an empty result without calling the model.
  """
  id_to_emotion = {0:"joy", 1:"sadness", 2:"anger", 3:"fear", 4:"love", 5:"surprise"}

  # Use the frame that was passed in; the original ignored `dataframe` and read the
  # notebook-global `tweet_df` instead.
  X = data_preparation(dataframe, "tweet")

  # Predict the whole batch once. The original repeated the identical batch prediction
  # once per row and left `predict` undefined when there were no rows.
  y_pred = np.argmax(model.predict(X), axis=1) if len(X) else []

  predict_df = dataframe[["tweet"]].copy()
  # Assign by position so a non-default index on `dataframe` cannot misalign the labels
  predict_df["Predicted Label"] = [id_to_emotion[int(class_id)] for class_id in y_pred]

  return predict_df

In [None]:
# Label the tweets held in the notebook-level tweet_df (filled by the #WOLBRE search cell).
predict_tweets(model, tweet_df)

Unnamed: 0,tweet,Predicted Label
0,Who’s a better striker👇🏿\n❤️ for Ivan Toney\n🔃 for Anthony Martial #WOLBRE https://t.co/c0IFmzXpMw,joy
1,My Prediction \n\nFT: Wolves 3 - 3 Brentford \n\n#WOLBRE,anger
2,when you think your life is bad\n\njust think about those who bought Raul Jimenez in FPL… #WOLBRE https://t.co/mjVAlG7xkq,love
3,"Against #MUFC, #Wolves played like they would be earning 15pts in that game, they gave everything with high tempo f… https://t.co/Ok5Xr5TXzp",joy
4,Ivan Toney is on 🔥🔥! \nGet in there....\n#FPL #WOLBRE @BigManBakar,anger
5,Mastering Chelsea’s system isn’t easy or wolves are shit take ur pick #WOLBRE,sadness
6,Shit at the back. Shit in front of goal.\n\n🐺🐝 #WOLBRE #WWFC,anger
7,"Okay, Brentford are def legit. #WOLBRE",anger
8,Jimenez is wasteful #WOLBRE,anger
9,Brentford look like the real deal #WOLBRE,anger


In [None]:
# Combining functions to be able to search for a specific term before sentiment analysis

def search_and_predict_tweets(query, model):
  """Search Twitter for a term and predict an emotion label for each result.

  Args:
    query: Search term handed to twitter_search().
    model: Trained classifier whose predict() returns one row of six class
      scores per input sequence.

  Returns:
    DataFrame with a "tweet" column and a "Predicted Label" column holding
    the emotion name for each tweet. When the search returns no tweets the
    result is empty and the model is not called.
  """
  id_to_emotion = {0:"joy", 1:"sadness", 2:"anger", 3:"fear", 4:"love", 5:"surprise"}

  tweet_df = twitter_search(query)
  X = data_preparation(tweet_df, "tweet")

  # Predict the whole batch once. The original repeated the identical batch prediction
  # once per tweet and left `predict` undefined when the search returned nothing.
  y_pred = np.argmax(model.predict(X), axis=1) if len(X) else []

  predict_df = tweet_df[["tweet"]].copy()
  # Assign by position so the labels always line up with the tweets they belong to
  predict_df["Predicted Label"] = [id_to_emotion[int(class_id)] for class_id in y_pred]

  return predict_df

In [None]:
# Example: fetch recent tweets mentioning "Wolves" and label each with a predicted emotion.
search_and_predict_tweets('Wolves', model)

Unnamed: 0,tweet,Predicted Label
0,Wolves have failed to have a shot on target in a Premier League game at Molineux for just the third time since the… https://t.co/ddJPhHWlzo,joy
1,wolves? whoa,sadness
2,Wolves need three unanswered goals in the second half for our first 3u bet to lose.. How are we looking? 👀,joy
3,Mbeumo makes it 2-0 to Brentford against Wolves https://t.co/K3zhd2tEwW https://t.co/jABsaHqdQw,joy
4,First Winter Weather Advisory for Alaska for its 2021-22 Snow season #AKwx #USwx https://t.co/KJh1rP6AJL,love
5,“This is the week the Wolves will turn it around. They can’t keep out-chancing their opponents and losing. They’ll… https://t.co/w0w9qGtzZq,anger
6,Wolves wtf??? Smh. .,sadness
7,Brentford deserving there lead at half-time against Wolves ⚽️,love
8,Who else doesn’t own any Wolves assets 🙋🏻‍♂️ 😉,sadness
9,Great 1st half. \n\n2-0 away to Wolves. \n\nMore of the same for the 2nd half please 🐝\n\nIvan Toney 🔥,joy
