In [None]:
import sys
import numpy
import sklearn
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional, LSTM, Embedding, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import re
from random import randint, choice

In [None]:
#Load the COVID-19 tweet training CSV and keep only the OriginalTweet and Sentiment columns
training_data = pd.read_csv(
    "drive/My Drive/Colab Notebooks/526_files/project/Corona_NLP_train.csv",
    encoding='latin-1',
)[['OriginalTweet', 'Sentiment']]

In [None]:
#Load the COVID-19 tweet test CSV and keep only the OriginalTweet and Sentiment columns
test_data = pd.read_csv(
    "drive/My Drive/Colab Notebooks/526_files/project/Corona_NLP_test.csv",
    encoding='latin-1',
)[['OriginalTweet', 'Sentiment']]

In [None]:
#Collect the text of every training tweet into a plain list; corpus is a separate shallow copy of it
all_tweets = list(training_data['OriginalTweet'].values)
corpus = list(all_tweets)

In [None]:
def clean_data(data):
  """Return ``data`` with every hyperlink removed.

  A hyperlink here is "http" followed by one or more non-whitespace
  characters; each such run is replaced by the empty string, so the
  whitespace around it is left untouched.
  """
  link_pattern = re.compile(r"http\S+")
  return link_pattern.sub("", data)

In [None]:
#Shared tokenizer: lower-cases text, splits on single spaces and drops the punctuation/special characters listed in filters
t = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    oov_token=None,
    document_count=0,
)

In [None]:
def tokenize_data(data):
  """Fit the shared tokenizer ``t`` on ``data`` and build n-gram prefix sequences.

  Every text is stripped of hyperlinks, converted to a list of token ids and
  expanded into all of its prefixes of length 2 and up, so the last id of
  each prefix can later serve as the "next word" for the ids before it.

  Args:
    data: Iterable of text strings.

  Returns:
    A tuple ``(sequences, words)`` where ``sequences`` is the list of token-id
    prefixes and ``words`` is ``len(t.word_index) + 1``.
  """
  # NOTE(review): the vocabulary is fitted on the raw text, while the sequences
  # below are built from hyperlink-stripped text, so link fragments may occupy
  # vocabulary slots that never occur in any sequence. Fitting on the cleaned
  # text would shift word indices - confirm against any saved model first.
  t.fit_on_texts(data)
  vocab_size = len(t.word_index) + 1
  cleaned_lines = [clean_data(line) for line in data]
  sequences = [
    token_ids[:end]
    for token_ids in t.texts_to_sequences(cleaned_lines)
    for end in range(2, len(token_ids) + 1)
  ]
  return sequences, vocab_size

In [None]:
#input_sequences : token-id prefixes (n-grams of length 2 and up) built from every tweet in the corpus
#words : size of the tokenizer's word index plus one
input_sequences, words = tokenize_data(corpus)

In [None]:
#Vocabulary size returned by tokenize_data (entries in the tokenizer's word index, plus one)
print(words)

85199


In [None]:
def generate_labelled_sequences(sequences):
    """Pad the token sequences to a common length and split off the last token as label.

    Args:
        sequences: List of token-id lists.

    Returns:
        A tuple ``(pred, label, max_seq_len)``: ``pred`` holds every padded
        sequence without its final column, ``label`` holds that final column
        (the next word to predict), and ``max_seq_len`` is the length of the
        longest input sequence.
    """
    max_seq_len = max(map(len, sequences))
    # 'pre' padding keeps the real tokens right-aligned, so the final column
    # is always the last token of the original sequence.
    padded = np.array(pad_sequences(sequences, maxlen=max_seq_len, padding='pre'))
    pred = padded[:, :-1]
    label = padded[:, -1]
    return pred, label, max_seq_len

In [None]:
#predictors : every padded sequence without its final token
#label : the final token of each padded sequence, i.e. the next word to predict
#max_seq_len : length of the longest sequence before padding
predictors, label, max_seq_len = generate_labelled_sequences(input_sequences)

In [None]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    The stack is: a 10-dimensional embedding over ``total_words`` indices for
    inputs of length ``max_sequence_len - 1``, a 100-unit LSTM, dropout at
    rate 0.1, and a softmax layer with one output per vocabulary index.

    Args:
        max_sequence_len: Length of the padded sequences including the label
            token; the model input is one token shorter.
        total_words: Number of vocabulary indices (embedding input size and
            softmax output size).

    Returns:
        The model, compiled with sparse categorical cross-entropy loss and
        the adam optimizer.
    """
    stack = (
        Embedding(total_words, 10, input_length=max_sequence_len - 1),
        LSTM(100),
        Dropout(0.1),
        Dense(total_words, activation='softmax'),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return model

In [None]:
#Build the model sized to the longest sequence and the vocabulary, then print its layer summary
model = create_model(max_seq_len, words)
model.summary()
#Fit the model on the predictor sequences and their next-word labels for 100 epochs
#NOTE(review): EarlyStopping is imported at the top of the notebook but no callback is passed here
model.fit(predictors, label, epochs=100, verbose=5)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 64, 10)            851990    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 85199)             8605099   
Total params: 9,501,489
Trainable params: 9,501,489
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7f69b03a4dd8>

In [None]:
#Save the trained model (relative path, i.e. into the current working directory) so it can be loaded later
#NOTE(review): the reload cell below reads from a path under 'drive/My Drive/', not from this location - confirm the file is copied there
model.save('my_model_100.h5')

In [None]:
#Load the saved LSTM model that is used to generate tweets
#NOTE(review): this path differs from the one model.save() writes to above - confirm this file is the model that was just trained
from keras.models import load_model
model = load_model('drive/My Drive/Colab Notebooks/526_files/project/my_model_100.h5')

In [None]:
def generate_tweet(seed, num_words, model, max_seq_len):
    """Extend ``seed`` with ``num_words`` model-predicted words and return the text.

    On every step the text generated so far is tokenized with the
    module-level tokenizer ``t``, padded (``padding='pre'``) to
    ``max_seq_len - 1`` tokens and fed to ``model``; the index with the
    highest predicted score is mapped back to a word and appended.

    Args:
        seed: Starting text entered by the user.
        num_words: Number of prediction steps to run.
        model: Model exposing ``predict``; the arg-max over the last axis of
            its output is taken as the predicted word index.
        max_seq_len: Sequence length used when the training data was padded;
            the model input is one token shorter.

    Returns:
        ``seed`` followed by a single space and the predicted word for every
        step. A step whose index has no entry in ``t.word_index`` contributes
        an empty word, i.e. only the space.
    """
    # Invert the vocabulary once per call instead of scanning word_index on
    # every generated word.
    index_to_word = {index: word for word, index in t.word_index.items()}

    for _ in range(num_words):
        token_list = t.texts_to_sequences([seed])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        # Sequential.predict_classes() was removed in TensorFlow 2.6; for a
        # softmax output it is the arg-max of predict() over the last axis.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        if 3 <= predicted <= 7:
            # NOTE(review): indices 3-7 are swapped for a random index drawn
            # from 0-2 or 8..max_seq_len-1. The upper bound is the sequence
            # length rather than the vocabulary size, randint raises if
            # max_seq_len < 9, and index 0 presumably maps to no word -
            # confirm this substitution is intended.
            predicted = choice([randint(0,2), randint(8,max_seq_len-1)])
        seed = seed + " " + index_to_word.get(predicted, '')
    return seed


In [None]:
def generate_new_tweets(data, num_tweets=100, seed_words=7, num_words=100):
  """Generate a tweet for each of the first rows of ``data`` and print it next to the original.

  For every row, hyperlinks are stripped from the ``OriginalTweet`` column,
  the first ``seed_words`` space-separated pieces form the seed, and
  ``generate_tweet`` extends that seed using the module-level ``model`` and
  ``max_seq_len``.

  Args:
    data: DataFrame with an ``OriginalTweet`` column.
    num_tweets: Number of leading rows to process (default 100).
    seed_words: Number of leading pieces of the cleaned tweet used as the
      seed (default 7).
    num_words: Number of words to generate per tweet (default 100).

  Returns:
    List of the generated tweets, in row order.
  """
  new_generated_tweets = []
  # Use a distinct loop name so the ``data`` argument is not shadowed.
  for _, row in data.head(num_tweets).iterrows():
    # Clean once; the same text is both the seed source and the reference.
    actual_tweet = clean_data(row['OriginalTweet'])
    seed_text = " ".join(actual_tweet.split(" ")[0:seed_words])
    generated_tweet = generate_tweet(seed_text, num_words, model, max_seq_len)
    print("actual = " + str(actual_tweet))
    print("generated = " + str(generated_tweet))
    new_generated_tweets.append(generated_tweet)
  return new_generated_tweets

In [None]:
#test_tweets : tweets generated from the first 7 words of each of the first 100 test tweets
#The call also prints every actual tweet next to the tweet generated from its seed
test_tweets = generate_new_tweets(test_data)

actual = TRENDING: New Yorkers encounter empty supermarket shelves (pictured, Wegmans in Brooklyn), sold-out online grocers (FoodKick, MaxDelivery) as #coronavirus-fearing shoppers stock up  
generated = TRENDING: New Yorkers encounter empty supermarket shelves in the uk of the coronavirus pandemic get a full time to be a new normal you can get a message to the grocery store the shelves has been a nightmare i have a list of the front line in the store food bank by the full of the way to get a few weeks to be a time coronavirus the life is a good idea  a lot of people who are going to be able to get a way to get food out of course to get a few weeks no one is a good idea  at
actual = When I couldn't find hand sanitizer at Fred Meyer, I turned to #Amazon. But $114.97 for a 2 pack of Purell??!!Check out how  #coronavirus concerns are driving up prices. 
generated = When I couldn't find hand sanitizer at the grocery store   during the coronavirus pandemic the world is a good idea prices ar

In [None]:
#train_tweets : tweets generated from the first 7 words of each of the first 100 training tweets
#The call also prints every actual tweet next to the tweet generated from its seed
train_tweets = generate_new_tweets(training_data)

actual = @MeNyrbie @Phil_Gahan @Chrisitv  and  and 
generated = @MeNyrbie @Phil_Gahan @Chrisitv  and  and online shopping for a new normal people my account to get a maximum habit of course   my office is a great idea for the coronavirus pandemic during this time  from the coronavirus by the virus are you can be aware of the virus has been a victim of the virus i have a hotline on the website for the next few weeks the way to be able to get a account of the consumer during the pandemic we are offering a lawsuit in the us  the way to be able to get a account to the
actual = advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order
generated = advice Talk to your neighbours family to get to the supermarket to get a pharmacy your family for the coronavirus to get a message to the grocery store is a great idea co