## Import libraries

In [1]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, InputLayer, Activation, Dropout, Dense, Bidirectional
from keras.layers.embeddings import Embedding
from keras.models import Model
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yeopu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yeopu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preprocessing

#### Convert to Lowercase

In [2]:
def to_lower(df):
    df['headlines'] = df['headlines'].str.lower()
    
    return df

#### Stop Words Removal

In [3]:
def remove_stopwords(data):
    data['headlines'] = data['headlines'].apply(lambda x : ' '.join([word for word in x.split() if word not in (stopwords)]))
    return data

#### Punctuations Removal

In [4]:
def remove_tags(data):
    data['headlines'] = data['headlines'].str.replace('[{}]'.format(string.punctuation), ' ')
    
    return data

#### Transform Target Variable  

In [5]:
def transform(data):
    for i in data.index:
        if data['sentiment'][i] == 1:
            data['sentiment'][i] = 2
        
        elif data['sentiment'][i] == 0:
            data['sentiment'][i] = 1
        
        else:
            data['sentiment'][i] = 0
    
    return data

## Tokenization

In [6]:
def tokenize(X):
    
    X_tokens = []
    
    for i in range(len(X)):
        tokens = nltk.word_tokenize(X[i])
        X_tokens.append(tokens)
        
    return X_tokens

## GloVe Embedding

In [7]:
def load_glove():
    
    glove_vectors = dict()
    file = open('glove.6B.100d.txt',  encoding='UTF-8')

    for line in file:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        glove_vectors[word] = vectors
    file.close()
    
    return glove_vectors

In [8]:
def embed(data):
    
    glove_vectors = load_glove()
    dim = glove_vectors["random"].shape[0]
    
    X = np.zeros((len(data), 64, dim))
    

    for i in range(len(data)):
        for j in range (len(data[i])):
            try:
                vector = glove_vectors.get(data[i][j])
            except KeyError:
                vector = glove_vectors.get("<unk>")
                X[i][j] = np.array(vector)
            
            if vector is not None:
                X[i][j] = np.array(vector)
                
    return X

In [9]:
def prepare(data):
    data = to_lower(data)
    data = remove_stopwords(data)
    data = remove_tags(data)
    data = transform(data)
    
    X = []
    for i in range(len(data['headlines'])):
        X.append(data['headlines'][i])
    
    Y = np.array(list(data['sentiment']))
    Y = to_categorical(Y)

    X = tokenize(X)
    X = embed(X)
    
    return X, Y

In [12]:
test = pd.read_csv('sentiment/test.csv').set_axis(['headlines', 'sentiment'], axis=1, inplace=False)
X_test, Y_test = prepare(test)

  data['headlines'] = data['headlines'].str.replace('[{}]'.format(string.punctuation), ' ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'][i] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'][i] = 1


In [13]:
X_test

array([[[-0.42151999, -0.49518999,  0.09943   , ..., -0.36506   ,
         -0.098772  ,  0.60898   ],
        [ 0.61084998, -0.52956003, -0.59061998, ..., -0.91993999,
          0.39991   ,  0.083406  ],
        [-0.27063999,  0.0051896 ,  0.1497    , ..., -0.23097999,
          0.54587001,  0.49992001],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[-0.024221  , -0.034855  ,  0.35710001, ..., -0.087568  ,
          0.25961   ,  0.050783  ],
        [ 0.36204001,  0.43627   ,  0.10537   , ...,  0.19543   ,
          0.37797001,  0.40605   ],
        [ 0.11945   ,  0.41922   , -0.04461   , ...,  0.17206   ,
          0.17687   , -0.44743001],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [14]:
Y_test

array([[0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

In [15]:
model = tf.keras.models.load_model('BiLSTM')

In [16]:
model.evaluate(X_test, Y_test)



[0.8584498763084412, 0.63163161277771]

## Model Inference (Single Headline)

In [10]:
def preprocess(headline):
    headline = headline.lower()
    headline = ' '.join([word for word in headline.split() if word not in stopwords])
    headline = re.sub(r'[^\w\s]','',headline)
    headline_tokens = nltk.word_tokenize(headline)

    glove_vectors = load_glove()
    dim = glove_vectors["random"].shape[0]
    
    X = np.zeros((64, 100))
    

    for i in range(len(headline_tokens)):
        try:
            vector = glove_vectors.get(headline_tokens[i])
        
        except KeyError:
            vector = glove_vectors.get("<unk>")
            X[i] = np.array(vector)
        
        if vector is not None:
            X[i] = np.array(vector)
                
    X = np.reshape(X, (-1,64,100))
    
    return X

In [11]:
def get_prediction(headline):
    
    X = preprocess(headline)
    model = tf.keras.models.load_model('BiLSTM')
    pred = model.predict(X)
    
    if np.argmax(pred) == 0:
        result = "Neutral"
        
    elif np.argmax(pred) == 1:
        result = "Positive"
        
    else:
        result = "Negative"
    
    return result

In [17]:
headline = "Government allocates RM10m to revive retail sector, says PM"
sentiment = get_prediction(headline)

print(sentiment)

Positive
