# Task 2: Sentiment analysis of News Statements & Headlines

### Import necessary libraries

In [1]:
import json
import math
import string
import os
from pprint import pprint
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as pltfrom
from sklearn.metrics.pairwise import cosine_similarity

from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.corpus import stopwords 
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

Using TensorFlow backend.


{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

### If NLTK is not installed , download NLTK library

In [2]:
#if already downloaded this step is not required
#nltk.download()

### Helper functions
  1. Get stopwords from English and remove them from input data
  2. Remove punctuations and non-ascii characters from input data
  3. Get Avg no of words in one review 
  4. Calulating cosine similarity and cosine weight for similarity evaluation

In [3]:
stop = stopwords.words('english')
def removeStopWords(inputString):
    splitString = inputString.split()
    for word in stop:
        for s in splitString:
            if s == word:
                splitString.remove(s)
    return " ".join(splitString)



def stringClean( inputString ) :
    inputString = removeStopWords(inputString)
    return inputString.lower().translate(str.maketrans('','',string.punctuation)).encode("ascii", errors="ignore").decode()


def avgWords( x_train ) :
    reviewLength = 0
    for x in x_train:
        reviewLength = len(x.split())+ reviewLength
    return np.rint(reviewLength/ len(x_train)).astype(int)


### Create X_train and X_test
1. Load dataset from json file
2. Prepare X_train and X_test using data processing functions above
3. Convert Y_train in required format

In [4]:
with open("Headline_Trainingdata.json") as f:
    reviews_train = json.load(f)

with open("Headline_Trialdata.json") as f:
    reviews_trial = json.load(f)

with open("Headlines_Testdata.json", encoding="utf8") as f:
    reviews_test = json.load(f)

reviewLength = 0

X_train = []
#X_test = [stringClean(review['title']).replace(review['company'].lower(), "") for review in reviews_test]
Y_train = []

for review in reviews_train :
    X_train.append(stringClean(review['title']).replace(review['company'].lower(),""))
    Y_train.append(0 if review['sentiment']<0 else 1)
    
for review in reviews_trial :
    reviews_train.append(review)
    X_train.append(stringClean(review['title']).replace(review['company'].lower(),""))
    Y_train.append(0 if review['sentiment']<0 else 1)
    

In [5]:
reviewLength = avgWords(X_train)
#print("Avg Length of title:",reviewLength)
#print(X_train)

###  Tokenize the data and pad review if it has less than avg length

In [6]:
tokenizer = Tokenizer(num_words=reviewLength*50)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
data = pad_sequences(sequences, maxlen=(reviewLength))

### Build model
   1. Embedding layer , 1 LSTM and 1 Dense layer
   2. Dense layer has 'tanh' as activation function.
   3. tanh function is a sigmoidal (“s”-shaped), with outputs values in range (-1, 1).

In [7]:
model = Sequential()
model.add(Embedding(reviewLength*50, 16, input_length=reviewLength))
model.add(LSTM(16, dropout=0.35, recurrent_dropout=0.35))
#model.add(Dense(8, activation='linear',kernel_regularizer=regularizers.l2(0.5), activity_regularizer=regularizers.l1(0.5)))
#model.add(Dense(4, activation='linear'))
model.add(Dense(1, activation='tanh'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 7, 16)             5600      
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                2112      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 7,729
Trainable params: 7,729
Non-trainable params: 0
_________________________________________________________________


### Train model with validation split 0.25 and 25 epochs

In [8]:
history = model.fit(data, np.array(Y_train),validation_split=0.3, epochs=25 ,verbose = 2,shuffle = True)

#plot_training_curves(history.history);

Train on 809 samples, validate on 347 samples
Epoch 1/25
2s - loss: 1.9009 - acc: 0.3832 - val_loss: 1.1060 - val_acc: 0.4323
Epoch 2/25
0s - loss: 1.0121 - acc: 0.3832 - val_loss: 0.8265 - val_acc: 0.4323
Epoch 3/25
0s - loss: 0.7780 - acc: 0.4302 - val_loss: 0.7096 - val_acc: 0.4957
Epoch 4/25
0s - loss: 0.6974 - acc: 0.5624 - val_loss: 0.6951 - val_acc: 0.5562
Epoch 5/25
0s - loss: 0.6691 - acc: 0.5958 - val_loss: 0.6825 - val_acc: 0.5879
Epoch 6/25
0s - loss: 0.6583 - acc: 0.6106 - val_loss: 0.6685 - val_acc: 0.6052
Epoch 7/25
0s - loss: 0.6307 - acc: 0.6663 - val_loss: 0.6565 - val_acc: 0.6196
Epoch 8/25
0s - loss: 0.6068 - acc: 0.6885 - val_loss: 0.6415 - val_acc: 0.6369
Epoch 9/25
0s - loss: 0.5782 - acc: 0.7169 - val_loss: 0.6246 - val_acc: 0.6599
Epoch 10/25
0s - loss: 0.5433 - acc: 0.7565 - val_loss: 0.6098 - val_acc: 0.6686
Epoch 11/25
0s - loss: 0.5118 - acc: 0.7911 - val_loss: 0.5962 - val_acc: 0.6830
Epoch 12/25
0s - loss: 0.4819 - acc: 0.7701 - val_loss: 0.5891 - val_acc

In [9]:
np.random.seed(400)
temp = np.random.randint(0,len(reviews_test), size=100)
X_test = []
randomData = []
for i in temp:
    randomData.append(reviews_test[i])
    X_test.append(stringClean(reviews_test[i]['title']).replace(reviews_test[i]['company'].lower(), ""))


### Tokenize Testing data and predict sentiment scores for test data

In [10]:
sequences = tokenizer.texts_to_sequences(X_test)
data = pad_sequences(sequences, maxlen=reviewLength)
testPredictions = model.predict(data)

### Create a dataframe of results and create a csv file

In [12]:

testData = pd.DataFrame.from_dict(randomData)
testData['Predicted Sentiment Score'] = testPredictions
testData = testData[['id', 'company', 'title','Predicted Sentiment Score']]

if os.path.exists("TestPredictions.csv"):
    os.remove("TestPredictions.csv")

testData.to_csv("TestPredictions.csv", index = False)

In [13]:
testData

Unnamed: 0,id,company,title,Predicted Sentiment Score
0,1492,Shell,Shell's Arctic return faces hurdle at Seattle ...,-0.335834
1,1607,Diageo,Britain's FTSE recovers two-month low after Di...,0.290011
2,1462,Amazon,Jim Armitage: Amazon is Primed to deliver pain...,0.472891
3,1585,Standard Life,Industry NewsStandard Life enjoys strong inflo...,0.861770
4,1284,Standard Life,Standard Life Elevates,0.472891
5,1551,SABMiller,SABMiller revenue hit by weaker EM currencies,0.159084
6,1406,Johnson Matthey,Johnson Matthey share price slumps as company ...,0.384013
7,1389,AstraZeneca,Why AstraZeneca plc & Dixons Carphone PLC Are ...,0.934580
8,1280,Standard Life,Property Stocks Slump After Standard Life Free...,-0.123446
9,1251,Berkshire Hathaway Inc.,Berkshire holders hit Buffett with hard questions,0.174216
