# Task 2: Sentiment analysis of News Statements & Headlines

### Import necessary libraries

In [1]:
import json
import math
import string
import os
from pprint import pprint
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as pltfrom
from sklearn.metrics.pairwise import cosine_similarity

from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.corpus import stopwords 
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

Using TensorFlow backend.


{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

### If NLTK is not installed , download NLTK library

In [2]:
#if already downloaded this step is not required
#nltk.download()

### Helper functions
  1. Get stopwords from English and remove them from input data
  2. Remove punctuations and non-ascii characters from input data
  3. Get Avg no of words in one review 
  4. Calulating cosine similarity and cosine weight for similarity evaluation

In [3]:
stop = stopwords.words('english')
def removeStopWords(inputString):
    splitString = inputString.split()
    for word in stop:
        for s in splitString:
            if s == word:
                splitString.remove(s)
    return " ".join(splitString)



def stringClean( inputString ) :
    inputString = removeStopWords(inputString)
    return inputString.lower().translate(str.maketrans('','',string.punctuation)).encode("ascii", errors="ignore").decode()


def calculateCosineSimilarity( pVal,gVal ):
    similarity = cosine_similarity(pVal,gVal)
    pNorm = np.linalg.norm(pVal)
    gNorm = np.linalg.norm(gVal)
    wt = pNorm/gNorm
    return wt * similarity[0][0]
    

def avgWords( x_train ) :
    reviewLength = 0
    for x in x_train:
        reviewLength = len(x.split())+ reviewLength
    return np.rint(reviewLength/ len(x_train)).astype(int)


### Create X_train and X_test
1. Load dataset from json file
2. Prepare X_train and X_test using data processing functions above
3. Convert Y_train in required format

In [4]:
with open("Headline_Trainingdata.json") as f:
    reviews_train = json.load(f)

with open("Headline_Trialdata.json") as f:
    reviews_trial = json.load(f)

with open("Headlines_Testdata.json", encoding="utf8") as f:
    reviews_test = json.load(f)

reviewLength = 0

X_train = []
X_test = [stringClean(review['title']).replace(review['company'].lower(), "") for review in reviews_test]
output = []
Y_train = []

for review in reviews_train :
    X_train.append(stringClean(review['title']).replace(review['company'].lower(),""))
    Y_train.append(0 if review['sentiment']<0 else 1)
    output.append(review['sentiment'])
    
for review in reviews_trial :
    reviews_train.append(review)
    X_train.append(stringClean(review['title']).replace(review['company'].lower(),""))
    Y_train.append(0 if review['sentiment']<0 else 1)
    output.append(review['sentiment'])
    

In [5]:
reviewLength = avgWords(X_train)
#print("Avg Length of title:",reviewLength)
#print(X_train)

###  Tokenize the data and pad review if it has less than avg length

In [6]:
tokenizer = Tokenizer(num_words=reviewLength*50)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
data = pad_sequences(sequences, maxlen=(reviewLength))

### Build model
   1. Embedding layer , 1 LSTM and 1 Dense layer
   2. Dense layer has 'tanh' as activation function.
   3. tanh function is a sigmoidal (“s”-shaped), with outputs values in range (-1, 1).

In [7]:
model = Sequential()
model.add(Embedding(reviewLength*50, 16, input_length=reviewLength))
model.add(LSTM(16, dropout=0.35, recurrent_dropout=0.35))
#model.add(Dense(8, activation='linear',kernel_regularizer=regularizers.l2(0.5), activity_regularizer=regularizers.l1(0.5)))
#model.add(Dense(4, activation='linear'))
model.add(Dense(1, activation='tanh'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 7, 16)             5600      
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                2112      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 7,729
Trainable params: 7,729
Non-trainable params: 0
_________________________________________________________________


### Train model with validation split 0.25 and 25 epochs

In [8]:
history = model.fit(data, np.array(Y_train),validation_split=0.3, epochs=25 ,verbose = 2,shuffle = True)

#plot_training_curves(history.history);

Train on 809 samples, validate on 347 samples
Epoch 1/25
2s - loss: 2.0434 - acc: 0.3832 - val_loss: 1.2780 - val_acc: 0.4323
Epoch 2/25
0s - loss: 1.2030 - acc: 0.3832 - val_loss: 0.9946 - val_acc: 0.4323
Epoch 3/25
0s - loss: 0.9346 - acc: 0.3857 - val_loss: 0.8005 - val_acc: 0.4323
Epoch 4/25
0s - loss: 0.7549 - acc: 0.4710 - val_loss: 0.7121 - val_acc: 0.4986
Epoch 5/25
0s - loss: 0.6972 - acc: 0.5612 - val_loss: 0.6988 - val_acc: 0.5447
Epoch 6/25
0s - loss: 0.6736 - acc: 0.6119 - val_loss: 0.6872 - val_acc: 0.5648
Epoch 7/25
0s - loss: 0.6538 - acc: 0.6242 - val_loss: 0.6736 - val_acc: 0.5821
Epoch 8/25
0s - loss: 0.6334 - acc: 0.6366 - val_loss: 0.6631 - val_acc: 0.6081
Epoch 9/25
0s - loss: 0.6090 - acc: 0.6848 - val_loss: 0.6490 - val_acc: 0.6340
Epoch 10/25
0s - loss: 0.5802 - acc: 0.7268 - val_loss: 0.6367 - val_acc: 0.6455
Epoch 11/25
0s - loss: 0.5472 - acc: 0.7466 - val_loss: 0.6211 - val_acc: 0.6628
Epoch 12/25
0s - loss: 0.5281 - acc: 0.7701 - val_loss: 0.6077 - val_acc

### Predict sentiment scores for given training data

In [9]:
sequences = tokenizer.texts_to_sequences(X_train)
data = pad_sequences(sequences, maxlen=reviewLength)
predictions = model.predict(data)

### Calculate cosine similarity 

In [10]:
predictedVals = [pr[0] for pr in predictions ]
output = np.array(output).reshape(1,-1)
predictedVals = np.array(predictedVals).reshape(1,-1)


print("Final cosine score :",calculateCosineSimilarity(predictedVals,output))

Final cosine score : 0.693287894111


### Create a dataframe of results and create a csv file

In [11]:
df = pd.DataFrame.from_dict(reviews_train)
df['Predicted Sentiment Score'] = predictedVals[0]
df = df[['id', 'company', 'title','sentiment','Predicted Sentiment Score']]

if os.path.exists("Task2.csv"):
    os.remove("Task2.csv")

df.to_csv('Task2.csv', index = False)

In [12]:
df

Unnamed: 0,id,company,title,sentiment,Predicted Sentiment Score
0,2,Morrisons,Morrisons book second consecutive quarter of s...,0.430,0.464543
1,3,IMI,IMI posts drop in first-quarter organic revenu...,-0.344,-0.255118
2,4,Glencore,Glencore to refinance its short-term debt earl...,0.340,0.978871
3,5,Ryanair,EasyJet attracts more passengers in June but s...,0.259,0.734458
4,6,Barclays,Barclays 'bad bank' chief to step down,-0.231,-0.109057
5,7,BP,Bilfinger Industrial Services win Ã‚Â£100m BP ...,0.113,0.674402
6,8,Bilfinger Industrial Services,Bilfinger Industrial Services win Ã‚Â£100m BP ...,0.424,0.744841
7,9,Barclays,Barclays share price subdued as bank faces fre...,-0.373,-0.099029
8,10,Centrica PLC,Centrica prepared for takeover approach - chai...,0.111,0.957207
9,11,Glencore,Jim Armitage: Spare no tears as Glencore's bos...,-0.231,0.188092


### Tokenize Testing data and predict sentiment scores for test data

In [13]:
sequences = tokenizer.texts_to_sequences(X_test)
data = pad_sequences(sequences, maxlen=reviewLength)
testPredictions = model.predict(data)

### Create a dataframe of results and create a csv file

In [14]:
testData = pd.DataFrame.from_dict(reviews_test)
testData['Predicted Sentiment Score'] = testPredictions
testData = testData[['id', 'company', 'title','Predicted Sentiment Score']]

if os.path.exists("TestPredictions_Task2.csv"):
    os.remove("TestPredictions_Task2.csv")

testData.to_csv("TestPredictions_Task2.csv", index = False)

In [15]:
testData

Unnamed: 0,id,company,title,Predicted Sentiment Score
0,1144,Ashtead,"Ashtead to buy back shares, full-year profit b...",0.999726
1,1145,Shell,EU regulators clear Shell's takeover of BG Group,0.993910
2,1146,Prudential,UK's FTSE has worst day so far in 2015 as BG a...,-0.216989
3,1147,GlaxoSmithKline,GlaxoSmithKline acquires HIV assets,0.930236
4,1148,Barclays,Barclays faces another heavy forex fine,-0.291009
5,1149,Diageo,Diageo Shares Surge on Report of Possible Take...,0.884915
6,1150,Borealis Infrastructure,Borealis Infrastructure putting together new S...,0.775076
7,1151,Burberry Group plc,FTSE 100 falls as China devaluation hits Burbe...,0.821193
8,1152,Deutsche Boerse,London Stock Exchange â€“ Deutsche Boerse merg...,0.307793
9,1153,Tesco PLC,Tesco Abandons Video-Streaming Ambitions in Bl...,0.456814
