## Task 1: Sentiment analysis of News Statements & Headlines

# import necessary libraries

In [1]:
import json
import math
import string
import os
from pprint import pprint
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as pltfrom
from sklearn.metrics.pairwise import cosine_similarity

from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.corpus import stopwords 
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

Using TensorFlow backend.


{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

In [2]:
#if already downloaded this step is not required
#nltk.download()

In [3]:
stop = stopwords.words('english')
def removeStopWords(inputString):
    splitString = inputString.split()
    for word in stop:
        for s in splitString:
            if s == word:
                splitString.remove(s)
    return " ".join(splitString)



def stringClean( inputString ) :
    inputString = removeStopWords(inputString)
    return inputString.lower().translate(str.maketrans('','',string.punctuation)).encode("ascii", errors="ignore").decode()


def maxWords( x_train ) :
    reviewLength = 0
    for x in x_train:
        if reviewLength < len(x.split()) :
            reviewLength = len(x.split())
    return reviewLength

# Load dataset from json file and extract X_train and Y_train data

In [4]:
with open("Headline_Trainingdata.json") as f:
    reviews_train = json.load(f)

with open("Headline_Trialdata.json") as f:
    reviews_trial = json.load(f)

with open("Headlines_Testdata.json", encoding="utf8") as f:
    reviews_test = json.load(f)

reviewLength = 0

X_train = []
X_test = [review['title'] for review in reviews_test]
output = []
Y_train = []

for review in reviews_train :
    X_train.append(stringClean(review['title'].replace(review['company'], "")))
    Y_train.append(0 if review['sentiment']<0 else 1)
    output.append(review['sentiment'])
    
for review in reviews_trial :
    reviews_train.append(review)
    X_train.append(stringClean(review['title'].replace(review['company'], "")))
    Y_train.append(0 if review['sentiment']<0 else 1)
    output.append(review['sentiment'])
    

In [5]:
reviewLength = maxWords(X_train)
#print("MAx Length of title:",reviewLength)
#print(X_train)

# Tokenize the data annd select 15,00 words

In [6]:
tokenizer = Tokenizer(num_words=reviewLength*20)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
data = pad_sequences(sequences, maxlen=(reviewLength))

# build model

In [7]:
model = Sequential()
model.add(Embedding(reviewLength*20, 16, input_length=reviewLength))
model.add(LSTM(16, dropout=0.35, recurrent_dropout=0.35))
#model.add(Dense(8, activation='linear',kernel_regularizer=regularizers.l2(0.5), activity_regularizer=regularizers.l1(0.5)))
#model.add(Dense(4, activation='linear'))
model.add(Dense(1, activation='tanh'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 14, 16)            4480      
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                2112      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 6,609
Trainable params: 6,609
Non-trainable params: 0
_________________________________________________________________


# train model with training and trial data

In [8]:


history = model.fit(data, np.array(Y_train),validation_split=0.4, epochs=35,verbose = 2,shuffle = True)

#plot_training_curves(history.history);

Train on 693 samples, validate on 463 samples
Epoch 1/35
1s - loss: 2.3686 - acc: 0.3867 - val_loss: 1.2285 - val_acc: 0.4147
Epoch 2/35
0s - loss: 1.0917 - acc: 0.3867 - val_loss: 0.8673 - val_acc: 0.4147
Epoch 3/35
0s - loss: 0.7942 - acc: 0.4488 - val_loss: 0.6933 - val_acc: 0.5378
Epoch 4/35
0s - loss: 0.7190 - acc: 0.5354 - val_loss: 0.6860 - val_acc: 0.5875
Epoch 5/35
0s - loss: 0.7060 - acc: 0.5772 - val_loss: 0.6812 - val_acc: 0.5983
Epoch 6/35
0s - loss: 0.7069 - acc: 0.5267 - val_loss: 0.6795 - val_acc: 0.5853
Epoch 7/35
0s - loss: 0.6797 - acc: 0.5786 - val_loss: 0.6731 - val_acc: 0.6048
Epoch 8/35
0s - loss: 0.6844 - acc: 0.5714 - val_loss: 0.6686 - val_acc: 0.6242
Epoch 9/35
0s - loss: 0.6445 - acc: 0.6075 - val_loss: 0.6644 - val_acc: 0.6177
Epoch 10/35
0s - loss: 0.6435 - acc: 0.6320 - val_loss: 0.6586 - val_acc: 0.6263
Epoch 11/35
0s - loss: 0.6546 - acc: 0.6162 - val_loss: 0.6529 - val_acc: 0.6285
Epoch 12/35
0s - loss: 0.6098 - acc: 0.6609 - val_loss: 0.6477 - val_acc

# Predict sentiment score for given data

In [9]:
sequences = tokenizer.texts_to_sequences(X_train)
data = pad_sequences(sequences, maxlen=reviewLength)
predictions = model.predict(data)

# Calculate cosine similarity

In [10]:
predictedVals = [pr[0] for pr in predictions ]

output = np.array(output).reshape(1,-1)
predictedVals = np.array(predictedVals).reshape(1,-1)

similarity = cosine_similarity(output,predictedVals) 



print("Cosine Similarity Score of the model : ",similarity[0][0])

Cosine Similarity Score of the model :  0.368691355317


# Convert results to a csv file

In [11]:
df = pd.DataFrame.from_dict(reviews_train)
df['Predicted Sentiment Score'] = predictedVals[0]
df = df[['id', 'company', 'title','sentiment','Predicted Sentiment Score']]

if os.path.exists("Task2.csv"):
    os.remove("Task2.csv")

df.to_csv('Task2.csv', index = False)

In [12]:
df

Unnamed: 0,id,company,title,sentiment,Predicted Sentiment Score
0,2,Morrisons,Morrisons book second consecutive quarter of s...,0.430,0.741555
1,3,IMI,IMI posts drop in first-quarter organic revenu...,-0.344,-0.416243
2,4,Glencore,Glencore to refinance its short-term debt earl...,0.340,0.984908
3,5,Ryanair,EasyJet attracts more passengers in June but s...,0.259,0.684528
4,6,Barclays,Barclays 'bad bank' chief to step down,-0.231,0.459851
5,7,BP,Bilfinger Industrial Services win Ã‚Â£100m BP ...,0.113,0.827383
6,8,Bilfinger Industrial Services,Bilfinger Industrial Services win Ã‚Â£100m BP ...,0.424,0.918498
7,9,Barclays,Barclays share price subdued as bank faces fre...,-0.373,-0.327760
8,10,Centrica PLC,Centrica prepared for takeover approach - chai...,0.111,0.970765
9,11,Glencore,Jim Armitage: Spare no tears as Glencore's bos...,-0.231,0.308595


In [16]:

Pnorm = np.linalg.norm(predictedVals)
Gnorm = np.linalg.norm(output)
wt = Pnorm/Gnorm
print("PNorm::",Pnorm)
print("GNorm::",Gnorm)
print("Cosine Weight:",wt)
print("Final Cosine:", wt * similarity[0][0])

PNorm:: 24.1511
GNorm:: 13.607812315
Cosine Weight: 1.77479976658
Final Cosine: 0.654353331358
