In [1]:
import numpy as np
import json
import pandas as pd
from scipy import interpolate
#from gensim.models import KeyedVectors
import pickle
import re

from nltk.corpus import stopwords
from nltk import word_tokenize

from nltk import pos_tag
from string import punctuation,digits
import os
import pickle

from sklearn import preprocessing

import tensorflow as tf

from scipy.interpolate import interp1d

from keras.utils import to_categorical 


Using TensorFlow backend.


In [2]:
def remove_punctuation(s):
    """Return ``s`` with every character of ``string.punctuation`` deleted."""
    # Translation table that maps each punctuation character to "nothing".
    strip_table = str.maketrans('', '', punctuation)
    return s.translate(strip_table)

def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only on first use."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def clean_sentence(sentence):
    """Normalise a piece of text and return it as a single space-joined string.

    Steps, in order:
      1. lower-case the text;
      2. collapse runs of 3+ identical non-word characters to one and runs of
         3+ identical word characters to two;
      3. drop ``http(s)://`` URLs and ``@mentions``;
      4. replace ``#``, ``'s`` and ``-`` with a space;
      5. split on whitespace, strip punctuation from every token, drop English
         stop words, then delete digits and discard tokens that end up empty.

    Stop words are filtered *before* digits are removed, so a token that only
    becomes a stop word once its digits are stripped is kept.
    """
    sentence = sentence.lower()
    sentence = re.sub(r'(\W)\1{2,}', r'\1', sentence)
    sentence = re.sub(r'(\w)\1{2,}', r'\1\1', sentence)
    sentence = re.sub(r'(?P<url>https?://[^\s]+)', r'', sentence)
    sentence = re.sub(r"\@(\w+)", "", sentence)
    sentence = sentence.replace('#',' ')
    sentence = sentence.replace("'s",' ')
    sentence = sentence.replace("-",' ')

    # Loaded once and reused: this function is called for every single word of
    # the dataset, so re-reading the stop-word corpus per call is wasteful.
    stop_words = _english_stop_words()
    remove_digits = str.maketrans('', '', digits)

    cleaned = []
    for raw_token in sentence.split():
        token = remove_punctuation(raw_token)
        if token in stop_words:
            continue
        token = token.translate(remove_digits).strip()
        if token != "":
            cleaned.append(token)
    return ' '.join(cleaned)

In [52]:
def load_google_word2vec(file_name):
    """Load word2vec vectors stored in binary format at ``file_name``.

    Returns whatever ``KeyedVectors.load_word2vec_format(file_name, binary=True)``
    returns.
    """
    # The notebook's top-level gensim import is commented out, so without this
    # local import the call below fails with NameError. Importing here also
    # keeps gensim optional when the cached embedding matrix already exists.
    from gensim.models import KeyedVectors
    return KeyedVectors.load_word2vec_format(file_name, binary=True)


def build_embedding_matrix(vocab_size, embed_dim,tokenizer):
    """Return the embedding matrix for ``tokenizer``'s vocabulary, with an on-disk cache.

    If ``Finance_embedding_matrix_1.dat`` exists in the working directory it is
    unpickled and returned as-is; ``vocab_size``, ``embed_dim`` and ``tokenizer``
    are not checked against it. Otherwise a ``(vocab_size, embed_dim)`` zero
    matrix is created, row ``i`` is filled with the word2vec vector of every
    word in ``tokenizer.word_index`` that the model knows (unknown words keep a
    zero row), and the result is pickled to the cache file.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the matrix built on a cache miss.
    embed_dim : int
        Number of columns of the matrix built on a cache miss.
    tokenizer : object
        Must expose ``word_index`` (word -> row index); only read on a cache miss.
    """
    embedding_matrix_file_name='Finance_embedding_matrix_1.dat'
    if os.path.exists(embedding_matrix_file_name):
        print('loading embedding_matrix:', embedding_matrix_file_name)
        # NOTE: unpickling executes arbitrary code from the file; only load a
        # cache file that this notebook produced itself.
        with open(embedding_matrix_file_name, 'rb') as cache_file:
            return pickle.load(cache_file)

    print('loading word vectors...')
    # Raw string: same value as before, but without invalid escape sequences.
    fname = r'D:\Jupyter notebooks\Word Embeddings\GoogleNews-vectors-negative300.bin'
    model=load_google_word2vec(fname)
    embedding_matrix = np.zeros((vocab_size, embed_dim))
    for word, i in tokenizer.word_index.items():
        try:
            embedding_vector = model[word]
        except KeyError:
            # Word missing from the pretrained model: leave its row at zero.
            continue
        embedding_matrix[i]=embedding_vector

    with open(embedding_matrix_file_name, 'wb') as cache_file:
        pickle.dump(embedding_matrix, cache_file)

    return embedding_matrix



In [91]:

def prepare_data(fname):
    """Read a JSON dataset and flatten it into four parallel lists.

    The file is expected to hold a mapping whose values each provide a
    ``'sentence'`` string and an ``'info'`` list; every ``info`` item provides
    ``'target'``, ``'sentiment_score'`` and ``'aspects'``. One output row is
    produced per ``info`` item, so a sentence with several items is repeated.

    Parameters
    ----------
    fname : str
        Path of the UTF-8 encoded JSON file.

    Returns
    -------
    tuple of lists
        ``(sentences, targets, sentiment_scores, aspects)`` where sentences are
        cleaned word by word with ``clean_sentence`` and whitespace-normalised,
        targets are lower-cased, and scores/aspects are passed through unchanged.
    """
    with open(fname, encoding='utf-8') as f:
        foo = json.load(f)

    sentence_l=[]
    target_l=[]
    aspect_l=[]
    sentiment_l=[]
    for record in foo.values():
        infos = record['info']
        if not infos:
            # No annotations: the record contributes no rows.
            continue

        # The cleaned sentence is identical for every annotation of a record,
        # so it is computed once here rather than once per annotation.
        words = [clean_sentence(x) for x in record['sentence'].split(" ")]
        sentence = re.sub(' +', ' ', ' '.join(words)).strip()

        for info in infos:
            target = info['target'].lower()
            sentiment_score = info['sentiment_score']
            aspect = info['aspects']

            sentence_l.append(sentence)
            target_l.append(target)
            sentiment_l.append(sentiment_score)
            aspect_l.append(aspect)

    return sentence_l,target_l,sentiment_l,aspect_l

print("preparing Finance dataset...")
# Dataset name -> split name -> JSON file, resolved against the working directory.
fname = {
    'finance': {
        'train': 'train_data.json',
        'test': 'test.json',
        'validation_test': 'validation_test.json',
    },
}
# Flattened training split: one entry per annotated target.
sentence, target, sentiment, aspect = prepare_data(fname['finance']['train'])


preparing Finance dataset...


In [92]:
# Load the validation split through the same prepare_data() pipeline as the training split.
v_sentence,v_target,v_sentiment,v_aspect=prepare_data(fname['finance']['validation_test'])

In [93]:
def rescale(series,old_range,new_range):
    """Map every value of ``series`` from ``old_range`` onto ``new_range``.

    The mapping is the interpolant ``interp1d(old_range, new_range)`` built with
    SciPy's default settings. The result is a list of plain Python floats, one
    per input value, in input order.
    """
    to_new_range = interp1d(old_range, new_range)
    rescaled = []
    for value in series:
        rescaled.append(float(to_new_range(value)))
    return rescaled

In [94]:
# Map the sentiment scores of both splits from [-1, 1] onto [0, 1].
# NOTE: each name is overwritten with its rescaled version, so running this
# cell a second time rescales values that were already rescaled.
sentiment = rescale(sentiment,[-1,1],[0,1])
v_sentiment = rescale(v_sentiment,[-1,1],[0,1])


In [95]:
def create_tokenizer(lines):
    """Create a ``Tokenizer`` and fit it on ``lines``; return the fitted tokenizer.

    ``Tokenizer`` is imported in a later cell (``keras.preprocessing.text``),
    so that cell has to run before this function is called.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# Encode a list of lines as padded integer sequences.
def encode_text(tokenizer, lines, length):
    """Turn ``lines`` into index sequences with ``tokenizer`` and pad them to ``length``.

    Padding is added at the end of each sequence (``padding='post'``).
    ``pad_sequences`` is imported in a later cell, which has to run before this
    function is called.
    """
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding='post')

In [96]:
def convert_lables(trainY, num_classes=27):
    """One-hot encode a sequence of labels.

    A new ``LabelEncoder`` is fitted on ``trainY`` on every call, so the column
    assigned to a label depends on the set of labels passed in. Two calls with
    different label sets therefore produce one-hot matrices whose columns do
    not line up, even though both have ``num_classes`` columns.

    Parameters
    ----------
    trainY : sequence
        Labels to encode.
    num_classes : int, optional
        Width of the one-hot matrix. Defaults to 27, the value that used to be
        hard-coded here.

    Returns
    -------
    tuple
        ``(one_hot, classes)``: the result of ``to_categorical`` and the fitted
        encoder's ``classes_``.
    """
    le = preprocessing.LabelEncoder()
    encoded = le.fit_transform(trainY)
    return to_categorical(encoded, num_classes), le.classes_

In [97]:
# Split every training aspect into its level-1 and level-2 parts.
# Each entry is presumably a string such as "['Level1/Level2']" (TODO confirm
# against train_data.json): the surrounding brackets/quotes are stripped and
# the remainder is split on "/".
aspect_level1= []
aspect_level2=[]
for asp in aspect:
    try:
        parts = asp.lstrip("['").rstrip("']").split("/")
        level1, level2 = parts[0], parts[1]
    except (AttributeError, IndexError):
        # Malformed entry: report it and skip it. Both levels are extracted
        # before anything is appended, so the two lists stay the same length.
        # Skipped entries still shift these lists against `sentence`.
        print(asp)
        continue
    aspect_level1.append(level1)
    aspect_level2.append(level2)


In [98]:
def get_level_pairs(l1,l2):
    """Group the values of ``l1`` by the value at the same position in ``l2``.

    Returns a dict whose keys are the values of ``l2`` (in order of first
    appearance) and whose values are lists of the distinct ``l1`` values paired
    with that key. The order inside each list comes from a ``set`` and is
    therefore not guaranteed.
    """
    grouped = {}
    for member, group in zip(l1, l2):
        grouped.setdefault(group, []).append(member)
    # De-duplicate each group.
    return {group: list(set(members)) for group, members in grouped.items()}
# Level-2 aspect -> distinct level-1 aspects it occurs under.
L2_L1_pair = get_level_pairs(aspect_level1,aspect_level2)
# Level-1 aspect -> distinct level-2 aspects seen under it.
L1_L2_pair = get_level_pairs(aspect_level2,aspect_level1)

L1_L2_pair

{'Corporate': ['Technical Analysis',
  'Rumors',
  'Reputation',
  'Legal',
  'Sales',
  'M&A',
  'Regulatory',
  'Financial',
  'Dividend Policy',
  'Strategy',
  'Risks',
  'Company Communication',
  'Appointment'],
 'Economy': ['Central Banks', 'Trade'],
 'Market': ['Currency', 'Volatility', 'Conditions', 'Market'],
 'Stock': ['Technical Analysis',
  'IPO',
  'Coverage',
  'Buyside',
  'Options',
  'Price Action',
  'Fundamentals',
  'Insider Activity',
  'Signal']}

In [99]:
v_aspect[0][0].split('/')


['Stock', 'Coverage']

In [62]:
# Split every validation aspect into its level-1 and level-2 parts.
# Here each entry is a list whose first element is "Level1/Level2".
aspect_level1_v= []
aspect_level2_v=[]
for asp in v_aspect:
    try:
        parts = asp[0].split("/")
        level1, level2 = parts[0], parts[1]
    except (AttributeError, IndexError, TypeError):
        # Malformed entry: report it and skip it. Both levels are extracted
        # before anything is appended, so the two lists stay the same length.
        # Skipped entries still shift these lists against `v_sentence`.
        print(asp)
        continue
    aspect_level1_v.append(level1)
    aspect_level2_v.append(level2)

In [63]:
aspect_level2_v[:10]

['Coverage',
 'Price Action',
 'Price Action',
 'Insider Activity',
 'Insider Activity',
 'Financial',
 'Company Communication',
 'Market',
 'Signal',
 'Financial']

In [64]:
aspect_level2[:10]

['Price Action',
 'Price Action',
 'Price Action',
 'Appointment',
 'Strategy',
 'Price Action',
 'Market',
 'Financial',
 'Price Action',
 'Strategy']

In [65]:
# Number of distinct level-1 and level-2 aspect labels in the training split.
n_label_level_1 = len(set(aspect_level1))
n_label_level_2 = len(set(aspect_level2))
n_label_level_1,n_label_level_2

(4, 27)

In [66]:
# Number of distinct level-1 and level-2 aspect labels in the validation split.
# NOTE(review): in the recorded run this shows 5 level-1 labels against 4 in the
# training split, i.e. validation contains a level-1 label unseen in training.
val_n_label_level_1 = len(set(aspect_level1_v))
val_n_label_level_2 = len(set(aspect_level2_v))
val_n_label_level_1,val_n_label_level_2

(5, 19)

In [67]:
len(aspect_level2)

1173

In [68]:
# One-hot encode the training level-2 aspects; `lable_encoding` holds the class
# names in column order.
trainY,lable_encoding = convert_lables(aspect_level2)

In [69]:
# Encode the validation labels with the class order learned from the TRAINING
# labels (`lable_encoding`). Fitting a second encoder on the validation labels
# assigns different column indices to the same class, so `testY` would not be
# comparable with `trainY`. `lable_encoding` is left as the training classes.
_train_class_index = {label: idx for idx, label in enumerate(lable_encoding)}
testY = np.zeros((len(aspect_level2_v), trainY.shape[1]), dtype=trainY.dtype)
_unseen_labels = set()
for _row, _label in enumerate(aspect_level2_v):
    _col = _train_class_index.get(_label)
    if _col is None:
        # Label never seen in training: its row stays all-zero.
        _unseen_labels.add(_label)
    else:
        testY[_row, _col] = 1.0
if _unseen_labels:
    print('validation labels not present in training data:', sorted(_unseen_labels))

In [100]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# The vocabulary is fitted on the cleaned training sentences only; targets and
# validation text are encoded with this same tokenizer later on.
dataX=sentence
tokenizer = create_tokenizer(dataX)
# One more than the number of known words -- presumably because index 0 is
# reserved for padding (the padded arrays below are filled with 0).
vocab_size = len(tokenizer.word_index) + 1


In [71]:
# First whitespace-separated token of every training target.
first = [full_target.split()[0] for full_target in target]

In [101]:
# Length every encoded sentence is padded to.
max_length=11
# Encoded training sentences, one row per sentence.
i = encode_text(tokenizer, sentence, max_length)
# Encoded first token of each training target (one id per row).
t = encode_text(tokenizer, first, 1)
# Encoded raw aspect entries; `AS` is not referenced again in the visible cells.
AS = encode_text(tokenizer, aspect, 2)
# Encoded full targets (train / validation), padded to 5 ids.
t_total=encode_text(tokenizer, target, 5)
VT_total=encode_text(tokenizer, v_target, 5)


In [102]:
i[10]

array([1478,  292,   38,   48,  445,    7,  293,    0,    0,    0,    0])

In [103]:
# Keep the encoded training sentences under a descriptive name; the short name
# `i` is easy to overwrite (for example by a loop index).
SENTENCE_I=i

In [74]:
t_total[:8]

array([[  14,    0,    0,    0,    0],
       [  84,    0,    0,    0,    0],
       [1461,    0,    0,    0,    0],
       [1462,    0,    0,    0,    0],
       [ 578,    0,    0,    0,    0],
       [ 579,    0,    0,    0,    0],
       [  95,    0,    0,    0,    0],
       [ 847,    0,    0,    0,    0]])

In [75]:
VT_total[:8]

array([[   0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0],
       [1247,    0,    0,    0,    0],
       [ 782,    0,    0,    0,    0],
       [ 110,    0,    0,    0,    0],
       [1238,    0,    0,    0,    0]])

In [76]:
# Force plain Python floats; a no-op when `sentiment` already comes from rescale().
sentiment= [float(x) for x in sentiment]

In [77]:
import numpy as np

# Repeat each target id once per sentence position. `max_length` is used
# instead of a second hard-coded 11 so the two cannot drift apart.
t_=np.tile(t, max_length)
t_[3]

array([1462, 1462, 1462, 1462, 1462, 1462, 1462, 1462, 1462, 1462, 1462])

In [78]:
# First whitespace-separated token of every validation target.
# A comprehension keeps its loop variable local, so the encoded training
# sentences stored in `i` are not overwritten by a loop index.
first_vt = [full_target.split()[0] for full_target in v_target]

In [79]:
v_target[:10]

['dh',
 'shop',
 'plx',
 'coty',
 'ge',
 'ko',
 'glencore',
 'sky',
 'berkshire',
 'glaxo']

In [80]:
first_vt[:10]

['dh',
 'shop',
 'plx',
 'coty',
 'ge',
 'ko',
 'glencore',
 'sky',
 'berkshire',
 'glaxo']

In [81]:
# Encode the validation sentences and first target tokens with the tokenizer
# fitted on the training sentences; words it does not know are not encoded
# (the recorded output below shows 0 for several targets).
v_i=encode_text(tokenizer, v_sentence, max_length)
v_t=encode_text(tokenizer, first_vt, 1)

In [82]:
v_t[:10]

array([[   0],
       [   0],
       [   0],
       [   0],
       [1247],
       [ 782],
       [ 110],
       [1238],
       [ 204],
       [1822]])

In [83]:
# Repeat each validation target id once per sentence position. `max_length` is
# used instead of a second hard-coded 11 so the two cannot drift apart.
v_t_=np.tile(v_t,max_length)

# Force plain Python floats.
v_sentiment= [float(x) for x in v_sentiment]

v_t_[:10]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1247, 1247, 1247, 1247, 1247, 1247, 1247, 1247, 1247, 1247, 1247],
       [ 782,  782,  782,  782,  782,  782,  782,  782,  782,  782,  782],
       [ 110,  110,  110,  110,  110,  110,  110,  110,  110,  110,  110],
       [1238, 1238, 1238, 1238, 1238, 1238, 1238, 1238, 1238, 1238, 1238],
       [ 204,  204,  204,  204,  204,  204,  204,  204,  204,  204,  204],
       [1822, 1822, 1822, 1822, 1822, 1822, 1822, 1822, 1822, 1822, 1822]])

In [84]:
# Build (or load from the on-disk cache) the embedding matrix with 300 columns per word.
embedding_matrix = build_embedding_matrix(vocab_size, 300,tokenizer)

loading embedding_matrix: Finance_embedding_matrix_1.dat


In [85]:
# Number of rows in the embedding matrix (should equal vocab_size).
len(embedding_matrix)

3324

In [86]:
print(vocab_size)

3324


In [87]:
len(trainY[0])

27

In [104]:

# Bundle every prepared array so that training notebooks can load them in one go.
# NOTE(review): 'target' stores the untiled `t` (one id per row) while
# 'v_target' stores the tiled `v_t_`; confirm that downstream code expects
# different shapes for the two splits.
all_data={
    'sentence' : SENTENCE_I,
    'target'   : t,
    'embedding_matrix' :embedding_matrix,
    'train_sentiment': np.array(sentiment),
    'v_sentence' : v_i,
    'v_target'   : v_t_,
    'v_sentiment': np.array(v_sentiment),
    'vocab_size' : vocab_size,
    'target_for_IAN' : t_total,
    'v_target_IAN' : VT_total,
    'aspect': trainY,
    'v_aspect':testY
}

# Write through a context manager so the file is flushed and closed even if
# pickling fails part-way.
with open(r"D:\Sentiment-Analysis\all_data.dat","wb") as all_data_file:
    pickle.dump(all_data, all_data_file)
                

# Making the train, test, and validation_test files alike

In [None]:
'''
with open('train_data.json', encoding='utf-8') as f:
                foo = json.load(f)

                for key in foo.keys():
                    for info in foo[key]['info']:
                        sentence=foo[key]['sentence']
                        #print(info['target']+ " ## "+ sentence)
                        #print(sentence)
                        sentence = [clean_sentence(x) for x in sentence.split(" ")]
                        #print(sentence)
                        
                        sentence=' '.join(sentence)
                        #print(sentence)
                        
                        sentence=' '.join(sentence.split())
                        
                        target= info['target']
                        #print("Target = "+ target)
                        
                         
                        print(info['target'].lower()+ " ## "+ sentence)
                        text_left, _, text_right = [s.lower().strip() for s in sentence.partition(target)]
                        #print( text_left + text_right)
                        text_raw_indices =tokenizer.text_to_sequence(text_left + " " + target + " " + text_right)
                        #print(text_raw_indices)
                        
                        text_raw_without_target_indices = tokenizer.text_to_sequence(text_left + " " + text_right)
                        #print(text_raw_without_target_indices)
                        text_left_indices = tokenizer.text_to_sequence(text_left)
                        #print(text_left_indices)
                        text_left_with_target_indices = tokenizer.text_to_sequence(text_left + " " + target)
                        #print(text_left_with_target_indices)
                        text_right_indices = tokenizer.text_to_sequence(text_right, reverse=True)
                        #print( text_right_indices)
                        text_right_with_target_indices = tokenizer.text_to_sequence(" " + target + " " + text_right, reverse=True)
                        #print(text_right_with_target_indices)
                        target_indices = tokenizer.text_to_sequence(target)
                        #print(target_indices)
                        
                        #if data=='train' :
                        sentiment_score = info['sentiment_score']
                            #print(sentiment_score)
                        aspect= info['aspects']
                            #print("Aspect "+ aspect)
                            #sentiment_score = rescale(sentiment_score,[-1,1],[0,1])
                            #print(sentiment_score)



                        data = {
                                    'text_raw_indices': text_raw_indices,
                                    'text_raw_without_target_indices': text_raw_without_target_indices,
                                    'text_left_indices': text_left_indices,
                                    'text_left_with_target_indices': text_left_with_target_indices,
                                    'text_right_indices': text_right_indices,
                                    'text_right_with_target_indices': text_right_with_target_indices,
                                    'target_indices': target_indices,
                                    
                        }
                        
                        #if data=='train': 
                        data['polarity']= sentiment_score
                        
                       
 '''                       

In [None]:
'''
Combining test_headline and test_post

import json
def load_test_data():
    with open(r'D:\Sentiment-Analysis\test\task1_post_ABSA_test.json', encoding='utf-8') as f1:
        foo1 = json.load(f1)
        with open(r'D:\Sentiment-Analysis\test\task1_headline_ABSA_test.json', encoding='utf-8') as f2:
                foo2= json.load(f2)
                test = {**foo1, **foo2}
                data = json.dumps(test)
                with open(r"D:\Sentiment-Analysis\test.json","w") as f:
                    f.write(data)
                
#load_test_data()
'''



In [None]:
'''
import pandas as pd
import re
import json
def validation_headline():
    df = pd.read_csv(r'D:\Datasets\FinanceHeadlineDataset\gold_standard\test_headlines_samples - Sheet1.tsv', sep='\t')
    df.head()
    d={}

    for index, row in df.iterrows():

        target = re.sub('\d+',"", row["id"])
        target=target.lstrip("_")
        inner_d={}
        inner_d_1={}
        inner_d['info']=[]

        inner_d['sentence']=row['sentence']

        inner_d_1["target"]=target
        inner_d_1["sentiment_score"]=row["sentiment_scores"]
        aspect_h=row["aspect"].split("/")
        a=aspect_h[0]+"/"+aspect_h[1]
        aspect=[]
        aspect.append(a)
        inner_d_1["aspects"]=aspect
        inner_d['info'].append(inner_d_1)

        d[row["id"]]=inner_d
    return d

def validation_posts():
    df = pd.read_csv(r'D:\Datasets\FinanceHeadlineDataset\gold_standard\test_set_post - Sheet1.tsv', sep='\t')
    df.head()
    d={}

    for index, row in df.iterrows():

        target = re.sub('\d+',"", row["id"])
        target=target.lstrip("$")
        inner_d={}
        inner_d_1={}
        inner_d['info']=[]

        inner_d['sentence']=row['sentence']

        inner_d_1["target"]=target
        inner_d_1["sentiment_score"]=row["sentiment_score"]
        a=row["aspect_category_1"]+"/"+row["aspect_category_2"]
        aspect=[]
        aspect.append(a)
        inner_d_1["aspects"]=aspect
        inner_d['info'].append(inner_d_1)

        d[row["id"]]=inner_d
    return d
foo1=validation_headline()
foo2=validation_posts()
validation_test = {**foo1, **foo2}    
data = json.dumps(validation_test)
with open(r"D:\Sentiment-Analysis\validation_test.json","w") as f:
                    f.write(data)
                    
'''