In [1]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

In [54]:

# Load the headline dataset from a CSV in the working directory and preview it.
data = pd.read_csv('global_headlines_df.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,date,articleid,headline,compound_vader_score
0,0,1991-11-14 00:00:00+00:00,wsj_398284048,Banking Bill Negotiators Set Compromise --- Pl...,0.296
1,1,1986-06-16 00:00:00+00:00,wsj_397959018,Manager's Journal: Sniffing Out Drug Abusers I...,-0.7003
2,2,2001-05-24 00:00:00+00:00,wsj_398739166,"Bank of Montreal, Royal Bank Profits Rose in 2...",0.4404
3,3,1986-10-22 00:00:00+00:00,wsj_397957465,Battle Over Medical Costs Isn't Over,-0.3818
4,4,2005-12-08 00:00:00+00:00,wsj_399004010,"Dow Falls 45.95, Late GM Surge Stanches Losses",-0.4019


In [55]:
# Keep only the headline text; every other column is dropped here.
data = data[['headline']]

In [57]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Score every headline with VADER, keeping only the 'compound' value.
cs = [analyzer.polarity_scores(headline)['compound'] for headline in data['headline']]
data['compound_vader_score'] = cs

# Discard headlines whose compound score is exactly 0, then renumber the rows.
has_polarity = data['compound_vader_score'] != 0
data = data[has_polarity].reset_index(drop=True)

data

[nltk_data] Downloading package vader_lexicon to C:\Users\Saurabh
[nltk_data]     Kamal\AppData\Roaming\nltk_data...


Unnamed: 0,headline,compound_vader_score
0,Banking Bill Negotiators Set Compromise --- Pl...,0.2960
1,Manager's Journal: Sniffing Out Drug Abusers I...,-0.7003
2,"Bank of Montreal, Royal Bank Profits Rose in 2...",0.4404
3,Battle Over Medical Costs Isn't Over,-0.3818
4,"Dow Falls 45.95, Late GM Surge Stanches Losses",-0.4019
...,...,...
4841,Stocks Rise for Third Straight Session: Better...,0.7579
4842,"Sawyer Sees Strong Economy For 2 Years, Truce ...",0.5106
4843,Oil's losses are airlines' gains,-0.0772
4844,Full Senate to vote on Bernanke; PANEL ADVANCE...,-0.3612


In [58]:
# Binary label derived from the sign of the compound score:
# '0' for negative scores, '1' for positive scores (stored as strings here).
negative_rows = data['compound_vader_score'] < 0
positive_rows = data['compound_vader_score'] > 0
data.loc[negative_rows, 'sentiment_class'] = '0'
data.loc[positive_rows, 'sentiment_class'] = '1'

In [59]:
# Inspect column dtypes before sentiment_class is cast to an integer type.
data.dtypes

headline                 object
compound_vader_score    float64
sentiment_class          object
dtype: object

In [60]:
# Convert the string labels to 64-bit integers.
data = data.astype({'sentiment_class': 'int64'})

In [61]:
# Inspect column dtypes again after the cast of sentiment_class.
data.dtypes

headline                 object
compound_vader_score    float64
sentiment_class           int64
dtype: object

In [63]:
# The raw VADER score is no longer needed once the class label exists.
data = data.drop(columns=['compound_vader_score'])

In [64]:
# Preview the first rows: headline plus its sentiment_class label.
data.head()

Unnamed: 0,headline,sentiment_class
0,Banking Bill Negotiators Set Compromise --- Pl...,1
1,Manager's Journal: Sniffing Out Drug Abusers I...,0
2,"Bank of Montreal, Royal Bank Profits Rose in 2...",1
3,Battle Over Medical Costs Isn't Over,0
4,"Dow Falls 45.95, Late GM Surge Stanches Losses",0


In [65]:

# List the distinct label values present in sentiment_class.
data.sentiment_class.unique()

array([1, 0], dtype=int64)

In [66]:
# (rows, columns) of the labelled frame.
data.shape

(4846, 2)

In [68]:
# One-hot encode sentiment_class into two parallel lists:
# class 1 -> pos=1, neg=0; class 0 -> pos=0, neg=1. Any other value is skipped.
known_classes = [label for label in data.sentiment_class if label == 0 or label == 1]
pos = [1 if label == 1 else 0 for label in known_classes]
neg = [1 - flag for flag in pos]

In [69]:
# Attach the one-hot indicator lists as the 'Pos' and 'Neg' columns.
data = data.assign(Pos=pos, Neg=neg)

In [70]:
# Preview the frame with the new Pos / Neg indicator columns.
data.head()

Unnamed: 0,headline,sentiment_class,Pos,Neg
0,Banking Bill Negotiators Set Compromise --- Pl...,1,1,0
1,Manager's Journal: Sniffing Out Drug Abusers I...,0,0,1
2,"Bank of Montreal, Royal Bank Profits Rose in 2...",1,1,0
3,Battle Over Medical Costs Isn't Over,0,0,1
4,"Dow Falls 45.95, Late GM Surge Stanches Losses",0,0,1


In [71]:
def remove_punct(headline):
    """Return ``headline`` with every ASCII punctuation character removed.

    Args:
        headline: text to clean (a ``str``).

    Returns:
        A copy of ``headline`` without any character from ``string.punctuation``.
    """
    # string.punctuation contains characters that are special inside a regex
    # character class (backslash, ']', '^', '-'). Escaping them makes each one
    # match literally; unescaped, the sequence '\]' is read as an escaped
    # bracket and the backslash itself is never stripped.
    punctuation_class = '[' + re.escape(string.punctuation) + ']'
    return re.sub(punctuation_class, '', headline)

# Punctuation-free copy of each headline.
data['Text_Clean'] = data['headline'].apply(remove_punct)

In [72]:
from nltk import word_tokenize, WordNetLemmatizer
# Split every cleaned headline into a list of word tokens.
tokens = list(map(word_tokenize, data.Text_Clean))

In [73]:
def lower_token(tokens):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered
    
# Lower-case the tokens of every headline.
lower_tokens = list(map(lower_token, tokens))

In [74]:
from nltk.corpus import stopwords
# English stop-word list from NLTK; used by remove_stop_words to filter tokens.
stoplist = stopwords.words('english')

In [75]:
def remove_stop_words(tokens, stop_words=None):
    """Return the tokens that are not stop words, preserving their order.

    Args:
        tokens: iterable of word strings.
        stop_words: optional iterable of words to drop. When omitted, the
            notebook-level ``stoplist`` is used, so existing one-argument
            calls behave as before.

    Returns:
        A list of the tokens that do not appear in the stop-word collection.
    """
    if stop_words is None:
        stop_words = stoplist
    # Set membership is O(1) per token; scanning a list is O(len(list)).
    excluded = frozenset(stop_words)
    return [word for word in tokens if word not in excluded]

In [76]:
# Strip stop words from each lower-cased token list.
filtered_words = list(map(remove_stop_words, lower_tokens))

In [77]:
# Re-assemble each filtered token list into a single space-separated string.
result = list(map(' '.join, filtered_words))

In [78]:
# Cleaned headline text (lower-cased, punctuation and stop words removed).
data['Text_Final'] = result

In [79]:
# Same cleaned text, kept as a list of tokens per headline.
data['tokens'] = filtered_words

In [81]:
# Keep only the cleaned text, its tokens and the label columns.
data = data[['Text_Final', 'tokens', 'sentiment_class', 'Pos', 'Neg']]

In [82]:
# Show the first four rows of the cleaned frame.
data[:4]

Unnamed: 0,Text_Final,tokens,sentiment_class,Pos,Neg
0,banking bill negotiators set compromise plan w...,"[banking, bill, negotiators, set, compromise, ...",1,1,0
1,managers journal sniffing drug abusers quick fix,"[managers, journal, sniffing, drug, abusers, q...",0,0,1
2,bank montreal royal bank profits rose 2nd period,"[bank, montreal, royal, bank, profits, rose, 2...",1,1,0
3,battle medical costs isnt,"[battle, medical, costs, isnt]",0,0,1


In [83]:


# Hold out 10% of the rows for testing; random_state fixes the shuffle.
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)



In [84]:
# Display the training partition.
data_train

Unnamed: 0,Text_Final,tokens,sentiment_class,Pos,Neg
2873,group 20 seeks curbs bonuses caps us britain r...,"[group, 20, seeks, curbs, bonuses, caps, us, b...",1,1,0
2665,union head hits steel profits mcdonald takes u...,"[union, head, hits, steel, profits, mcdonald, ...",1,1,0
3801,market rally continues lively turnover pure oi...,"[market, rally, continues, lively, turnover, p...",1,1,0
1743,iraq crisis shows uss nonpareil power yet expo...,"[iraq, crisis, shows, uss, nonpareil, power, y...",0,0,1
1170,congress hoping influence future policy pushes...,"[congress, hoping, influence, future, policy, ...",1,1,0
...,...,...,...,...,...
4426,dc area sls hit soaring interest rates,"[dc, area, sls, hit, soaring, interest, rates]",1,1,0
466,stock prices slip dull session reflecting soft...,"[stock, prices, slip, dull, session, reflectin...",0,0,1
3092,heavy trading pushes stock averages higher p c...,"[heavy, trading, pushes, stock, averages, high...",1,1,0
3772,district line one weapon hasnt used yet,"[district, line, one, weapon, hasnt, used, yet]",0,0,1


In [85]:
# Vocabulary statistics for the training partition.
train_token_lists = data_train["tokens"]
all_training_words = [word for sentence in train_token_lists for word in sentence]
training_sentence_lengths = [len(sentence) for sentence in train_token_lists]
TRAINING_VOCAB = sorted(set(all_training_words))

n_train_words = len(all_training_words)
n_train_vocab = len(TRAINING_VOCAB)
print("%s words total, with a vocabulary size of %s" % (n_train_words, n_train_vocab))
print("Max sentence length is %s" % max(training_sentence_lengths))

39604 words total, with a vocabulary size of 8145
Max sentence length is 38


In [86]:
# Vocabulary statistics for the test partition.
test_token_lists = data_test["tokens"]
all_test_words = [word for sentence in test_token_lists for word in sentence]
test_sentence_lengths = [len(sentence) for sentence in test_token_lists]
TEST_VOCAB = sorted(set(all_test_words))

n_test_words = len(all_test_words)
n_test_vocab = len(TEST_VOCAB)
print("%s words total, with a vocabulary size of %s" % (n_test_words, n_test_vocab))
print("Max sentence length is %s" % max(test_sentence_lengths))

4401 words total, with a vocabulary size of 2180
Max sentence length is 29


In [87]:
# Path to the pre-trained word2vec vectors (binary word2vec format), expected
# in the working directory.
word2vec_path = 'GoogleNews-vectors-negative300-SLIM.bin'
                 
# Load the vectors through gensim's KeyedVectors loader.
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [88]:


def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embedding vectors of the tokens in ``tokens_list``.

    Args:
        tokens_list: sequence of tokens for one document.
        vector: mapping-like object supporting ``token in vector`` and
            ``vector[token]``.
        generate_missing: if True, tokens absent from ``vector`` get a fresh
            ``np.random.rand(k)`` vector; otherwise they contribute zeros.
        k: dimensionality used for the zero / random fallback vectors.

    Returns:
        The element-wise mean of the per-token vectors, or ``np.zeros(k)``
        when ``tokens_list`` is empty.
    """
    token_count = len(tokens_list)
    if token_count < 1:
        return np.zeros(k)

    # Choose how out-of-vocabulary tokens are represented.
    if generate_missing:
        fallback = lambda: np.random.rand(k)
    else:
        fallback = lambda: np.zeros(k)

    word_vectors = []
    for token in tokens_list:
        word_vectors.append(vector[token] if token in vector else fallback())

    return np.sum(word_vectors, axis=0) / token_count

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged embedding per row of ``clean_comments``.

    Args:
        vectors: embedding lookup passed through to ``get_average_word2vec``.
        clean_comments: frame with a ``'tokens'`` column of token lists.
        generate_missing: forwarded to ``get_average_word2vec``.

    Returns:
        A list with the averaged vector of each row, in row order.
    """
    def embed(token_list):
        return get_average_word2vec(token_list, vectors, generate_missing=generate_missing)

    per_row = clean_comments['tokens'].apply(embed)
    return list(per_row)



In [89]:
# Averaged word2vec vector per training headline; tokens missing from word2vec
# get random vectors (generate_missing=True).
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [90]:
# Fixed length of the padded token-id sequences fed to the network.
MAX_SEQUENCE_LENGTH = 50
# Width of each word vector in the embedding matrix.
EMBEDDING_DIM = 300

In [91]:
# Keras' Tokenizer keeps only words whose index is below num_words (the
# num_words - 1 most frequent ones, since index 0 is reserved). Passing
# len(TRAINING_VOCAB) + 1 ensures no training word is dropped.
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB) + 1, lower=True, char_level=False)

# Fit on the training text only, then encode it as integer sequences.
training_texts = data_train["Text_Final"].tolist()
tokenizer.fit_on_texts(training_texts)
training_sequences = tokenizer.texts_to_sequences(training_texts)

# word -> integer index mapping learned from the training text.
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 8144 unique tokens.


In [92]:
# Bring every training sequence to length MAX_SEQUENCE_LENGTH.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [93]:


# Embedding matrix with one row per tokenizer index; row 0 stays all zeros
# because word_index numbering starts at 1.
train_embedding_weights = np.zeros((len(train_word_index) + 1, EMBEDDING_DIM))

# Local, seeded generator so that rows for words missing from word2vec are the
# same on every run instead of depending on the global NumPy random state.
oov_rng = np.random.RandomState(42)

for word, index in train_word_index.items():
    if word in word2vec:
        train_embedding_weights[index, :] = word2vec[word]
    else:
        # Word unknown to word2vec: uniform random vector in [0, 1).
        train_embedding_weights[index, :] = oov_rng.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)



(8145, 300)


In [94]:
# Encode the test text with the tokenizer fitted on the training text, then
# bring each sequence to length MAX_SEQUENCE_LENGTH.
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [95]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build, compile and summarise a multi-kernel 1-D CNN text classifier.

    Token ids are looked up in a frozen embedding layer initialised from
    ``embeddings``. Five parallel Conv1D branches (kernel sizes 2-6, 200
    filters each) are global-max-pooled, concatenated and passed through a
    dropout / dense head ending in ``labels_index`` sigmoid units. The model
    is compiled with binary cross-entropy, Adam and accuracy.

    Args:
        embeddings: weight matrix used to initialise the embedding layer.
        max_sequence_length: length of each input sequence of token ids.
        num_words: input dimension (number of rows) of the embedding layer.
        embedding_dim: width of each embedding vector.
        labels_index: number of output units.

    Returns:
        The compiled Keras ``Model``; ``model.summary()`` is printed as a
        side effect.
    """
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')

    # trainable=False keeps the supplied embedding vectors fixed during training.
    embedded_sequences = Embedding(
        num_words,
        embedding_dim,
        weights=[embeddings],
        input_length=max_sequence_length,
        trainable=False,
    )(sequence_input)

    def conv_branch(kernel_size):
        # Convolve over windows of `kernel_size` tokens, then keep each
        # filter's maximum activation across the sequence.
        feature_maps = Conv1D(filters=200, kernel_size=kernel_size, activation='relu')(embedded_sequences)
        return GlobalMaxPooling1D()(feature_maps)

    pooled_branches = [conv_branch(kernel_size) for kernel_size in (2, 3, 4, 5, 6)]
    merged = concatenate(pooled_branches, axis=1)

    hidden = Dropout(0.1)(merged)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    preds = Dense(labels_index, activation='sigmoid')(hidden)

    model = Model(sequence_input, preds)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['acc'],
    )
    model.summary()
    return model

In [96]:
# Target columns, in the order of the network's output units.
label_names = ['Pos', 'Neg']

In [97]:


# Training targets as an array with one column per entry of label_names.
y_train = data_train[label_names].values



In [98]:
# Aliases for the padded training inputs and their targets, as used by model.fit.
x_train = train_cnn_data
y_tr = y_train

In [99]:
# Instantiate the CNN; the embedding layer needs one row per tokenizer index
# plus the reserved index 0, and one output unit per label column.
model = ConvNet(
    embeddings=train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index) + 1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=len(label_names),
)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 50, 300)      2443500     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 49, 200)      120200      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 48, 200)      180200      embedding[0][0]                  
______________________________________________________________________________________________

In [100]:
# Training hyperparameters passed to model.fit.
num_epochs = 3
batch_size = 34

In [101]:
# Train the network, reserving 10% of the training data for validation;
# the returned history object is kept in `hist`.
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [102]:


# Per-label scores for every padded test sequence.
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)





In [103]:
# sentiment_class value for each output column, in label_names order
# ('Pos' -> 1, 'Neg' -> 0).
labels = [1, 0]

In [104]:
# Pick, for each test row, the class whose output unit scored highest.
prediction_labels = [labels[np.argmax(scores)] for scores in predictions]

In [106]:
# Test accuracy: fraction of rows whose predicted class equals sentiment_class.
is_correct = data_test.sentiment_class == prediction_labels
is_correct.sum() / len(prediction_labels)

0.8206185567010309

In [107]:
# Class distribution of the test partition, for context on the accuracy above.
data_test.sentiment_class.value_counts()

1    256
0    229
Name: sentiment_class, dtype: int64