In [2]:
import warnings
warnings.filterwarnings('ignore')

import random
import time
import multiprocessing as mp
import numpy as np

import mxnet as mx
from mxnet import nd, gluon, autograd

import gluonnlp as nlp

random.seed(123)
np.random.seed(123)
mx.random.seed(123)

In [3]:
class MeanPoolingLayer(gluon.HybridBlock):
    """A block for mean pooling of encoder features"""
    def __init__(self, prefix=None, params=None):
        super(MeanPoolingLayer, self).__init__(prefix=prefix, params=params)

    def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
        """Forward logic"""
        # Data will have shape (T, N, C)
        masked_encoded = F.SequenceMask(data,
                                        sequence_length=valid_length,
                                        use_sequence_length=True)
        agg_state = F.broadcast_div(F.sum(masked_encoded, axis=0),
                                    F.expand_dims(valid_length, axis=1))
        return agg_state


class SentimentNet(gluon.HybridBlock):
    """Network for sentiment analysis."""
    def __init__(self, dropout, prefix=None, params=None):
        super(SentimentNet, self).__init__(prefix=prefix, params=params)
        with self.name_scope():
            self.embedding = None # will set with lm embedding later
            self.encoder = None # will set with lm encoder later
            self.agg_layer = MeanPoolingLayer()
            self.output = gluon.nn.HybridSequential()
            with self.output.name_scope():
                self.output.add(gluon.nn.Dropout(dropout))
                self.output.add(gluon.nn.Dense(1, flatten=False))

    def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
        encoded = self.encoder(self.embedding(data))  # Shape(T, N, C)
        agg_state = self.agg_layer(encoded, valid_length)
        out = self.output(agg_state)
        return out

In [4]:
dropout = 0
language_model_name = 'standard_lstm_lm_200'
pretrained = True
learning_rate, batch_size = 0.005, 32
bucket_num, bucket_ratio = 10, 0.2
epochs = 1
grad_clip = None
log_interval = 100

In [5]:
context = mx.cpu(0)

In [6]:
lm_model, vocab = nlp.model.get_model(name=language_model_name,
                                      dataset_name='wikitext-2',
                                      pretrained=pretrained,
                                      ctx=context,
                                      dropout=dropout)

Vocab file is not found. Downloading.
Downloading /Users/w08459/.mxnet/models/1563979745.4440992wikitext-2-be36dc52.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/wikitext-2-be36dc52.zip...
Downloading /Users/w08459/.mxnet/models/standard_lstm_lm_200_wikitext-2-b233c700.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/standard_lstm_lm_200_wikitext-2-b233c700.zip...


In [7]:
net = SentimentNet(dropout=dropout)
net.embedding = lm_model.embedding
net.encoder = lm_model.encoder
net.hybridize()
net.output.initialize(mx.init.Xavier(), ctx=context)
print(net)

SentimentNet(
  (embedding): HybridSequential(
    (0): Embedding(33278 -> 200, float32)
  )
  (encoder): LSTM(200 -> 200, TNC, num_layers=2)
  (agg_layer): MeanPoolingLayer(
  
  )
  (output): HybridSequential(
    (0): Dropout(p = 0, axes=())
    (1): Dense(None -> 1, linear)
  )
)


In [14]:
# The tokenizer takes as input a string and outputs a list of tokens.
tokenizer = nlp.data.SpacyTokenizer('en')

# `length_clip` takes as input a list and outputs a list with maximum length 500.
length_clip = nlp.data.ClipSequence(500)

# Helper function to preprocess a single data point
def preprocess(x):
    data, label = x
    label = int(label > 5)
    # A token index or a list of token indices is
    # returned according to the vocabulary.
    data = vocab[length_clip(tokenizer(data))]
    return data, label

# Helper function for getting the length
def get_length(x):
    return float(len(x[0]))

# Loading the dataset
train_dataset, test_dataset = [nlp.data.IMDB(root='data/imdb', segment=segment)
                               for segment in ('train', 'test')]
print('Tokenize using spaCy...')

Downloading data/imdb/train.json from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/imdb/train.json...
Downloading data/imdb/test.json from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/imdb/test.json...
Tokenize using spaCy...


In [15]:
def preprocess_dataset(dataset):
    start = time.time()
    with mp.Pool() as pool:
        # Each sample is processed in an asynchronous manner.
        dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
        lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
    end = time.time()
    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

# Doing the actual pre-processing of the dataset
train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
test_dataset, test_data_lengths = preprocess_dataset(test_dataset)

Done! Tokenizing Time=16.95s, #Sentences=25000
Done! Tokenizing Time=17.71s, #Sentences=25000


In [16]:
# Construct the DataLoader

def get_dataloader():

    # Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0, ret_length=True),
        nlp.data.batchify.Stack(dtype='float32'))
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        train_data_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(batch_sampler.stats())

    # Construct a DataLoader object for both the training and test data
    train_dataloader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)
    test_dataloader = gluon.data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=batchify_fn)
    return train_dataloader, test_dataloader

# Use the pre-defined function to make the retrieval of the DataLoader objects simple
train_dataloader, test_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=25000, batch_num=779
  key=[59, 108, 157, 206, 255, 304, 353, 402, 451, 500]
  cnt=[591, 1999, 5092, 5108, 3035, 2084, 1476, 1164, 871, 3580]
  batch_size=[54, 32, 32, 32, 32, 32, 32, 32, 32, 32]


In [17]:
def evaluate(net, dataloader, context):
    loss = gluon.loss.SigmoidBCELoss()
    total_L = 0.0
    total_sample_num = 0
    total_correct_num = 0
    start_log_interval_time = time.time()

    print('Begin Testing...')
    for i, ((data, valid_length), label) in enumerate(dataloader):
        data = mx.nd.transpose(data.as_in_context(context))
        valid_length = valid_length.as_in_context(context).astype(np.float32)
        label = label.as_in_context(context)
        output = net(data, valid_length)

        L = loss(output, label)
        pred = (output > 0.5).reshape(-1)
        total_L += L.sum().asscalar()
        total_sample_num += label.shape[0]
        total_correct_num += (pred == label).sum().asscalar()

        if (i + 1) % log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, len(dataloader),
                time.time() - start_log_interval_time))
            start_log_interval_time = time.time()

    avg_L = total_L / float(total_sample_num)
    acc = total_correct_num / float(total_sample_num)

    return avg_L, acc

In [18]:
def train(net, context, epochs):
    trainer = gluon.Trainer(net.collect_params(), 'ftml',
                            {'learning_rate': learning_rate})
    loss = gluon.loss.SigmoidBCELoss()

    parameters = net.collect_params().values()

    # Training/Testing
    for epoch in range(epochs):
        # Epoch training stats
        start_epoch_time = time.time()
        epoch_L = 0.0
        epoch_sent_num = 0
        epoch_wc = 0
        # Log interval training stats
        start_log_interval_time = time.time()
        log_interval_wc = 0
        log_interval_sent_num = 0
        log_interval_L = 0.0

        for i, ((data, length), label) in enumerate(train_dataloader):
            L = 0
            wc = length.sum().asscalar()
            log_interval_wc += wc
            epoch_wc += wc
            log_interval_sent_num += data.shape[1]
            epoch_sent_num += data.shape[1]
            with autograd.record():
                output = net(data.as_in_context(context).T,
                             length.as_in_context(context)
                                   .astype(np.float32))
                L = L + loss(output, label.as_in_context(context)).mean()
            L.backward()
            # Clip gradient
            if grad_clip:
                gluon.utils.clip_global_norm(
                    [p.grad(context) for p in parameters],
                    grad_clip)
            # Update parameter
            trainer.step(1)
            log_interval_L += L.asscalar()
            epoch_L += L.asscalar()
            if (i + 1) % log_interval == 0:
                print(
                    '[Epoch {} Batch {}/{}] elapsed {:.2f} s, '
                    'avg loss {:.6f}, throughput {:.2f}K wps'.format(
                        epoch, i + 1, len(train_dataloader),
                        time.time() - start_log_interval_time,
                        log_interval_L / log_interval_sent_num, log_interval_wc
                        / 1000 / (time.time() - start_log_interval_time)))
                # Clear log interval training stats
                start_log_interval_time = time.time()
                log_interval_wc = 0
                log_interval_sent_num = 0
                log_interval_L = 0
        end_epoch_time = time.time()
        test_avg_L, test_acc = evaluate(net, test_dataloader, context)
        print('[Epoch {}] train avg loss {:.6f}, test acc {:.2f}, '
              'test avg loss {:.6f}, throughput {:.2f}K wps'.format(
                  epoch, epoch_L / epoch_sent_num, test_acc, test_avg_L,
                  epoch_wc / 1000 / (end_epoch_time - start_epoch_time)))

In [19]:
train(net, context, epochs)

[Epoch 0 Batch 100/779] elapsed 102.58 s, avg loss 0.002341, throughput 7.85K wps
[Epoch 0 Batch 200/779] elapsed 102.01 s, avg loss 0.001808, throughput 7.63K wps
[Epoch 0 Batch 300/779] elapsed 93.81 s, avg loss 0.001447, throughput 9.10K wps
[Epoch 0 Batch 400/779] elapsed 86.68 s, avg loss 0.001342, throughput 9.19K wps
[Epoch 0 Batch 500/779] elapsed 79.96 s, avg loss 0.001392, throughput 9.24K wps
[Epoch 0 Batch 600/779] elapsed 87.66 s, avg loss 0.001153, throughput 8.90K wps
[Epoch 0 Batch 700/779] elapsed 91.36 s, avg loss 0.001224, throughput 8.42K wps
Begin Testing...
[Batch 100/782] elapsed 86.68 s
[Batch 200/782] elapsed 87.01 s
[Batch 300/782] elapsed 82.55 s
[Batch 400/782] elapsed 82.23 s
[Batch 500/782] elapsed 82.32 s
[Batch 600/782] elapsed 81.84 s
[Batch 700/782] elapsed 82.59 s
[Epoch 0] train avg loss 0.001494, test acc 0.86, test avg loss 0.306831, throughput 8.56K wps


In [73]:
file_name = "training_w_imdb"
net.save_parameters(file_name)

In [20]:
q = ['The', 'Wall', 'Street', 'bank', 'is', 'moving', 'away', 'from', 'high-risk', 'businesses', 
     'like', 'trading', 'that', 'were', 'once', 'lucrative', 'but', 'have', 'since', 'slowed,', 
     'and', 'towards', 'more', 'stable', 'areas', 'like', 'consumer', 'lending']

In [24]:
q=['I', 'like', 'big', 'butts', 'and', 'I', 'cannot', 'lie']

In [26]:
q = ['Your', 'mother', 'was', 'a', 'hampster', 'and', 'your', 'father', 'smells', 'of', 'elderberries']

In [112]:
q = ['Great', 'for', 'reinforcing', 'your', 'liberal', 'knee-jerks', 'if', 'you''re', 'out', 'of', 'practice.', 
     'Otherwise,', 'stick', 'to', 'Law', '&', 'Order.', 'Has', 'little', 'to', 'do', 'with', 'real', 'people', 'or', 
     'real', 'life.', 'That''s', 'why', 'the', 'critics', 'love', 'it.', 'The', 'characters', 'are', 'about', 'as',
     'real', 'as', 'the', 'clean,', 'well-groomed', 'bikers', 'in', 'the', 'afternoon', 'soaps.', 'David', 
     'Simon''s', 'concept', 'of', 'sociopolitical', 'seems', 'to', 'be', 'beating', 'liberal', 'drums,', 'and', 
     'not', 'much', 'else.', 'It', 'takes', 'more', 'than', 'a', 'little', 'dirt', 'and', 'a', 'few', 'scars', 
     'to', 'portray', 'real', 'criminals', 'and', 'victims.', 'It', 'takes', 'going', 'beyond', 'the', 'typical', 
     'liberals', 'vs.', 'bad', 'guys', 'There', 'are', 'world', 'views', 'that', 'encompass', 'those', 'beyond', 
     'Planned', 'Parenthood,', 'Al', 'Sharpton', 'and', 'the', 'NAACP,', 'but', 'you', 'have', 'to', 'turn', 'the', 
     'channel', 'to', 'find', 'them.']

In [114]:
q = ['Yeh', 'this', 'show', 'is', 'just', 'plane', 'out', 'bad.', 'I', 'really', 'wanted', 'to', 'like', 'like', 'this', 'but', 'I', 'mean', 'there', 'was', 'nothing', 'at', 'all', 'good', 'about.', 'The', 'characters', 'were', 'just', 'bad', 'and', 'uninteresting.', 'Then', 'the', 'writing', 'was', 'done', 'by', 'people', 'who', 'may', 'have', 'known', 'about', 'the', 'incity', 'but', 'nothing', 'about', 'plot.', 'There', 'is', 'a', 'reason', 'this', 'show', 'wasn''t', 'Nominated', 'for', 'anything', 'it', 'sucked.', 'I', 'mean', 'the', 'theams', 'are', 'good', 'but', 'so', 'are', 'the', 'theams', 'in', 'The', 'Phantom', 'Menace', 'wich', 'doesn''t', 'make', 'it', 'a', 'good', 'movie.']

In [116]:
q = ['Hate', 'to', 'be', 'rude', 'but', 'don''t', 'pay', 'attention', 'to', 'the', 'moronic', 'post', 'below.', 'That', 'was', 'some', 'of', 'the', 'most', 'lame', 'criticism', 'I', 'have', 'ever', 'come', 'across', 'on', 'this', 'site.', 'I', 'doubt', 'the', 'guy', 'even', 'watched', 'the', 'entire', 'first', 'season.', 'This', 'show', 'is', 'the', 'best', 'thing', 'going', 'on', 'TV.', 'Writing.', 'Direction.', 'Acting.', 'Its', 'all', 'perfection.', 'The', 'people', 'behind', 'the', 'show', 'are', 'former', 'journalists', 'and', 'police', 'officers', 'who', 'were', 'covering', 'crime', 'in', 'Baltimore', 'or', 'working', 'the', 'beat', 'as', 'cops', 'for', 'over', '20', 'years.', 'They', 'know', 'what', 'they', 'speak', 'of', 'and', 'don''t', 'rely', 'on', 'cookie', 'cutter', 'characterization.', 'This', 'is', 'the', 'closest', 'thing', 'to', 'a', 'novel', 'that', 'you', 'will', 'find', 'on', 'TV.', 'It', 'is', 'so', 'impeccably', 'plotted', 'and', 'so', 'honest', 'and', 'realistic', 'that', 'I', 'will', 'never', 'be', 'able', 'to', 'watch', 'another', 'cop', 'show', '(or', 'any', 'TV', 'drama)', 'without', 'comparing', 'it', 'to', 'this', 'example', 'of', 'television', 'greatness.', 'Did', 'I', 'mention', 'its', 'also', 'the', 'smartest', 'TV', 'show', 'on', 'the', 'air', 'too?', 'The', 'Sopranos', 'gets', 'the', 'media', 'attention', 'but', 'it', 'can''t', 'match', 'the', 'sophistication', 'and', 'grittiness', 'of', 'The', 'Wire.', 'The', 'Sopranos', 'is', 'a', 'romanticized', 'TV', 'crime', 'drama', 'by', 'comparison.', 'And', 'as', 'for', 'Six', 'Feet', 'Under?', 'Please!', 'It', 'reached', 'its', 'peak', 'in', 'its', 'final', 'six', 'episodes', 'of', 'the', 'first', 'season', 'and', 'haven''t', 'lived', 'up', 'to', 'that', 'magic', 'since.', 'It', 'doesn''t', 'get', 'any', 'better', 'than', 'The', 'Wire.', 'Universal', 'critical', 'acclaim.', 'The', 'winner', 'of', 'the', '2002', 'TV', 'Critics', 'awards.', 'The', 'winner', 'of', 'the', '2004', 'Peabody', 'award.', 'Nuff', 'said.']

In [118]:
q = ['HBO''s', '"The', 'Wire",', 'another', 'ground', 'breaking', 'TV', 'crime', 'series', 'from', 'David', 'Simon', 'who', 'grandfathered', '"Homicide:', 'Life', 'on', 'the', 'Street",', 'raises', 'the', 'bar', 'for', 'crime', 'dramas', 'by', 'dedicating', 'a', 'whole', 'season', '(13', 'episodes)', 'to', 'a', 'single', 'story', 'with', 'unparalleled', 'realism.', 'Telling', 'of', 'a', 'motley', 'bunch', 'of', 'detectives', 'who', 'set', 'about', 'to', 'bring', 'down', 'a', 'Baltimore', 'drug', 'ring', 'which', 'supplies', 'a', 'black', 'innercity', 'housing', 'project,', 'the', 'gritty', '12', 'hour', 'first', 'year', 'series', 'slowly', 'develops', 'a', 'broad', 'range', 'of', 'characters', 'from', 'street', 'punks', 'to', 'senators', 'in', 'a', 'world', 'where', 'the', 'blacks', 'and', 'whites', 'of', 'good', 'and', 'evil', 'are', 'reduced', 'to', 'shades', 'of', 'gray', 'and', 'everyone', 'is', 'connected', 'by', 'their', 'humanity', 'for', 'better', 'or', 'for', 'worse.', 'Not', 'the', 'usual', 'cops', 'vs', 'bad', 'guys', 'fare', 'with', 'episodic', 'ups', 'and', 'downs,', '"The', 'Wire"', 'is', 'one', 'long', 'drama', 'about', 'people', 'which', 'happens', 'in', 'a', 'law', 'enforcement', 'and', 'crime', 'setting.', 'For', 'realists', 'only,', 'this', 'series', 'will', 'require', 'some', 'viewer', 'patience', 'while', 'the', 'complexities', 'of', 'the', 'plot', 'and', 'the', 'characters', 'are', 'developed.', 'One', 'of', 'a', 'kind...so', 'far.', '(A)']

In [137]:
q = ['the', 'show', 'is','great']

In [138]:
net(
    mx.nd.reshape(
        mx.nd.array(vocab[q], ctx=context),
        shape=(-1, 1)), mx.nd.array([4], ctx=context)).sigmoid()


[[0.8228829]]
<NDArray 1x1 @cpu(0)>

In [28]:
q = ['Private', 'equity', 'firms', 'Bain', 'Capital', 'and', 'Hellman', 'Friedman', 'agree', 'to', 'a', 'deal', 'to', 'buy', 'Lehman', 'Brothers', 'Holdings''prized', 'Neuberger', 'Berman', 'asset', 'management', 'unit', 'and', 'other', 'businesses', 'for', '$2.15', 'billion,', 'less', 'than', 'original', 'estimates', 'of', 'its', 'worth.']

In [49]:
q = ['Morgan', 'Stanley', 'shares', 'fall', 'nearly', '8', 'percent,', 'following', 'news', 'that', 'Japan''s', 'biggest', 'bank,', 'Mitsubishi', 'UFJ', 'Financial', 'Group,', 'will', 'take', 'a', '21-percent', 'stake', 'in', 'the', 'Wall', 'Street', 'firm.']

In [47]:
q = ['it', 'was', 'amazing', 'the', 'whole', 'time']

## Training with better data

In [53]:
import json, requests
import pandas as pd

In [None]:
#q = 'https://stocknewsapi.com/api/v1?tickers=GS&items=50&token=vteixkw9an9ggtxuv54gqqrptqmx5fxoxtct73vs'
#r = requests.get(q)

In [None]:
j = r.json()
j

In [None]:
o = []
for i in j['data']:
    oo = {'title':i['title'],'text':i['text'], 'date':i['date'], 'sentiment':i['sentiment']}
    o.append(oo)
    
o = pd.DataFrame(o)
o.to_csv('')
o

In [54]:
df = pd.read_csv('train_data_50_rows.tsv', sep='\t')
df.head()

Unnamed: 0,date,sentiment,text,title
0,"Wed, 24 Jul 2019 09:35:00 -0400",Positive,Now that we've seen second-quarter earnings fr...,3 Top Bank Stocks to Buy Right Now
1,"Mon, 22 Jul 2019 18:07:27 -0400",Neutral,"A Goldman Sachs bond deal went bust, and a gro...",Why Private Detectives Are Eyeing a Goldman Bo...
2,"Mon, 22 Jul 2019 14:31:07 -0400",Positive,"The ""Halftime Report"" traders debate Goldman S...",Goldman upgrades these chip stocks
3,"Fri, 19 Jul 2019 13:15:02 -0400",Negative,"EVP, Global Head of HCM of Goldman Sachs Group...","Goldman Sachs Group Inc (GS) EVP, Global Head ..."
4,"Fri, 19 Jul 2019 10:10:21 -0400",Positive,Goldman Sachs BDC (GSBD) has an impressive ear...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...


In [72]:
from datetime import datetime
df['date'] = [str(datetime.strptime(x[5:16], '%d %b %Y').date()) for x in df['date']]
df.head()

Unnamed: 0,date,sentiment,text,title
0,2019-07-24,Positive,Now that we've seen second-quarter earnings fr...,3 Top Bank Stocks to Buy Right Now
1,2019-07-22,Neutral,"A Goldman Sachs bond deal went bust, and a gro...",Why Private Detectives Are Eyeing a Goldman Bo...
2,2019-07-22,Positive,"The ""Halftime Report"" traders debate Goldman S...",Goldman upgrades these chip stocks
3,2019-07-19,Negative,"EVP, Global Head of HCM of Goldman Sachs Group...","Goldman Sachs Group Inc (GS) EVP, Global Head ..."
4,2019-07-19,Positive,Goldman Sachs BDC (GSBD) has an impressive ear...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...


In [74]:
df['collated'] = df['title'] + '. '+ df['text']
df.head()

Unnamed: 0,date,sentiment,text,title,collated
0,2019-07-24,Positive,Now that we've seen second-quarter earnings fr...,3 Top Bank Stocks to Buy Right Now,3 Top Bank Stocks to Buy Right Now. Now that w...
1,2019-07-22,Neutral,"A Goldman Sachs bond deal went bust, and a gro...",Why Private Detectives Are Eyeing a Goldman Bo...,Why Private Detectives Are Eyeing a Goldman Bo...
2,2019-07-22,Positive,"The ""Halftime Report"" traders debate Goldman S...",Goldman upgrades these chip stocks,"Goldman upgrades these chip stocks. The ""Halft..."
3,2019-07-19,Negative,"EVP, Global Head of HCM of Goldman Sachs Group...","Goldman Sachs Group Inc (GS) EVP, Global Head ...","Goldman Sachs Group Inc (GS) EVP, Global Head ..."
4,2019-07-19,Positive,Goldman Sachs BDC (GSBD) has an impressive ear...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...


In [75]:
k = {'Positive':1, 'Neutral':0.5, 'Negative':0}

In [77]:
df['score'] = [k[x] for x in df['sentiment']]
df.head()

Unnamed: 0,date,sentiment,text,title,collated,score
0,2019-07-24,Positive,Now that we've seen second-quarter earnings fr...,3 Top Bank Stocks to Buy Right Now,3 Top Bank Stocks to Buy Right Now. Now that w...,1.0
1,2019-07-22,Neutral,"A Goldman Sachs bond deal went bust, and a gro...",Why Private Detectives Are Eyeing a Goldman Bo...,Why Private Detectives Are Eyeing a Goldman Bo...,0.5
2,2019-07-22,Positive,"The ""Halftime Report"" traders debate Goldman S...",Goldman upgrades these chip stocks,"Goldman upgrades these chip stocks. The ""Halft...",1.0
3,2019-07-19,Negative,"EVP, Global Head of HCM of Goldman Sachs Group...","Goldman Sachs Group Inc (GS) EVP, Global Head ...","Goldman Sachs Group Inc (GS) EVP, Global Head ...",0.0
4,2019-07-19,Positive,Goldman Sachs BDC (GSBD) has an impressive ear...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...,1.0


In [82]:
df['collated'][0].split(' ')

['3',
 'Top',
 'Bank',
 'Stocks',
 'to',
 'Buy',
 'Right',
 'Now.',
 'Now',
 'that',
 "we've",
 'seen',
 'second-quarter',
 'earnings',
 'from',
 'the',
 'big',
 'banks,',
 'our',
 'contributors',
 'think',
 'these',
 'three',
 'are',
 'worth',
 'a',
 'closer',
 'look.']

In [80]:
q

['Morgan',
 'Stanley',
 'shares',
 'fall',
 'nearly',
 '8',
 'percent,',
 'following',
 'news',
 'that',
 'Japans',
 'biggest',
 'bank,',
 'Mitsubishi',
 'UFJ',
 'Financial',
 'Group,',
 'will',
 'take',
 'a',
 '21-percent',
 'stake',
 'in',
 'the',
 'Wall',
 'Street',
 'firm.']

In [108]:
net(
    mx.nd.reshape(
        mx.nd.array(vocab[q], ctx=context),
        shape=(-1, 1)), mx.nd.array([4], ctx=context)).sigmoid().asnumpy()[0][0]

0.6082939

In [110]:
r = [net(
    mx.nd.reshape(
        mx.nd.array(vocab[x[0].split(' ')], ctx=context),
        shape=(-1, 1)), mx.nd.array([4], ctx=context)).sigmoid().asnumpy()[0][0] for x in df['collated']]

df['pred'] = r
df.head()

Unnamed: 0,date,sentiment,text,title,collated,score,pred
0,2019-07-24,Positive,Now that we've seen second-quarter earnings fr...,3 Top Bank Stocks to Buy Right Now,3 Top Bank Stocks to Buy Right Now. Now that w...,1.0,0.494826
1,2019-07-22,Neutral,"A Goldman Sachs bond deal went bust, and a gro...",Why Private Detectives Are Eyeing a Goldman Bo...,Why Private Detectives Are Eyeing a Goldman Bo...,0.5,0.5145
2,2019-07-22,Positive,"The ""Halftime Report"" traders debate Goldman S...",Goldman upgrades these chip stocks,"Goldman upgrades these chip stocks. The ""Halft...",1.0,0.488803
3,2019-07-19,Negative,"EVP, Global Head of HCM of Goldman Sachs Group...","Goldman Sachs Group Inc (GS) EVP, Global Head ...","Goldman Sachs Group Inc (GS) EVP, Global Head ...",0.0,0.488803
4,2019-07-19,Positive,Goldman Sachs BDC (GSBD) has an impressive ear...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...,1.0,0.440644


In [111]:
df

Unnamed: 0,date,sentiment,text,title,collated,score,pred
0,2019-07-24,Positive,Now that we've seen second-quarter earnings fr...,3 Top Bank Stocks to Buy Right Now,3 Top Bank Stocks to Buy Right Now. Now that w...,1.0,0.494826
1,2019-07-22,Neutral,"A Goldman Sachs bond deal went bust, and a gro...",Why Private Detectives Are Eyeing a Goldman Bo...,Why Private Detectives Are Eyeing a Goldman Bo...,0.5,0.5145
2,2019-07-22,Positive,"The ""Halftime Report"" traders debate Goldman S...",Goldman upgrades these chip stocks,"Goldman upgrades these chip stocks. The ""Halft...",1.0,0.488803
3,2019-07-19,Negative,"EVP, Global Head of HCM of Goldman Sachs Group...","Goldman Sachs Group Inc (GS) EVP, Global Head ...","Goldman Sachs Group Inc (GS) EVP, Global Head ...",0.0,0.488803
4,2019-07-19,Positive,Goldman Sachs BDC (GSBD) has an impressive ear...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...,Can Goldman Sachs BDC (GSBD) Keep the Earnings...,1.0,0.440644
5,2019-07-18,Negative,Booming stock markets around the globe helped ...,Goldman Sachs' equities-backed earnings result...,Goldman Sachs' equities-backed earnings result...,0.0,0.488803
6,2019-07-18,Positive,"The ""Halftime Report"" traders give their top s...","Final Trades: Abbott Labs, Goldman Sachs, Disn...","Final Trades: Abbott Labs, Goldman Sachs, Disn...",1.0,0.393133
7,2019-07-18,Positive,Don't fret! Value stocks can be had in today's...,3 Great Value Stocks to Buy This July,3 Great Value Stocks to Buy This July. Don't f...,1.0,0.494826
8,2019-07-18,Negative,Booming stock markets around the globe helped ...,Goldman Sachs' equities-backed earnings result...,Goldman Sachs' equities-backed earnings result...,0.0,0.488803
9,2019-07-17,Positive,GS released Q2 results July 16th and has recen...,"The Goldman Sachs Group, Inc. - Friend Of The ...","The Goldman Sachs Group, Inc. - Friend Of The ...",1.0,0.465639
