# Import for Loading Custom Dataset using TorchText

In [19]:
import torchtext
import re
from torchtext import data
from torchtext import vocab

# Import for NN

In [20]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import numpy as np
import torch.optim as optim

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm, tqdm_notebook, tnrange

import math

In [21]:
tqdm.pandas(desc='Progress')

# Batch Generator

If we use BucketIterator that is provided by TorchText, it returns a Batch object. Instead we can do two things : 
1. Write extra code in the training loop
2. Write iterable wrapper around batch Object from which we can get desired data.

BatchGenerator does (2).

In [22]:
class BatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X,y)

# Tokenizer

This is the tokenizer used by torchtext for doing preprocessing on the text field.

In [23]:
def tokenizer(s): 
    return [w.lower() for w in tweet_clean(s)]

# Cleaning Data

This is called within the tokenizer to remove unnecessary aspects from the text data.

In [24]:
def tweet_clean(text):
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric character
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','',text)
    # re.sub(r'https?:/\/\S+', ' ', text) # remove links
#     text = text.replace(',',';')
    return text.strip()

# Create Train Dataset

In [25]:
def CreateTrainDS(valid):
#     valid = 'Bitcoin'
    result = pd.DataFrame()
    fields = ['Article','Label']
    file_dict = {'Altcoin':'newsAltcoin.tsv','Binance':'newsBinance.tsv','Bitcoin':'newsBitcoin.tsv','Blockchain':'newsBlockchain.tsv','CoinBase':'newsCoinBase.tsv','Ethereum':'newsEth.tsv','ICO':'newsICO.tsv','Litecoin':'newsLitecoin.tsv','Mining':'newsMining.tsv','Poloniex':'newsPoloniex.tsv','Satoshi':'newsSatoshi.tsv','Wallet':'newsWallet.tsv'}
    valid_file = "/home/nithin/Git/Cryptic/SentimentAnalysis/News/TSV/"+file_dict[valid]
    train_file = "/home/nithin/Git/Cryptic/SentimentAnalysis/News/TSV/NewsTrain.tsv"
    for i in file_dict:
        df = pd.DataFrame()
        if(i!=valid):
            df = pd.read_csv("/home/nithin/Git/Cryptic/SentimentAnalysis/News/TSV/"+file_dict[i], sep = '\t', usecols = fields)
            if(result.empty):
                result = df
            else:
                result = pd.merge(result, df, how = "outer", on = fields)
    df = pd.DataFrame()
    df = pd.read_csv(valid_file, sep = '\t', usecols = fields)
    result = pd.merge(result, df, how = "left", on = fields)
    result.to_csv(train_file, sep = '\t', header = fields, index = False)
    return train_file, valid_file

# Field Definition

In [26]:
txt_field = data.Field(sequential=True, 
                       # tokenize=tokenizer, 
                       include_lengths=True, 
                       use_vocab=True)
label_field = data.Field(sequential=False, 
                             use_vocab=False, 
                             pad_token=None, 
                             unk_token=None)

# Defining Train and Validation Sets

In [27]:
def DefTrainValid(train_file,valid_File):
    train_val_fields = [
        ('Article', txt_field), # process it as text
        ('Label', label_field) # process it as label
    ]
    trainds, valds = data.TabularDataset.splits(path='/home/nithin/Git/Cryptic/SentimentAnalysis/News/TSV', 
                                            format='tsv', 
                                            train=train_file, 
                                            validation=valid_File, 
                                            fields=train_val_fields, 
skip_header=True)
    return trainds, valds

## Get the capital amount from Trading Pair

Capital value is hardcoded right now. If the customer wants to trade x units of currency A for currency B, convert x of A into dollars and use this value as capital.

curr1 and curr2 are full names of currencies. If shortforms are used, change key values in file_dict

In [28]:
def NewsSentimentAnalysis(curr1,curr2):
#     capital = unitsOfCurr1 * usdExchangeValue
    capital = curr1 * 50 
    valid = curr2 
    train_file, valid_file = CreateTrainDS(valid)
    trainds, valds = DefTrainValid(train_file, valid_file)
    vec = vocab.Vectors('glove.twitter.27B.100d.txt', '/home/nithin/Git/Cryptic/GloVe-1.2')
    txt_field.build_vocab(trainds, valds, max_size=200000, vectors=vec)
    label_field.build_vocab(trainds)
    traindl, valdl = data.BucketIterator.splits(datasets=(trainds, valds), 
                                            batch_sizes=(512,1024), 
                                            sort_key=lambda x: len(x.Article), 
                                            device=-1, 
                                            sort_within_batch=True, 
                                            repeat=False)
    train_batch_it = BatchGenerator(traindl, 'Article', 'Label') # use the wrapper to convert Batch to data
    val_batch_it = BatchGenerator(valdl, 'Article', 'Label')
    
    vocab_size = len(txt_field.vocab)
    embedding_dim = 100
    n_hidden = 64
    n_out = 3

    m = SimpleGRU(vocab_size, embedding_dim, n_hidden, n_out, trainds.fields['Article'].vocab.vectors)
    opt = optim.Adam(filter(lambda p: p.requires_grad, m.parameters()), 1e-3)

    fit(capital, model=m, train_dl=train_batch_it, val_dl=val_batch_it, loss_fn=F.nll_loss, opt=opt, epochs=15)

# GRU and ConcatPooling Models

In [29]:
class SimpleGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec, bidirectional=True):
        super().__init__()
        self.vocab_size,self.embedding_dim,self.n_hidden,self.n_out,self.bidirectional = vocab_size, embedding_dim, n_hidden, n_out, bidirectional
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb.weight.data.copy_(pretrained_vec)
        self.emb.weight.requires_grad = False
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden, bidirectional=bidirectional)
        self.out = nn.Linear(self.n_hidden, self.n_out)
        
    def forward(self, seq, lengths):
        bs = seq.size(1) # batch size
        seq = seq.transpose(0,1)
        self.h = self.init_hidden(bs) # initialize hidden state of GRU
        embs = self.emb(seq)
        embs = embs.transpose(0,1)
        embs = pack_padded_sequence(embs, lengths) # unpad
        gru_out, self.h = self.gru(embs, self.h) # gru returns hidden state of all timesteps as well as hidden state at last timestep
        gru_out, lengths = pad_packed_sequence(gru_out) # pad the sequence to the max length in the batch
        # since it is as classification problem, we will grab the last hidden state
        outp = self.out(self.h[-1]) # self.h[-1] contains hidden state of last timestep
#         return F.log_softmax(outp, dim=-1)
        return F.log_softmax(outp)
    
    def init_hidden(self, batch_size): 
        if self.bidirectional:
            return torch.zeros((2,batch_size,self.n_hidden))
        else:
            return torch.zeros((1,batch_size,self.n_hidden))

In [30]:
class ConcatPoolingGRUAdaptive(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec, bidirectional=True):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.bidirectional = bidirectional
        
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb.weight.data.copy_(pretrained_vec) # load pretrained vectors
        self.emb.weight.requires_grad = False # make embedding non trainable
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden, bidirectional=bidirectional)
        if bidirectional:
            self.out = nn.Linear(self.n_hidden*2*2, self.n_out)
        else:
            self.out = nn.Linear(self.n_hidden*2, self.n_out)
        
    def forward(self, seq, lengths):
        bs = seq.size(1)
        self.h = self.init_hidden(bs)
        seq = seq.transpose(0,1)
        embs = self.emb(seq)
        embs = embs.transpose(0,1)
        embs = pack_padded_sequence(embs, lengths)
        gru_out, self.h = self.gru(embs, self.h)
        gru_out, lengths = pad_packed_sequence(gru_out)        
        
        avg_pool = F.adaptive_avg_pool1d(gru_out.permute(1,2,0),1).view(bs,-1)
        max_pool = F.adaptive_max_pool1d(gru_out.permute(1,2,0),1).view(bs,-1)        
        outp = self.out(torch.cat([avg_pool,max_pool],dim=1))
        return F.log_softmax(outp, dim=-1)
    
    def init_hidden(self, batch_size): 
        if self.bidirectional:
            return torch.zeros((2,batch_size,self.n_hidden))
        else:
            return torch.zeros((1,batch_size,self.n_hidden))

# Fit Function

In [37]:
def fit(capital, model, train_dl, val_dl, loss_fn, opt, epochs=3):
    num_batch = len(train_dl)
    senti_dict = {}
    
    for epoch in tnrange(epochs):      
        y_true_train = list()
        y_pred_train = list()
        total_loss_train = 0          
        
        t = tqdm_notebook(iter(train_dl), leave=False, total=num_batch)
        for (X,lengths),y in t:
            t.set_description("Epoch {0}".format(epoch))
            lengths = lengths.cpu().numpy()
                
            opt.zero_grad()
            pred = model(X, lengths)
            loss = loss_fn(pred, y)
            loss.backward()
            opt.step()
            
            t.set_postfix(loss=loss.item())
            pred_idx = torch.max(pred, dim=1)[1]
            
            y_true_train += list(y.cpu().data.numpy())
            y_pred_train += list(pred_idx.cpu().data.numpy())
            total_loss_train += loss.item()
            
        train_acc = accuracy_score(y_true_train, y_pred_train)
        train_loss = total_loss_train/len(train_dl)
        
        if val_dl:
            y_true_val = list()
            y_pred_val = list()
            total_loss_val = 0
            for (X,lengths),y in tqdm_notebook(val_dl, leave=False):
                pred = model(X, lengths.cpu().numpy())
                loss = loss_fn(pred, y)
                pred_idx = torch.max(pred, 1)[1]
                y_true_val += list(y.cpu().data.numpy())
                y_pred_val += list(pred_idx.cpu().data.numpy())
                total_loss_val += loss.item()
            valacc = accuracy_score(y_true_val, y_pred_val)
            valloss = total_loss_val/len(val_dl)
            y4senti = [i for i in y_pred_val if i!=1]
            avg_senti = np.mean(y4senti)
            if(math.isnan(avg_senti)):
                avg_senti = 1.0
            senti_dict[valacc] = avg_senti
            
            if(avg_senti<1.0):
                new_amt = (avg_senti/2)*capital
            elif(avg_senti==1.0):
                new_amt = .5 * capital
            else:
                new_amt = (.5+((avg_senti - 1)/2))*capital
            print("Epoch {0}: train_loss: {1:.4f} train_acc: {2:.4f} | val_loss: {3:.4f} val_acc: {4:.4f} avg_senti: {5:.4}".format(epoch,train_loss,train_acc,valloss,valacc,avg_senti))
            print("Original Capital : {0} New Capital : {1}".format(capital,new_amt))
        else:
            print("Epoch {0}: train_loss: {1:.4f} train_acc: {2:.4f}".format(epoch,train_loss,train_acc))
    if(val_dl):
        print("\nMaximum accuracy and corresponding sentiment : ", max(senti_dict), senti_dict[max(senti_dict)])
        if(avg_senti<1.0):
                new_amt = (avg_senti/2)*capital
                final_senti = 0 + ((avg_senti)/2)*100
        elif(avg_senti==1.0):
            new_amt = .5 * capital
            final_senti = 50
        else:
            new_amt = (.5+((avg_senti - 1)/2))*capital
            final_senti = 50 + ((avg_senti-1)/2)*100
        print("Original Capital : {0} New Capital : {1}".format(capital,new_amt))
        print("Final Sentiment : ",final_senti)
        return final_senti

In [38]:
finalSentiment = NewsSentimentAnalysis(50,"Altcoin")

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))



HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 0: train_loss: 1.1267 train_acc: 0.1176 | val_loss: 1.0252 val_acc: 0.4615 avg_senti: 0.0
Original Capital : 2500 New Capital : 0.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 1: train_loss: 1.0660 train_acc: 0.3908 | val_loss: 0.9697 val_acc: 0.6923 avg_senti: 0.0
Original Capital : 2500 New Capital : 0.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 2: train_loss: 1.0102 train_acc: 0.6807 | val_loss: 0.9184 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 3: train_loss: 0.9590 train_acc: 0.7605 | val_loss: 0.8712 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 4: train_loss: 0.9121 train_acc: 0.7899 | val_loss: 0.8278 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 5: train_loss: 0.8694 train_acc: 0.7983 | val_loss: 0.7881 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 6: train_loss: 0.8307 train_acc: 0.7983 | val_loss: 0.7518 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 7: train_loss: 0.7957 train_acc: 0.7983 | val_loss: 0.7190 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 8: train_loss: 0.7644 train_acc: 0.7983 | val_loss: 0.6894 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 9: train_loss: 0.7366 train_acc: 0.7983 | val_loss: 0.6631 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 10: train_loss: 0.7123 train_acc: 0.7983 | val_loss: 0.6400 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 11: train_loss: 0.6913 train_acc: 0.7983 | val_loss: 0.6202 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 12: train_loss: 0.6736 train_acc: 0.7983 | val_loss: 0.6034 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 13: train_loss: 0.6591 train_acc: 0.7983 | val_loss: 0.5898 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Epoch 14: train_loss: 0.6475 train_acc: 0.7983 | val_loss: 0.5790 val_acc: 0.8462 avg_senti: 1.0
Original Capital : 2500 New Capital : 1250.0


Maximum accuracy and corresponding sentiment :  0.8461538461538461 1.0
Original Capital : 2500 New Capital : 1250.0
Final Sentiment :  50
