In [48]:
import torch
import torch.nn as nn 
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision.datasets as dsets 
import torchvision.transforms as transforms
import torchtext 
from torchtext import data
from gensim.models import KeyedVectors
import gensim
import nltk
from sklearn.metrics import f1_score

In [49]:
# Hyper-parameters for the CNN classifier defined below.
# Placeholder only: the model is later constructed with len(TEXT.vocab).
vocab_size = 'na' 
# Width of each word-embedding vector.
embedding_dim = 300
# Output channels of every convolution.
n_filters = 100 
# Convolution window heights, in tokens.
filter_sizes = [3,4,5]
# Number of logits the classifier produces (one per stance code 0/1/2).
output_dim = 3
# Dropout probability. The spelling is kept because the cell that builds the
# model reads this exact name.
droput = 0.5
# Disabled word2vec loading experiment:
#filename = 'GoogleNews-vectors-negative300.bin.gz'
#modelwv = KeyedVectors.load_word2vec_format(filename, binary=True)
#model = gensim.models.Word2Vec(filename)

In [50]:
import pandas as pd 

In [51]:
class CNNmodel(nn.Module):
    """Convolutional sentence classifier over word embeddings.

    Token ids are embedded, passed through one 2-D convolution per entry of
    ``filter_sizes`` (each kernel spans the full embedding width), max-pooled
    over the sentence axis, concatenated, passed through dropout and projected
    to ``output_dim`` logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        n_filters: output channels of every convolution.
        filter_sizes: kernel heights in tokens; any non-empty sequence.
        output_dim: number of logits produced per example.
        dropout: dropout probability applied to the pooled features.

    Raises:
        ValueError: if ``filter_sizes`` is empty.
    """

    def __init__(self, vocab_size,embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super(CNNmodel, self).__init__()
        self.filter_sizes = list(filter_sizes)
        if not self.filter_sizes:
            raise ValueError("filter_sizes must contain at least one kernel height")
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # One convolution per requested height. They are registered as
        # conv_0, conv_1, ... so attribute names and state_dict keys are the
        # same as with three hand-written layers, while any number of filter
        # sizes is supported and always matches the linear layer's input width.
        for idx, height in enumerate(self.filter_sizes):
            conv = nn.Conv2d(in_channels=1, out_channels=n_filters,
                             kernel_size=(height, embedding_dim))
            setattr(self, 'conv_%d' % idx, conv)
        self.linear = nn.Linear(len(self.filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: integer tensor of token ids shaped [sent len, batch size].

        Returns:
            Tensor shaped [batch size, output_dim].
        """
        x = x.transpose(1, 0)  # -> [batch size, sent len]
        embedded = self.embedding(x)  # -> [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)  # -> [batch size, 1, sent len, emb dim]

        # A sentence shorter than the tallest kernel would make Conv2d raise;
        # extend the sentence axis with zero vectors so every kernel fits.
        shortfall = max(self.filter_sizes) - embedded.shape[2]
        if shortfall > 0:
            embedded = F.pad(embedded, (0, 0, 0, shortfall))

        pooled = []
        for idx in range(len(self.filter_sizes)):
            conv = getattr(self, 'conv_%d' % idx)
            # The kernel covers the whole embedding width, so the last axis of
            # the convolution output has size 1 and can be dropped.
            conved = F.relu(conv(embedded)).squeeze(3)  # [batch, n_filters, steps]
            # Max over all positions of the sentence.
            pooled.append(F.max_pool1d(conved, conved.shape[2]).squeeze(2))

        out = self.dropout(torch.cat(pooled, dim=1))
        out = self.linear(out)
        return out

In [52]:
# Load the training set; header=0 takes the column names from the first row.
df = pd.read_csv( 'train.csv', index_col=None, header=0, engine='python' )

In [53]:
# List the columns available in the loaded frame.
print(df.columns) 

Index(['Tweet', 'Target', 'Stance', 'Opinion Towards', 'Sentiment'], dtype='object')


In [54]:
# Stance column: the labels for the classifier.
labels_stance = df['Stance']

In [55]:
# Raw tweet text column.
X_tweets = df['Tweet']

In [56]:
# Inspect the raw tweets.
print(X_tweets)

0       @tedcruz And, #HandOverTheServer she wiped cle...
1       Hillary is our best choice if we truly want to...
2       @TheView I think our country is ready for a fe...
3       I just gave an unhealthy amount of my hard-ear...
4       @PortiaABoulger Thank you for adding me to you...
5       Hillary can not win. Here's hoping the Dems of...
6       Respect FOR the law and respect BY the law Yes...
7       I don't want to be appointed to an Ambassador ...
8       #StopHillary2016 @HillaryClinton if there was ...
9       @HillaryClinton End lawless #ClintonFoundation...
10      Use your brain, keep Hillary out of the White ...
11      @HillaryClinton Hillary pandering with her log...
12      @readyforHRC @HillaryClinton #HillaryClinton, ...
13         @CiaraAntaya cuz you know I'm such a feminist 
14      2 million bogus followers on Twitter @HillaryC...
15      @lindasuhler : My name is Rebecca and my grand...
16      Where's the campaign store is the real questio...
17      It's a

In [57]:
# The network input is meant to be a d x S matrix, so the tweets still need
# to be tokenized and embedded.
batch_size = 100
n_iter = 3000
# Epochs needed to reach n_iter updates when one epoch consists of
# len(X_tweets) / batch_size batches; the fractional part is dropped.
num_epoch = int(n_iter / (len(X_tweets) / batch_size))
print(num_epoch)

102


In [58]:
# Peek at the first five stance labels.
labels_stance[:5]

0    AGAINST
1      FAVOR
2    AGAINST
3    AGAINST
4       NONE
Name: Stance, dtype: object

In [59]:
# Encode the stance strings as integer class ids and write the result back to
# the frame explicitly. An in-place replace on a Series pulled out of `df` only
# reaches `df` when the two happen to share memory (not the case under pandas
# copy-on-write), and a later cell builds its CSV from df['Stance'].
# Re-running this cell is harmless: already-encoded values are left untouched.
# astype fails loudly if an unexpected label string is still present.
df['Stance'] = df['Stance'].replace({'AGAINST': 0, 'NONE': 1, 'FAVOR': 2}).astype('int64')
labels_stance = df['Stance']

In [60]:
# Show the first five labels after encoding.
print(labels_stance[:5])

# Disabled one-hot encoding experiment:
#for i in labels_stance:
 #   y_onehot = (np.arange(3) == y_onehot[:,None]).astype(np.float32)

0    0
1    2
2    0
3    0
4    1
Name: Stance, dtype: int64


In [61]:
# Tensor copy of the encoded labels.
# NOTE(review): not referenced again in the visible cells.
labels_torch = torch.tensor(labels_stance)

In [62]:
# Shuffled mini-batches over the label Series only (no tweets attached).
# NOTE(review): not referenced again in the visible cells.
labels_loader = torch.utils.data.DataLoader(dataset= labels_stance, batch_size = batch_size, shuffle= True)

In [63]:
# Tokenise every tweet with NLTK. The source is df['Tweet'] rather than
# X_tweets itself so the cell can be re-run: feeding the already tokenised
# lists back into word_tokenize would raise.
X_tweets = df['Tweet'].apply(nltk.word_tokenize)

In [64]:
# Columns kept for the reduced training file.
cols = ['Tweet', 'Stance']

In [65]:
# New frame restricted to the columns listed in `cols`.
sdf = pd.DataFrame(df, columns=cols)

In [66]:
# Show the first three rows of the reduced frame.
print(sdf[:3])

                                               Tweet  Stance
0  @tedcruz And, #HandOverTheServer she wiped cle...       0
1  Hillary is our best choice if we truly want to...       2
2  @TheView I think our country is ready for a fe...       0


In [67]:
# Write the reduced frame to disk without the index column.
sdf.to_csv('new_train.csv', index = False)

In [68]:
import random 
# Field describing how tweet text is processed: tokenised with spaCy.
TEXT = data.Field(tokenize='spacy')
# Field for the stance labels. Values are produced as float tensors; the
# train/evaluate functions cast them to integer class indices before the loss.
LABEL = data.LabelField(dtype=torch.float)
# dir(TEXT)

In [69]:
# Load new_train.csv, mapping its two columns (in order) to the TEXT and LABEL
# fields; skip_header drops the header row.
train_data = data.TabularDataset(path='./new_train.csv', format='csv',fields=[('Tweet', TEXT),('Stance', LABEL)], skip_header= True)

In [70]:
# Build the token vocabulary (capped by max_size) and attach 300-d GloVe 6B
# vectors to it, then build the label vocabulary.
# NOTE(review): this runs before the train/validation split further down, so
# the vocabulary also covers tokens that end up in the validation set.
TEXT.build_vocab(train_data, max_size=25000, vectors= "glove.6B.300d")
LABEL.build_vocab(train_data)

In [71]:
# Number of examples loaded.
len(train_data)

2914

In [72]:
# Inspect one parsed example (its tokens and its label).
print(vars(train_data.examples[4]))

{'Tweet': ['@PortiaABoulger', 'Thank', 'you', 'for', 'adding', 'me', 'to', 'your', 'list'], 'Stance': '1'}


In [73]:
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [74]:
# Seed used for the train/validation split.
SEED = 1234

In [75]:

# Seed Python's RNG, then hand its state to split() explicitly.
# random.seed() returns None, so passing its result as random_state only
# worked because split() falls back to the current global RNG state; the
# resulting split is the same, but the intent is now visible.
# NOTE: train_data is overwritten with its own training portion, so running
# this cell twice splits the already reduced set again.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())
print(vars(train_data.examples[4]))

{'Tweet': ['@tonylee01979', 'make', 'their', 'own', 'decisions', 'does', "n't", 'mean', 'you', 'need', 'to', 'harass', 'me', '.', 'Goodbye', 'now', '.', '#', 'freedom', '#', 'SemST'], 'Stance': '1'}


In [76]:
# Mini-batch size for the torchtext iterators.
BATCH_SIZE = 64

In [77]:
# Batch iterators for both splits. sort_key orders examples by tweet length
# when bucketing, and batches are created on `device`.
train_iterator, valid_iterator= data.BucketIterator.splits(
    (train_data, valid_data), sort_key=lambda x: len(x.Tweet),
    batch_size=BATCH_SIZE, 
    device=device)


In [78]:
import numpy as np

# Sanity check: print each batch's labels and their one-hot encoding.
for batch_idx, (x, y) in enumerate(train_iterator):
    # .cpu() is a no-op for CPU tensors and makes the NumPy conversion work
    # when the iterator yields CUDA tensors.
    y_onehot = y.cpu().numpy()
    print(y_onehot)
    # Comparing each label against [0, 1, 2] yields one row per example with a
    # single 1.0 in the column of its class.
    y_onehot = (np.arange(3) == y_onehot[:, None]).astype(np.float32)
    print(y_onehot)
    y_onehot = torch.from_numpy(y_onehot)
    #y = y_onehot

# Second pass: print the raw label tensors only.
for batch_idx, (x, y) in enumerate(train_iterator):
    print(y)

[2. 2. 2. 1. 0. 0. 2. 0. 1. 2. 0. 2. 0. 1. 0. 1. 1. 2. 0. 0. 0. 2. 1. 0.
 2. 1. 2. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0.
 2. 0. 0. 1. 2. 2. 2. 1. 0. 2. 0. 0. 1. 0. 0. 0.]
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
[1. 0. 2. 1. 0. 2. 1. 2. 1. 1. 0. 0

In [79]:
# Show the token-id tensor of the first training batch only.
for i in train_iterator:
    tweet , la = i
    print(tweet)
    break

tensor([[  11,   11,   78,  ...,  925,  233,   30],
        [ 146,  130,   57,  ..., 1899,   25,   14],
        [ 312,   23,   43,  ...,   20,  610, 1287],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])


In [80]:
# Embedding table size = number of entries in the TEXT vocabulary.
input_dim = len(TEXT.vocab)
# Must match the width of the pre-trained vectors copied into the model.
embedding_dim = 300
#len(input_dim)

In [81]:
import torch.optim as optim

# Build the classifier and the loss on the target device, then hand the
# model's parameters to Adadelta with its default settings.
model = CNNmodel(
    vocab_size=input_dim,
    embedding_dim=embedding_dim,
    n_filters=n_filters,
    filter_sizes=filter_sizes,
    output_dim=output_dim,
    dropout=droput,
).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adadelta(model.parameters())


In [82]:
# Vectors attached to the TEXT vocabulary, one row per vocabulary entry.
pretrained_embeddings = TEXT.vocab.vectors

# Overwrite the embedding layer's weights in place with those vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)
print(pretrained_embeddings.shape)
# Disabled parameter-shape dump:
#for name, parameter in model.named_parameters():
 #   print(parameter.shape)

torch.Size([10990, 300])


In [83]:
def binary_accuracy(preds, y):
    """Return the fraction of rows of ``preds`` whose arg-max equals ``y``.

    Despite the name, nothing here is specific to two classes: the predicted
    class is the index of the largest score along dimension 1.

    Args:
        preds: tensor of per-class scores shaped [batch size, n classes].
        y: tensor of class indices shaped [batch size].

    Returns:
        NumPy floating-point scalar in [0, 1] (it supports ``.item()``).
    """
    _, predicted = torch.max(preds.data, 1)
    # Move to host memory before converting: .numpy() raises for CUDA tensors.
    # detach() covers a target tensor that is tracked by autograd.
    predicted_np = predicted.cpu().numpy()
    target_np = y.detach().cpu().numpy()
    correct = (predicted_np == target_np).astype(int)
    acc = correct.sum()/len(correct)
    return acc

In [96]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: module called as ``model(batch.Tweet)`` to get class scores.
        iterator: sized iterable of batches exposing ``.Tweet`` and ``.Stance``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(predictions, targets)``.

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(batch.Tweet).squeeze(1)

        # Cast the labels on the tensor itself. A NumPy round-trip raises for
        # CUDA tensors, relies on np.long (removed in NumPy 1.24) and would
        # leave the targets on the CPU while the predictions may be on the GPU.
        targets = batch.Stance.long()

        loss = criterion(predictions, targets)
        acc = binary_accuracy(predictions, targets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [105]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over ``iterator`` without updating the model.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: module called as ``model(batch.Tweet)`` to get class scores.
        iterator: sized iterable of batches exposing ``.Tweet`` and ``.Stance``.
        criterion: loss called as ``criterion(predictions, targets)``.

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.Tweet).squeeze(1)

            # Cast the labels on the tensor itself. A NumPy round-trip raises
            # for CUDA tensors and relies on np.long (removed in NumPy 1.24).
            targets = batch.Stance.long()

            loss = criterion(predictions, targets)
            acc = binary_accuracy(predictions, targets)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [106]:
# Number of passes over the training set.
N_EPOCHS = 10

# Each epoch: one training pass, one validation pass, then a summary line.
# NOTE(review): the recorded output already shows ~99.6% training accuracy in
# the first epoch, which suggests the model had been trained by earlier runs
# of this cell; re-create the model before re-running for a clean curve.
for epoch in range(N_EPOCHS):
    print(epoch)

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    #print(1)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

0
| Epoch: 01 | Train Loss: 0.015 | Train Acc: 99.61% | Val. Loss: 1.688 | Val. Acc: 58.07% |
1
| Epoch: 02 | Train Loss: 0.012 | Train Acc: 99.76% | Val. Loss: 1.648 | Val. Acc: 58.40% |
2
| Epoch: 03 | Train Loss: 0.015 | Train Acc: 99.76% | Val. Loss: 1.649 | Val. Acc: 58.74% |
3
| Epoch: 04 | Train Loss: 0.012 | Train Acc: 99.80% | Val. Loss: 1.734 | Val. Acc: 58.90% |
4
| Epoch: 05 | Train Loss: 0.013 | Train Acc: 99.85% | Val. Loss: 1.798 | Val. Acc: 59.57% |
5
| Epoch: 06 | Train Loss: 0.010 | Train Acc: 99.76% | Val. Loss: 1.695 | Val. Acc: 59.63% |
6
| Epoch: 07 | Train Loss: 0.012 | Train Acc: 99.85% | Val. Loss: 1.720 | Val. Acc: 59.24% |
7
| Epoch: 08 | Train Loss: 0.010 | Train Acc: 99.71% | Val. Loss: 1.927 | Val. Acc: 56.21% |
8
| Epoch: 09 | Train Loss: 0.006 | Train Acc: 99.95% | Val. Loss: 1.733 | Val. Acc: 59.85% |
9
| Epoch: 10 | Train Loss: 0.013 | Train Acc: 99.80% | Val. Loss: 1.859 | Val. Acc: 56.44% |
