In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.autograd import Variable

In [17]:
data = pd.read_csv('./sentiment_analysis/Sentiment Analysis Dataset.csv',error_bad_lines=False)
data = data.sample(frac=1).reset_index(drop=True)
y = np.zeros(data.shape[0])

Skipping line 8836: expected 4 fields, saw 5

Skipping line 535882: expected 4 fields, saw 7



In [18]:
data.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1219540,0,Sentiment140,"Twitter keeps deleting my updatesss And yes, ..."
1,1492898,1,Sentiment140,making plans for a lovely day tommorow bubble...
2,1237195,1,Sentiment140,That was the quickest work day of my life. And...
3,174537,1,Sentiment140,@dropdeadvictor tï¿½ sim..
4,769329,1,Sentiment140,HEY GUYS EVERYONE ADD @MattWayneCeleb please a...


In [22]:
X_train_valid, X_test, y_train_valid, y_test = train_test_split(data, y,test_size=0.2)

In [23]:
X_test.Sentiment.value_counts()

0    157969
1    157754
Name: Sentiment, dtype: int64

In [24]:
X_train_valid.Sentiment.value_counts()

1    632423
0    630466
Name: Sentiment, dtype: int64

In [25]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid,test_size=0.2)

In [26]:
X_train.Sentiment.value_counts()

1    505950
0    504361
Name: Sentiment, dtype: int64

In [27]:
X_valid.Sentiment.value_counts()

1    126473
0    126105
Name: Sentiment, dtype: int64

In [28]:
X_train.to_csv('./sentiment_analysis/train_dataset.csv',index=False)
X_test.to_csv('./sentiment_analysis/test_dataset.csv',index=False)
X_valid.to_csv('./sentiment_analysis/valid_dataset.csv',index=False)

In [30]:
del X_train,X_test, X_train_valid,y_train,y_test,y_train_valid

----------------------------

In [46]:
x_train = pd.read_csv('sentiment_analysis/train_dataset.csv')

In [4]:
from gensim.models import word2vec
from nltk import word_tokenize,sent_tokenize
import re
import tqdm
import json
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',    level=logging.INFO)

In [56]:
def prepare_data_word2vec(x):
    sent = []
    out = x.strip()
    out= "".join(i for i in out if ord(i)<128)
    out = sent_tokenize(out)
    for i in out:
        t = word_tokenize(re.sub(r"[^a-z0-9]+", " ", i.lower()))
        sent.append(t)
    return sent

In [59]:
sentences = []
for i in tqdm.tqdm_notebook(x_train.SentimentText):
    sentences.append(prepare_data_word2vec(i))
    #except UnicodeDecodeError :
    #    print i.encode('utf-8')
    #    break

A Jupyter Widget




In [62]:
sents = [item for sublist in sentences for item in sublist]

In [63]:
len(sents)

1735107

In [72]:
w2v_model = word2vec.Word2Vec(sentences=sents, size=100, window=5,iter=10, min_count=5)

In [84]:
w2v_model.most_similar("lol")

  """Entry point for launching an IPython kernel.


[('lmao', 0.8368069529533386),
 ('haha', 0.8177106380462646),
 ('hahaha', 0.6593310832977295),
 ('lmfao', 0.6513242125511169),
 ('hehe', 0.6214161515235901),
 ('lolz', 0.6208990812301636),
 ('hahah', 0.6194578409194946),
 ('cuz', 0.6178313493728638),
 ('rofl', 0.603983461856842),
 ('lmaoo', 0.5976788401603699)]

In [76]:
#### Save weights
weights = w2v_model.wv.syn0
np.save(open('./sentiment_analysis/w2v_weights', 'wb'), weights)

  


In [80]:
vocab = dict([(k, v.index) for k, v in w2v_model.wv.vocab.items()])
with open('./sentiment_analysis/w2v_vocab', 'w') as f:
    f.write(json.dumps(vocab))

In [81]:
### Save word2vec model
w2v_model.init_sims(replace=True)
w2v_model.save('./sentiment_analysis/w2v_model')

--------------------

In [2]:
weights = np.load('./sentiment_analysis/w2v_weights')

In [5]:
def load_vocab(vocab_path):
    with open(vocab_path, 'r') as f:
        data = json.loads(f.read())
    word2idx = data
    idx2word = dict([(v, k) for k, v in data.items()])
    return word2idx, idx2word

word2id,id2word = load_vocab('./sentiment_analysis/w2v_vocab')

In [6]:
def prepare_data(x):
    max_len = 35
    sent = []
    out = x.strip()
    out= "".join(i for i in out if ord(i)<128)
    out = word_tokenize(re.sub(r"[^a-z0-9]+", " ", out.lower()))
    out_ = []
    for i in out:
        try:
            out_.append(word2id[i])
        except:
            continue
    padded = np.zeros(max_len, dtype=np.int64)
    leng = len(out_)
    if leng > max_len:
        padded = out_[:max_len]
    else:
        #print out_
        padded[(max_len-leng):]=out_
    return padded

In [79]:
class CustomDataset():
    
    def __init__(self,filepath):
        self.data = pd.read_csv(filepath)
        self.texts = self.data.iloc[:,3]
        self.labels = self.data.iloc[:,1]
        
    def __getitem__(self,index):
        text = self.texts[index]
        label = self.labels[index]
        text = np.array(prepare_data(text))
        label = np.array(label).reshape(-1)
        return text, label
        
    def __len__(self):
        return (self.data.shape[0])

In [80]:
train_loader = DataLoader(CustomDataset('./sentiment_analysis/train_dataset.csv'),
                         batch_size=64,
                         shuffle=False)
valid_loader = DataLoader(CustomDataset('./sentiment_analysis/valid_dataset.csv'),
                         batch_size=64,
                         shuffle=True)

test_loader = DataLoader(CustomDataset('./sentiment_analysis/test_dataset.csv'),
                         batch_size=64,
                         shuffle=True)

In [81]:
for i in train_loader:
    print i[0].shape
    print i[1].shape
    break

torch.Size([64, 35])
torch.Size([64, 1])


In [82]:
data_loaders = {"train": train_loader, "val": valid_loader}
data_lengths = {"train": train_loader.dataset.data.shape[0], "val": train_loader.dataset.data.shape[0]}

In [177]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    #emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    #emb_layer.load_state_dict({'weight': weights_matrix})
    weights_matrix = torch.FloatTensor(weights_matrix)
    emb_layer = nn.Embedding.from_pretrained(weights_matrix)
    if non_trainable:
        emb_layer.weight.requires_grad = False
    else:
        emb_layer.weight.requires_grad = True
        
    return emb_layer, num_embeddings, embedding_dim

class LSTM(nn.Module):
    
    def __init__(self,weights_matrix,num_out):
        super(LSTM,self).__init__()
        # define all the components that will be used in the NN (these can be reused)
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, False)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=100,batch_first =True)
        self.fc1 = nn.Linear(100,64)
        self.fc2 = nn.Linear(64,num_out)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self,x):
        #print type(x)
        #print x.shape
        out = self.embedding(x)
        #print out.shape
        out,(hn, cn) = self.lstm(out)
        out = self.fc1(out[:,-1,:])
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [178]:
lstm = LSTM(weights,1)
print lstm

LSTM(
  (embedding): Embedding(57411, 100)
  (lstm): LSTM(100, 100, batch_first=True)
  (fc1): Linear(in_features=100, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)


In [179]:
def trainable_params_(m):
    return [p for p in m.parameters() if p.requires_grad]

In [180]:
len(list(lstm.parameters()))

9

In [181]:
for name, param in lstm.named_parameters():
    if param.requires_grad:
        print name, param.size()

embedding.weight torch.Size([57411, 100])
lstm.weight_ih_l0 torch.Size([400, 100])
lstm.weight_hh_l0 torch.Size([400, 100])
lstm.bias_ih_l0 torch.Size([400])
lstm.bias_hh_l0 torch.Size([400])
fc1.weight torch.Size([64, 100])
fc1.bias torch.Size([64])
fc2.weight torch.Size([1, 64])
fc2.bias torch.Size([1])


In [182]:
for name, param in lstm.named_parameters():
    if param.requires_grad:
        if name == 'lstm.bias_ih_l0':
            fc1_w = param.data

In [183]:
fc1_w.size()

torch.Size([400])

In [184]:
torch.mean(fc1_w)

tensor(1.00000e-03 *
       1.8560)

In [185]:
torch.std(fc1_w)

tensor(1.00000e-02 *
       5.9771)

In [186]:
def variable_summaries(var):
    mean = torch.mean(var)
    stddev = torch.std(var)
    maximum = torch.max(var)
    minimum = torch.min(var)
    return mean, stddev, maximum, minimum

In [187]:
variable_summaries(fc1_w)

(tensor(1.00000e-03 *
        1.8560), tensor(1.00000e-02 *
        5.9771), tensor(1.00000e-02 *
        9.9816), tensor(1.00000e-02 *
        -9.9938))

In [188]:
for param in lstm.parameters():
    print(type(param.data), param.size())

(<class 'torch.Tensor'>, torch.Size([57411, 100]))
(<class 'torch.Tensor'>, torch.Size([400, 100]))
(<class 'torch.Tensor'>, torch.Size([400, 100]))
(<class 'torch.Tensor'>, torch.Size([400]))
(<class 'torch.Tensor'>, torch.Size([400]))
(<class 'torch.Tensor'>, torch.Size([64, 100]))
(<class 'torch.Tensor'>, torch.Size([64]))
(<class 'torch.Tensor'>, torch.Size([1, 64]))
(<class 'torch.Tensor'>, torch.Size([1]))


In [189]:
lstm.parameters()

<generator object parameters at 0x7f6967d7b370>

In [190]:
criterion = nn.BCELoss()
#optimizer = torch.optim.Adam(trainable_params_(lstm), lr=0.001)
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)

In [191]:
from tensorboardX import SummaryWriter

In [192]:
!rm ./sentiment_analysis/tensorboard/*

In [193]:
writer = SummaryWriter('./sentiment_analysis/tensorboard')
#writer2 = SummaryWriter('./sentiment_analysis/tensorboard')

In [194]:
num_epoch =5
train_losses = []
valid_losses = []
steps = 1

for epoch in range( num_epoch):
    
    print('Epoch {}/{}'.format(epoch, num_epoch- 1))
    print('-' * 10)
    
    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
        if phase == 'train':
            lstm.train() # Set model to training mode
        else:
            lstm.eval() # Set model to evaluate mode
        
        running_loss = 0.0
        # Iterate over data.
        
        for i, (texts,labels) in enumerate(data_loaders[phase]):
            # torch can only train on Variable, so convert them to Variable
            texts = Variable(texts)
            labels = Variable(labels)

            outputs = lstm(texts) # Forward pass: compute the output class given a image
            loss = criterion(outputs,labels.float()) # Compute the loss: difference between the output class and the pre-given label
            
            ##########Tensorboard#############################
            ##################################################
            if phase=='train':
                writer.add_scalar("loss",loss, steps)
                for name, param in lstm.named_parameters():
                    mean_,stddev_,max_,min_ = variable_summaries(param.data)
                    writer.add_scalar(name+ "_mean",mean_, steps)
                    writer.add_scalar(name+ "_stddev",stddev_, steps)
                    writer.add_scalar(name+ "_max",max_, steps)
                    writer.add_scalar(name+ "_min",min_, steps)
                    #writer.add_histogram(name + "_hist",param.data.numpy(),steps)
                steps+=1
                
            
            ###################################################
            ###################################################
            optimizer.zero_grad() # clear gradients for next train
            if phase == 'train':
                loss.backward() # backpropagation, compute gradients
                optimizer.step() # apply gradients  and update the weights of hidden nodes
                
            running_loss += loss.data * texts.size(0)
            
            if phase == 'train':
                if (i+1) % 100 == 0 :
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %(epoch+1, num_epoch, i+1, data_lengths[phase]//64, loss.data))
                    
        epoch_loss = running_loss / data_lengths[phase]
        if phase =='train':
            train_losses.append(epoch_loss)
            writer.add_scalar("total_train_loss",epoch_loss, epoch+1)
        if phase =='val':
            valid_losses.append(epoch_loss)
            
            #####################Tensorboard###################################
            ###################################################################
            writer.add_scalar("total/valid_loss",epoch_loss, epoch+1)
            for name, param in lstm.named_parameters():
                    mean_,stddev_,max_,min_ = variable_summaries(param.data)
                    writer.add_scalar("total/"+name+ "_mean",mean_, epoch+1)
                    writer.add_scalar("total/"+name+ "_stddev",stddev_, epoch+1)
                    writer.add_scalar("total/"+name+ "_max",max_, epoch+1)
                    writer.add_scalar("total/"+name+ "_min",min_, epoch+1)
                    writer.add_histogram("total/"+name + "_hist",param.data.numpy(),epoch+1)
        print('Epoch [{}/{}]{} Loss: {:.4f}'.format(epoch+1, num_epoch,phase, epoch_loss))    

Epoch 0/4
----------
Epoch [1/5], Step [100/15786], Loss: 0.5198
Epoch [1/5], Step [200/15786], Loss: 0.2418
Epoch [1/5], Step [300/15786], Loss: 0.3028
Epoch [1/5], Step [400/15786], Loss: 0.3268
Epoch [1/5], Step [500/15786], Loss: 0.2768
Epoch [1/5], Step [600/15786], Loss: 0.3599
Epoch [1/5], Step [700/15786], Loss: 0.2390
Epoch [1/5], Step [800/15786], Loss: 0.2462
Epoch [1/5], Step [900/15786], Loss: 0.2047
Epoch [1/5], Step [1000/15786], Loss: 0.2508
Epoch [1/5], Step [1100/15786], Loss: 0.2929
Epoch [1/5], Step [1200/15786], Loss: 0.1269
Epoch [1/5], Step [1300/15786], Loss: 0.2315
Epoch [1/5], Step [1400/15786], Loss: 0.2264
Epoch [1/5], Step [1500/15786], Loss: 0.2999
Epoch [1/5], Step [1600/15786], Loss: 0.2365
Epoch [1/5], Step [1700/15786], Loss: 0.3318
Epoch [1/5], Step [1800/15786], Loss: 0.3992
Epoch [1/5], Step [1900/15786], Loss: 0.2566
Epoch [1/5], Step [2000/15786], Loss: 0.3147
Epoch [1/5], Step [2100/15786], Loss: 0.3276
Epoch [1/5], Step [2200/15786], Loss: 0.271

Epoch [2/5], Step [2300/15786], Loss: 0.2619
Epoch [2/5], Step [2400/15786], Loss: 0.2927
Epoch [2/5], Step [2500/15786], Loss: 0.2610
Epoch [2/5], Step [2600/15786], Loss: 0.2182
Epoch [2/5], Step [2700/15786], Loss: 0.2429
Epoch [2/5], Step [2800/15786], Loss: 0.3634
Epoch [2/5], Step [2900/15786], Loss: 0.2506
Epoch [2/5], Step [3000/15786], Loss: 0.3311
Epoch [2/5], Step [3100/15786], Loss: 0.3164
Epoch [2/5], Step [3200/15786], Loss: 0.4507
Epoch [2/5], Step [3300/15786], Loss: 0.2713
Epoch [2/5], Step [3400/15786], Loss: 0.3180
Epoch [2/5], Step [3500/15786], Loss: 0.3495
Epoch [2/5], Step [3600/15786], Loss: 0.2615
Epoch [2/5], Step [3700/15786], Loss: 0.3603
Epoch [2/5], Step [3800/15786], Loss: 0.2140
Epoch [2/5], Step [3900/15786], Loss: 0.1994
Epoch [2/5], Step [4000/15786], Loss: 0.3221
Epoch [2/5], Step [4100/15786], Loss: 0.2350
Epoch [2/5], Step [4200/15786], Loss: 0.2107
Epoch [2/5], Step [4300/15786], Loss: 0.3136
Epoch [2/5], Step [4400/15786], Loss: 0.3863
Epoch [2/5

Epoch [3/5], Step [4600/15786], Loss: 0.2052
Epoch [3/5], Step [4700/15786], Loss: 0.3359
Epoch [3/5], Step [4800/15786], Loss: 0.2469
Epoch [3/5], Step [4900/15786], Loss: 0.2582
Epoch [3/5], Step [5000/15786], Loss: 0.2751
Epoch [3/5], Step [5100/15786], Loss: 0.1897
Epoch [3/5], Step [5200/15786], Loss: 0.2502
Epoch [3/5], Step [5300/15786], Loss: 0.3465
Epoch [3/5], Step [5400/15786], Loss: 0.3366
Epoch [3/5], Step [5500/15786], Loss: 0.4795
Epoch [3/5], Step [5600/15786], Loss: 0.2720
Epoch [3/5], Step [5700/15786], Loss: 0.2044
Epoch [3/5], Step [5800/15786], Loss: 0.1939
Epoch [3/5], Step [5900/15786], Loss: 0.2500
Epoch [3/5], Step [6000/15786], Loss: 0.3180
Epoch [3/5], Step [6100/15786], Loss: 0.2692
Epoch [3/5], Step [6200/15786], Loss: 0.1732
Epoch [3/5], Step [6300/15786], Loss: 0.2901
Epoch [3/5], Step [6400/15786], Loss: 0.4410
Epoch [3/5], Step [6500/15786], Loss: 0.2406
Epoch [3/5], Step [6600/15786], Loss: 0.1949
Epoch [3/5], Step [6700/15786], Loss: 0.2428
Epoch [3/5

Epoch [4/5], Step [6900/15786], Loss: 0.1952
Epoch [4/5], Step [7000/15786], Loss: 0.2663
Epoch [4/5], Step [7100/15786], Loss: 0.3083
Epoch [4/5], Step [7200/15786], Loss: 0.1643
Epoch [4/5], Step [7300/15786], Loss: 0.3052
Epoch [4/5], Step [7400/15786], Loss: 0.1959
Epoch [4/5], Step [7500/15786], Loss: 0.1156
Epoch [4/5], Step [7600/15786], Loss: 0.1624
Epoch [4/5], Step [7700/15786], Loss: 0.1954
Epoch [4/5], Step [7800/15786], Loss: 0.2663
Epoch [4/5], Step [7900/15786], Loss: 0.2692
Epoch [4/5], Step [8000/15786], Loss: 0.2899
Epoch [4/5], Step [8100/15786], Loss: 0.2858
Epoch [4/5], Step [8200/15786], Loss: 0.2006
Epoch [4/5], Step [8300/15786], Loss: 0.1971
Epoch [4/5], Step [8400/15786], Loss: 0.2395
Epoch [4/5], Step [8500/15786], Loss: 0.1865
Epoch [4/5], Step [8600/15786], Loss: 0.1667
Epoch [4/5], Step [8700/15786], Loss: 0.1825
Epoch [4/5], Step [8800/15786], Loss: 0.3348
Epoch [4/5], Step [8900/15786], Loss: 0.3402
Epoch [4/5], Step [9000/15786], Loss: 0.3465
Epoch [4/5

Epoch [5/5], Step [9200/15786], Loss: 0.3657
Epoch [5/5], Step [9300/15786], Loss: 0.0821
Epoch [5/5], Step [9400/15786], Loss: 0.2301
Epoch [5/5], Step [9500/15786], Loss: 0.2694
Epoch [5/5], Step [9600/15786], Loss: 0.3423
Epoch [5/5], Step [9700/15786], Loss: 0.1726
Epoch [5/5], Step [9800/15786], Loss: 0.1893
Epoch [5/5], Step [9900/15786], Loss: 0.1726
Epoch [5/5], Step [10000/15786], Loss: 0.0788
Epoch [5/5], Step [10100/15786], Loss: 0.2667
Epoch [5/5], Step [10200/15786], Loss: 0.1158
Epoch [5/5], Step [10300/15786], Loss: 0.1373
Epoch [5/5], Step [10400/15786], Loss: 0.1821
Epoch [5/5], Step [10500/15786], Loss: 0.2050
Epoch [5/5], Step [10600/15786], Loss: 0.2723
Epoch [5/5], Step [10700/15786], Loss: 0.2395
Epoch [5/5], Step [10800/15786], Loss: 0.2609
Epoch [5/5], Step [10900/15786], Loss: 0.2562
Epoch [5/5], Step [11000/15786], Loss: 0.1814
Epoch [5/5], Step [11100/15786], Loss: 0.1248
Epoch [5/5], Step [11200/15786], Loss: 0.1639
Epoch [5/5], Step [11300/15786], Loss: 0.2

In [213]:
lstm.eval()
correct = 0
total = 0
for texts, labels in test_loader:
    labels = labels.float()
    texts = Variable(texts)
    outputs = lstm(texts)
    predicted = (outputs.data >0.5).float()
    total += labels.size(0)
    correct += (predicted == labels).sum()

print('Test Accuracy of the model on the 315723 test texts: %.4f %%' % (100 * correct / total))

Test Accuracy of the model on the 315723 test texts: 80.0000 %
