In [27]:
import pandas as pd
import numpy as np
from collections import Counter
import statistics
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

#!conda install pytorch torchvision cpuonly -c pytorch
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

In [2]:
train = pd.read_csv('comments_preprocessed_1.csv', nrows = 2000, index_col = 'id')
X = train.comment_text
y = train.target

In [3]:
train.head(5)

Unnamed: 0_level_0,comment_text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
59848,this be so cool -PRON- be like ' would -PRON- ...,0
59849,thank -PRON- this would make -PRON- life a lot...,0
59852,this be such an urgent design problem kudo to ...,0
59855,be this something i will be able to install on...,0
59856,haha -PRON- guy be a bunch of loser,1


In [4]:
vectorizer = CountVectorizer(stop_words = 'english', lowercase = True, ngram_range = (1,1))
X_train_vec = vectorizer.fit_transform(X)
embedding = X_train_vec.toarray()
embedding.shape

(2000, 8104)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(embedding, np.array(y), test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1600, 8104) (400, 8104) (1600,) (400,)


In [34]:
# create TensorDatasets
trainset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
testset = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# dataloaders
batch_size = 20 
trainLoader = DataLoader(trainset, shuffle=True, batch_size=batch_size)
testLoader = DataLoader(testset, shuffle=True, batch_size=batch_size)

In [35]:
# obtain one batch of training data
dataiter = iter(trainLoader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([20, 8104])
Sample input: 
 tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

Sample label size:  torch.Size([20])
Sample label: 
 tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [19]:
class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
#         if (train_on_gpu):
#             hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
#                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
#         else:
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [35]:
embedding.shape[1]

8104

In [38]:
# instantiate the network

# vocab_to_int;Size of our vocabulary;+1 for the 0 padding
n_vocab = embedding.shape[1] +1

# Size of our desired output; the number of class scores we want to output (toxic/non-toxic)
output_size = 1

# Number of columns in the embedding lookup table; size of our embeddings.
embedding_dim = embedding.shape[1] # The number of expected features in the input x

hidden_dim = 128 # number of features in the hidden state h
n_layers = 2 # Number of recurrent layers

net = SentimentRNN(n_vocab, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(2000, 2000)
  (lstm): LSTM(2000, 128, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [39]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [None]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

In [None]:
# training params

epochs = 3 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# # move model to GPU, if available
# if(train_on_gpu):
#     net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in trainLoader:
        counter += 1

#         if(train_on_gpu):
#             inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in testLoader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

#                 if(train_on_gpu):
#                     inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 2/3... Step: 100... Loss: 0.167544... Val Loss: 0.241718


reference:
https://github.com/lamiaehana/Projects/tree/master/Sentiment_Analysis_rnn_LSTM