In [1]:
import pandas as pd
import numpy as np
import torch
%run DataPrep.ipynb
%run SentimentCNN_model.ipynb
%run TrainTestSentimentCNN.ipynb
from sklearn.model_selection import train_test_split

CUDA is not available.  Training on CPU ...


In [2]:
# Load the combined reviews CSV (latin-1 encoded); row 0 supplies the column names.
reviews_data = pd.read_csv(
    'Data/combined_reviews_data.csv',
    encoding='latin-1',
    header=0,
)

### Preprocess and Prepare Data

#### Initial preprocessing

In [3]:
# Pull the 'review' column out of the DataFrame as a plain Python list.
reviews_list = reviews_data['review'].tolist()

In [4]:
# Run every raw review through preprocess(), keeping the original order.
preprocessed_reviews_list = list(map(preprocess, reviews_list))

In [5]:
# Join all preprocessed reviews into one space-separated string and hand the
# whole corpus to tokenize_lemmatize() in a single call.
tokenized_lemmatized_text = tokenize_lemmatize(
    ' '.join(preprocessed_reviews_list)
)

#### Encoding words

In [6]:
# Build the lookup tables from the corpus tokens. Judging by the names, these
# are word frequencies, a word -> integer map and an integer -> word map
# (NOTE(review): confirm against create_lookup_tables in DataPrep).
(word_count, vocab_int, int_vocab) = create_lookup_tables(tokenized_lemmatized_text)

In [7]:
# Encode each review as the list of vocabulary ids of its tokens.
reviews_ints = [
    [vocab_int[word] for word in tokenize_lemmatize(review)]
    for review in preprocessed_reviews_list
]

#### Padding Sequences

In [12]:
# Labels: the 'sentiment' column as a NumPy array.
sentiments = np.array(reviews_data['sentiment'])

# Fixed sequence length handed to pad_features for every encoded review.
seq_length = 40
features = pad_features(reviews_ints, seq_length)

In [13]:
split_frac = 0.9

# Sequential hold-out split: the first `split_frac` of the rows becomes the
# training set and the remaining tail is cut in half into validation and test.
# NOTE(review): rows are taken in file order without shuffling; if the CSV is
# grouped by source or label, consider a shuffled split instead (sklearn's
# train_test_split is already imported at the top of the notebook).
split_idx = int(len(features) * split_frac)
train_x, train_y = features[:split_idx], sentiments[:split_idx]
remaining_x, remaining_y = features[split_idx:], sentiments[split_idx:]

test_idx = int(len(remaining_x) * 0.5)
val_x, val_y = remaining_x[:test_idx], remaining_y[:test_idx]
test_x, test_y = remaining_x[test_idx:], remaining_y[test_idx:]

# Report the shape of each resulting feature matrix.
print("\t\t\tFeature Shapes:")
print(f"Train set: \t\t{train_x.shape}",
      f"\nValidation set: \t{val_x.shape}",
      f"\nTest set: \t\t{test_x.shape}")

			Feature Shapes:
Train set: 		(67500, 40) 
Validation set: 	(3750, 40) 
Test set: 		(3750, 40)


In [14]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Wrap each (features, labels) pair of NumPy arrays in a TensorDataset.
train_data, valid_data, test_data = (
    TensorDataset(torch.from_numpy(x_split), torch.from_numpy(y_split))
    for x_split, y_split in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

# Mini-batch size shared by all three loaders.
batch_size = 50

# Every loader (training, validation and test) shuffles its data.
train_loader, valid_loader, test_loader = (
    DataLoader(dataset, shuffle=True, batch_size=batch_size)
    for dataset in (train_data, valid_data, test_data)
)

In [15]:
# Grab one batch from the training loader (e.g. for the shape walkthrough in
# the commented-out cell further down).
# The built-in next() is used instead of the Python-2 style `.next()` method,
# which DataLoader iterators no longer provide in recent PyTorch releases.
dataiter = iter(train_loader)
sample = next(dataiter)

#### Setting Hyperparameter values

In [16]:
# Model hyperparameters.
embedding_size = 300  # dimensionality of each word embedding
output_size = 1       # one output value per review

# Number of rows in the embedding table.
# NOTE(review): no "+ 1" for a padding index is added here; confirm that the
# ids produced by create_lookup_tables all fit in range(len(vocab_int)).
vocab_size = len(vocab_int)

In [17]:
# embed = nn.Embedding(vocab_size, embedding_size)
# e = embed(sample[0]).reshape(50, seq_length, embedding_size)
# conv1 = nn.Conv1d(40, 64, 3)
# x = F.relu(conv1(e))
# print("conv1: ", x.shape)
# conv2 = nn.Conv1d(64, 32, 3)
# x = F.relu(conv2(x))
# print("conv2: ",x.shape)
# pool1 = nn.MaxPool1d(3,3)
# x = pool1(F.relu(x))
# print("pool1: ",x.shape)
# conv3 = nn.Conv1d(32, 16, 3)
# x = conv3(x)
# print("conv3: ",x.shape)
# conv4 = nn.Conv1d(16, 8, 3)
# x = conv4(x)
# print("conv4: ",x.shape)
# avgpool = nn.AvgPool1d(94)
# x = avgpool(x)
# print("avgpool: ",x.shape)
# x = x.view(50, -1)
# print("flattened: ", x.shape)
# fc = nn.Linear(8, 1)
# x = fc(x)
# print("final output shape", x.shape)

#### Instantiate the model with its parameters (currently only works with seq_length = 40, batch_size = 50 and output_size = 1)

In [18]:
# Build the CNN from the hyperparameters defined above and print its layers.
model = SentimentCNN(
    vocab_size,
    output_size,
    embedding_size,
    batch_size,
    seq_length,
)
print(model)

SentimentCNN(
  (embedding): Embedding(139579, 300)
  (conv1): Conv1d(40, 64, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(64, 32, kernel_size=(3,), stride=(1,))
  (pool1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 16, kernel_size=(3,), stride=(1,))
  (conv4): Conv1d(16, 8, kernel_size=(3,), stride=(1,))
  (avgpool): AvgPool1d(kernel_size=(94,), stride=(94,), padding=(0,))
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=8, out_features=1, bias=True)
  (sig): Sigmoid()
)


### Train or Test the Model

#### Specifying learning_rate, Loss functions and Optimizer for training the model

In [19]:
# Loss function used for training.
criterion = nn.BCELoss()

# Adam optimizer over all model parameters, stepping with learning rate `lr`.
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#### Import the train model function from TrainTestSentimentCNN ipynb

##### TRAIN MODEL: UNCOMMENT CELL BELOW TO RUN TRAINING PROCEDURE. 
The current model trains for 5 epochs and saves its parameters whenever the validation loss reaches a new minimum at the end of an epoch. The saved, pre-trained model can then be used for inference.

In [20]:
# train_sentimentCNN(model, train_loader, valid_loader, criterion, optimizer, lr, save_model_as='Sentiment_CNN_v4')

Epoch: 1 	Training Loss: 0.583905 	Validation Loss: 0.466764
Validation loss decreased (inf --> 0.466764).  Saving model ...
Epoch: 2 	Training Loss: 0.422067 	Validation Loss: 0.432903
Validation loss decreased (0.466764 --> 0.432903).  Saving model ...
Epoch: 3 	Training Loss: 0.338804 	Validation Loss: 0.410755
Validation loss decreased (0.432903 --> 0.410755).  Saving model ...
Epoch: 4 	Training Loss: 0.253583 	Validation Loss: 0.385520
Validation loss decreased (0.410755 --> 0.385520).  Saving model ...
Epoch: 5 	Training Loss: 0.179707 	Validation Loss: 0.399031


<All keys matched successfully>

In [21]:
# Load the trained parameters/weights into the model.
# map_location='cpu' lets a checkpoint that was saved on a GPU machine be
# restored on a CPU-only kernel; load_state_dict then copies the values into
# the model's existing parameters.
model.load_state_dict(torch.load('Sentiment_CNN_v4.pt', map_location='cpu'))

<All keys matched successfully>

##### TEST MODEL: UNCOMMENT CELL BELOW TO RUN TEST PROCEDURE. 

In [2]:
# test_sentimentCNN(model, test_loader, criterion)

### Inference 
Looking at the model's performance against any input text.

In [23]:
# Two hand-written example reviews used to try out the trained model.
new_texts = [
    "poor quality signal given by the device package did not arrive I am not happy with this",
    "I actually liked that part of the feature It was surprising in a good way and I will surely go back again",
]

In [24]:
def predict(model, new_texts, sequence_length=40, batch_size=50):
    """Classify each text in ``new_texts`` as "positive" or "negative".

    The texts are encoded with ``word_to_int`` / ``vocab_int`` and padded with
    ``pad_features`` (all defined elsewhere in the notebook), then fed to
    ``model`` in chunks of exactly ``batch_size`` rows. A short final chunk is
    topped up with all-zero rows, whose predictions are discarded.

    Parameters
    ----------
    model : the trained network; called as ``model(tensor)`` with a
        ``(batch_size, sequence_length)`` integer tensor.
    new_texts : sequence of str
        Texts to classify. Any number of texts is accepted; nothing is
        silently truncated.
    sequence_length : int, default 40
        Sequence length passed to ``pad_features``.
    batch_size : int, default 50
        Number of rows per forward pass. Should match the batch size the
        model was built with.

    Returns
    -------
    dict
        Maps each input text to "positive" (rounded model output equal to 1)
        or "negative". Duplicate texts share a single key.
    """
    if len(new_texts) == 0:
        return {}

    model.eval()

    # encode each text as a list of integers, then pad to a fixed length
    new_texts_ints = [word_to_int(text, vocab_int) for text in new_texts]
    features = pad_features(new_texts_ints, sequence_length)

    model_response = []
    # inference only: no autograd graph is needed
    with torch.no_grad():
        for start in range(0, len(features), batch_size):
            chunk = features[start:start + batch_size]
            n_real = chunk.shape[0]
            if n_real < batch_size:
                # top the chunk up to a full batch with all-zero rows
                filler = np.zeros((batch_size - n_real, sequence_length),
                                  dtype=chunk.dtype)
                chunk = np.concatenate((chunk, filler), axis=0)

            output = model(torch.from_numpy(chunk))
            # round the outputs to 0/1 and keep only the rows of real inputs
            rounded = torch.round(output.reshape(-1))[:n_real]
            model_response.extend(
                "positive" if value == 1 else "negative"
                for value in rounded.tolist()
            )

    return dict(zip(new_texts, model_response))

In [25]:
# Classify the example texts; the resulting dict is displayed as the cell output.
predict(model=model, new_texts=new_texts)

{'poor quality signal given by the device package did not arrive I am not happy with this': 'negative',
 'I actually liked that part of the feature It was surprising in a good way and I will surely go back again': 'positive'}