In [2]:
import pandas as pd
import numpy as np
import torch
%run Utils.ipynb
%run models/SentimentCNN_model.ipynb
%run TrainTestSentimentCNN.ipynb
from sklearn.model_selection import train_test_split

CUDA is not available.  Training on CPU ...


In [3]:
# Load the raw Amazon review polarity training split.
# NOTE(review): no header/names arguments are given, so pandas treats the first line of the
# file as column names. If train.csv has no header row, that first review is silently lost
# — confirm against the file and pass header=None if so.
reviews_data = pd.read_csv('Data/amazon_review_polarity/train.csv', encoding='latin-1')

In [3]:
reviews_data.columns = ['sentiment', 'title', 'review']

In [4]:
def recode(val):
    """Map a raw class label onto a binary target.

    A raw label equal to 1 becomes 0; every other value becomes 1.
    """
    return 0 if val == 1 else 1

In [5]:
# Binarise the raw class labels: 1 -> 0, anything else -> 1.
# NOTE: running this cell a second time flips the labels (0 -> 1, 1 -> 0), so execute it once.
reviews_data['sentiment'] = reviews_data['sentiment'].apply(recode)

In [6]:
# Draw a balanced subset: one million reviews per class.
# A fixed random_state makes the subset (and every artifact derived from it, such as the
# saved vocabulary and tensors) reproducible across kernel restarts.
pos_reviews = reviews_data[reviews_data['sentiment'] == 1].sample(1000000, random_state=42)
neg_reviews = reviews_data[reviews_data['sentiment'] == 0].sample(1000000, random_state=42)

In [7]:
# Merge both classes and shuffle the rows. The later train/validation/test split is purely
# positional, so the shuffle is seeded to keep that split reproducible.
reviews_data = pd.concat([pos_reviews, neg_reviews], axis=0).sample(frac=1, random_state=42)

In [8]:
# Discard every row that has a missing value in any of its columns.
reviews_data = reviews_data.dropna(axis=0, how='any')

In [9]:
reviews_data.shape

(1999952, 3)

In [10]:
# # Loading in tensors for Amazon review sub-dataset.
# import torch
# from torch.utils.data import TensorDataset, DataLoader

# ##### Saved data for 200k records, batch_size=250, embedding_size=300, seq_length=30, vocab_size=321951 
# batch_size = 1024
# vocab_size = 720628

# train_data = TensorDataset(torch.load("Data/amazon_review_polarity/Amazon_polarity_subset800k_trainX.pt"), 
#                            torch.load("Data/amazon_review_polarity/Amazon_polarity_subset800k_trainy.pt"))
# train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

# valid_data = TensorDataset(torch.load("Data/amazon_review_polarity/Amazon_polarity_subset800k_valX.pt"),
#                            torch.load("Data/amazon_review_polarity/Amazon_polarity_subset800k_valy.pt"))
# valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

# test_data = TensorDataset(torch.load("Data/amazon_review_polarity/Amazon_polarity_subset800k_testX.pt"), 
#                           torch.load("Data/amazon_review_polarity/Amazon_polarity_subset800k_testy.pt"))
# test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

### Preprocess and Prepare Data

#### Initial preprocessing

In [11]:
 # get reviews in a list from the pd.Series
reviews_list = reviews_data['review'].to_list()

In [12]:
senti_list = reviews_data['sentiment'].to_list()

In [13]:
# preprocess the reviews in accordance to the preprocess function.
preprocessed_reviews_list = [preprocess(review) for review in reviews_list]

In [14]:
# Flatten the whole corpus into one list of whitespace-separated tokens.
# No stemming or lemmatization is applied at this step.
tokenized_text = [token for review in preprocessed_reviews_list for token in review.split()]

#### Encoding words

In [15]:
# NO LEMMATIZATION: creating look up table for encoding words to integers, while getting back their frequency or occurrence.
word_count, vocab_int, int_vocab = create_lookup_tables(tokenized_text)

In [16]:
import json

# Persist both lookup tables so that inference can reuse the exact same encoding.
# NOTE(review): JSON object keys are always strings, so if int_vocab is keyed by integers
# the keys come back as strings when the file is reloaded — confirm at the load site.
for table, json_path in (
    (vocab_int, 'Data/amazon_review_polarity/Amazon_polarity_subset2m_vocab_to_int.json'),
    (int_vocab, 'Data/amazon_review_polarity/Amazon_polarity_subset2m_int_to_vocab.json'),
):
    with open(json_path, 'w') as out_file:
        json.dump(table, out_file)

In [17]:
# NO LEMMATIZATION: encode each preprocessed review as the list of integer ids of its words.
reviews_ints = [
    [vocab_int[word] for word in text.split()]
    for text in preprocessed_reviews_list
]

In [18]:
# Positions of the reviews that contain no tokens at all after encoding.
indices_to_drop = [pos for pos, encoded in enumerate(reviews_ints) if not encoded]
indices_to_drop

[79221, 131192, 140902, 698488, 768026, 1228268]

In [19]:
# Remove the empty reviews together with their labels so both lists stay aligned.
# Membership is tested against a set, which keeps the filter linear in the number of reviews
# regardless of how many positions have to be dropped.
# NOTE: the positions refer to the lists as they were when indices_to_drop was computed;
# re-running this cell on its own would drop the wrong rows.
_drop_positions = set(indices_to_drop)
reviews_ints = [review for i, review in enumerate(reviews_ints) if i not in _drop_positions]
senti_list = [senti for i, senti in enumerate(senti_list) if i not in _drop_positions]

In [20]:
# senti_list = senti_list[:799744] # make it divisible by batch size 250
# reviews_ints = reviews_ints[:799744] # make it divisible by batch size 250
print(len(senti_list))
print(len(reviews_ints))

1999946
1999946


### Padding Sequences

##### Setting up hyperparameters first

In [21]:
# Summary statistics of the encoded review lengths (number of tokens per review).
review_lens = [len(review) for review in reviews_ints]
_review_lens_arr = np.array(review_lens)
review_len_mean = _review_lens_arr.mean()
review_len_std = _review_lens_arr.std()
for _stat in (review_len_mean, review_len_std, max(review_lens)):
    print(_stat)

73.39462765494669
41.90018748302791
240


In [22]:
## Parameters
# vocab_int only exists when the vocabulary was built in this session. If it is missing,
# a vocab_size that was set by hand earlier is kept as-is. Only NameError is handled, so
# any other failure (e.g. vocab_int having an unexpected type) surfaces instead of being
# silently swallowed.
try:
    vocab_size = len(vocab_int) + 1 # for the 0 padding + our word tokens
except NameError:
    if 'vocab_size' not in globals():
        raise  # neither vocab_int nor a preset vocab_size is available

embedding_size = 300
# seq_length = int(review_len_mean + 2*(review_len_std))
seq_length = 250
output_size = 1

In [23]:
features = pad_features(reviews_ints, seq_length)
sentiments = np.array(senti_list)

In [24]:
print(sentiments.shape)
print(features.shape)

(1999946,)
(1999946, 250)


In [25]:
# Keep the first 95% of the rows for training and divide the remaining hold-out evenly
# into a validation half and a test half. The split is purely positional.
split_frac = 0.95

# rows before split_idx are used for training; everything after is the hold-out
split_idx = int(len(features)*split_frac)
train_x, train_y = features[:split_idx], sentiments[:split_idx]
remaining_x, remaining_y = features[split_idx:], sentiments[split_idx:]

# first half of the hold-out -> validation, second half -> test
test_idx = int(len(remaining_x)*0.5)
val_x, val_y = remaining_x[:test_idx], remaining_y[:test_idx]
test_x, test_y = remaining_x[test_idx:], remaining_y[test_idx:]

# report the shape of each feature split
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(1899948, 250) 
Validation set: 	(49999, 250) 
Test set: 		(49999, 250)


In [26]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Wrap each (features, labels) pair of numpy arrays in a TensorDataset.
train_data, valid_data, test_data = (
    TensorDataset(torch.from_numpy(split_x), torch.from_numpy(split_y))
    for split_x, split_y in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

# Number of reviews per batch for all three loaders.
batch_size = 4096

# Every loader (training, validation and test) reshuffles its data on each pass.
train_loader, valid_loader, test_loader = (
    DataLoader(dataset, shuffle=True, batch_size=batch_size)
    for dataset in (train_data, valid_data, test_data)
)

In [27]:
# Persist every split as a tensor file so that later runs can load them directly.
_tensor_files = (
    (train_x, "Data/amazon_review_polarity/Amazon_polarity_subset2m_trainX.pt"),
    (train_y, "Data/amazon_review_polarity/Amazon_polarity_subset2m_trainy.pt"),
    (val_x, "Data/amazon_review_polarity/Amazon_polarity_subset2m_valX.pt"),
    (val_y, "Data/amazon_review_polarity/Amazon_polarity_subset2m_valy.pt"),
    (test_x, "Data/amazon_review_polarity/Amazon_polarity_subset2m_testX.pt"),
    (test_y, "Data/amazon_review_polarity/Amazon_polarity_subset2m_testy.pt"),
)
for _array, _tensor_path in _tensor_files:
    torch.save(torch.from_numpy(_array), _tensor_path)

In [28]:
# Pull a single batch from the training loader to sanity-check its output.
# The built-in next() is used because it works with every iterator; the iterator's own
# .next() method is not available in current PyTorch releases.
dataiter = iter(train_loader)
sample = next(dataiter)

In [29]:
sample[0].shape

torch.Size([4096, 250])

#### Setting Hyperparameter values

In [30]:
# embed = nn.Embedding(vocab_size, embedding_size)
# e = embed(sample[0]).reshape(batch_size, seq_length, embedding_size)

# conv = nn.Conv1d(seq_length, 256, 3)
# x = F.relu(conv(e))
# print("init conv: ", conv)

# conv1 = nn.Conv1d(256, 128, 3)
# x = F.relu(conv1(x))
# print("conv1: ", x.shape)

# pool = nn.MaxPool1d(3, 3)
# x = pool1(F.relu(x))
# print("pool: ", x.shape)

# conv2 = nn.Conv1d(128, 64, 3)
# x = F.relu(conv2(x))
# print("conv2: ",x.shape)

# conv3 = nn.Conv1d(64, 32, 3)
# x = conv3(x)
# print("conv3: ",x.shape)

# pool1 = nn.MaxPool1d(3,3)
# x = pool1(F.relu(x))
# print("pool1: ",x.shape)

# conv4 = nn.Conv1d(32, 16, 3)
# x = conv4(x)
# print("conv4: ",x.shape)

# conv5 = nn.Conv1d(16, 8, 3)
# x = conv5(x)
# print("conv5: ", x.shape)

# avgpool = nn.AvgPool1d(27)
# x = avgpool(x)
# print("avgpool: ",x.shape)

# x = x.view(batch_size, -1)
# print("flattened: ", x.shape)

# fc = nn.Linear(8, 1)
# x = fc(x)
# print("final output shape", x.shape)

#### Instantiate model with parameters (originally developed with seq_length = 40, batch_size = 50 and output_size = 1; this run uses seq_length = 250, batch_size = 4096 and output_size = 1)

In [31]:
model = SentimentCNN(vocab_size, output_size, embedding_size, batch_size, seq_length)
print(model)

SentimentCNN(
  (embedding): Embedding(1371176, 300)
  (conv1): Conv1d(250, 64, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(64, 32, kernel_size=(3,), stride=(1,))
  (pool1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 16, kernel_size=(3,), stride=(1,))
  (conv4): Conv1d(16, 8, kernel_size=(3,), stride=(1,))
  (avgpool): AvgPool1d(kernel_size=(94,), stride=(94,), padding=(0,))
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=8, out_features=1, bias=True)
  (sig): Sigmoid()
)


### Train or Test the Model

#### Specifying learning_rate, Loss functions and Optimizer for training the model

In [32]:
# loss and optimization functions
lr=0.001 # learning rate to be used for the optimizer.

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#### Import the train model function from TrainTestSentimentCNN ipynb

##### TRAIN MODEL: RUN THE CELL BELOW TO START THE TRAINING PROCEDURE (COMMENT IT OUT TO SKIP TRAINING).
The model trains for 5 epochs and saves its parameters whenever the validation loss reaches a new minimum at the end of an epoch. The saved model can then be loaded and used for inference.

In [33]:
train_sentimentCNN(model, train_loader, valid_loader, criterion, optimizer, lr, save_model_as='Sentiment_CNN_subset2m_amazon_pol', n_epochs=5)

  0%|          | 0/5 [00:00<?, ?it/s]

train fail index:  464
Epoch 1 completed in: 102.866  minutes
valid fail index:  13
Epoch: 1 	Training Loss: 0.476961 	Validation Loss: 0.318019
Validation loss decreased (inf --> 0.318019).  Saving model ...


 20%|██        | 1/5 [1:43:20<6:53:21, 6200.31s/it]

train fail index:  464
Epoch 2 completed in: 101.846  minutes
valid fail index:  13
Epoch: 2 	Training Loss: 0.319489 	Validation Loss: 0.289868
Validation loss decreased (0.318019 --> 0.289868).  Saving model ...


 40%|████      | 2/5 [3:25:39<5:08:13, 6164.40s/it]

train fail index:  464
Epoch 3 completed in: 106.063  minutes
valid fail index:  13
Epoch: 3 	Training Loss: 0.292132 	Validation Loss: 0.281954
Validation loss decreased (0.289868 --> 0.281954).  Saving model ...


 60%|██████    | 3/5 [5:12:11<3:28:56, 6268.25s/it]

train fail index:  464
Epoch 4 completed in: 102.120  minutes
valid fail index:  13
Epoch: 4 	Training Loss: 0.278135 	Validation Loss: 0.278976
Validation loss decreased (0.281954 --> 0.278976).  Saving model ...


 80%|████████  | 4/5 [6:54:46<1:43:43, 6223.43s/it]

train fail index:  464
Epoch 5 completed in: 105.971  minutes


100%|██████████| 5/5 [8:41:08<00:00, 6253.67s/it]  

valid fail index:  13
Epoch: 5 	Training Loss: 0.266269 	Validation Loss: 0.288501





In [34]:
# load the model with the trained parameters/weight that performed best in validation.
model.load_state_dict(torch.load('Sentiment_CNN_subset2m_amazon_pol.pt'))

<All keys matched successfully>

##### TEST MODEL: UNCOMMENT CELL BELOW TO RUN TEST PROCEDURE. 

In [35]:
print(seq_length)

250


#### Load amazon test data to test against

In [36]:
test_sentimentCNN(model, test_loader, criterion)

11


('Test Accuracy: 0.8839', 'Precision: 0.8841', 'Recall: 0.8839', 'F1: 0.8838')

### Inference 
Looking at the model's performance against any input text.

In [44]:
def predict(model, new_texts, vocab_to_int, batch_size, seq_length=40):
    """
    Score the sentiment of one piece of text with a trained model.

    The text is preprocessed, encoded to integer ids, left-padded with zeros (or truncated)
    to seq_length and placed in row 0 of an otherwise all-zero batch of shape
    (batch_size, seq_length) before the forward pass.

    Args:
     - model: model to perform the inference (it is switched to eval mode)
     - new_texts: input text
     - vocab_to_int: word to integer mapping dict
     - batch_size: number of rows in the batch handed to the model
     - seq_length: sequence length the text is padded / truncated to
    :Returns a tuple (label, score): label is "positive", "negative" or "unsure/neutral",
     and score is 2*pred - 1, where pred is the model output for row 0.
    """

    model.eval()

    # preprocess the text and encode it to integer ids
    # NOTE(review): token_lem=True is passed although the notebook builds its vocabulary
    # without lemmatization — confirm against word_to_int.
    new_texts = preprocess(new_texts)
    new_texts_ints = word_to_int(new_texts, vocab_to_int, token_lem=True)

    # keep at most seq_length ids and right-align them, leaving zeros as left padding
    token_ids = np.array(new_texts_ints)[:seq_length]
    features = np.zeros((seq_length), dtype=int)
    features[seq_length - len(token_ids):] = token_ids

    # the text occupies row 0; the remaining rows of the batch stay all-zero
    model_input = np.zeros((batch_size, seq_length), dtype=int)
    model_input[0, :] = features
    input_tensor = torch.from_numpy(model_input)

    # inference only: skip autograd bookkeeping for the forward pass
    with torch.no_grad():
        output = model(input_tensor)

    pred = output.detach().cpu().numpy()[0][0]

    # outputs strictly between 0.45 and 0.55 are reported as undecided
    if pred >= 0.55:
        return ("positive", 2*pred - 1)
    elif pred <= 0.45:
        return ("negative", 2*pred - 1)
    else:
        return ("unsure/neutral", 2*pred - 1)

##### Apply the prediction function to a list of example texts.

In [50]:
# Hand-written example messages used to spot-check the model's predictions.
test_list = [
    "poor quality signal given by the device package did not arrive I am not happy with this",
    'I actually liked that part of the feature It was surprising in a good way and I will surely go back again',
    "I want to speak to a person",
    "Bad service",
    'Broken appliance',
    'Hey, I got a broken item',
    'Hi, an item is missing from my order',
    'Item came broken',
    'My item arrived damaged',
    'My product arrived broken',
    "my delivery is late I want to cancel my order",
    'Poor service!',
    'The delivery was terrible',
    'can I make a complaint to an agent?',
    "I am very happy with the product. It's great!",
    'damaged item',
    'no forget it',
    'I am happy',
    'Cancel my order',
]

In [51]:
import json

with open('Data/amazon_review_polarity/Amazon_polarity_subset2m_vocab_to_int.json', 'r') as vi:
    vocab_to_int = json.load(vi)

In [52]:
# Score each example text with the vocabulary reloaded from the saved JSON file, so this
# cell also works in a fresh kernel where the in-memory vocab_int was never built.
for t in test_list:
    print(t, "\n", "Sentiment: ", predict(model, t, vocab_to_int, batch_size=4096, seq_length=seq_length), "\n")

poor quality signal given by the device package did not arrive I am not happy with this 
 Sentiment:  ('negative', -0.9950658977031708) 

I actually liked that part of the feature It was surprising in a good way and I will surely go back again 
 Sentiment:  ('positive', 0.6080207824707031) 

I want to speak to a person 
 Sentiment:  ('unsure/neutral', 0.02608168125152588) 

Bad service 
 Sentiment:  ('negative', -0.21603453159332275) 

Broken appliance 
 Sentiment:  ('negative', -0.16190427541732788) 

Hey, I got a broken item 
 Sentiment:  ('negative', -0.3867241144180298) 

Hi, an item is missing from my order 
 Sentiment:  ('positive', 0.2554647922515869) 

Item came broken 
 Sentiment:  ('negative', -0.27792930603027344) 

My item arrived damaged 
 Sentiment:  ('positive', 0.18851399421691895) 

My product arrived broken 
 Sentiment:  ('negative', -0.17975521087646484) 

my delivery is late I want to cancel my order 
 Sentiment:  ('negative', -0.3208693861961365) 

Poor service! 
 