# Deep Continuous Bag of Words

![alt text](src/arch.jpg)

source: https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf

In [1]:
# Import necessary modules

import numpy as np
import pandas as pd
import re

from tqdm import tqdm
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data_utils

In [2]:
# Create word2vec dictionary

def load_word_vectors(filename):
    word_to_index = {}
    word_vectors  = []
    
    with open(filename, encoding="utf8") as fp:
        for line in tqdm(fp.readlines(), leave = False):
            line = line.split(" ")
            
            word = line[0]
            word_to_index[word] = len(word_to_index)
            
            vec = np.array([float(x) for x in line[1:]])
            word_vectors.append(vec)
            
    return word_to_index, word_vectors

word_to_index, word_vectors = load_word_vectors("model/glove.6B.100d.txt")

                                                                                                                       

In [3]:
# Import csv file of amazon reviews

df = pd.read_csv("data/amazon_reviews_small.csv", names = ['sentiment', 'title', 'content'])
df.head()

Unnamed: 0,sentiment,title,content
0,2,Right on the money,We are using the this book to get 100+ certifi...
1,2,Serves its Purpose!,Couldn't go without it. My 3 1/2 year still we...
2,2,Trailer Park Bwoys!!!,we get to see it on paramount in ol' LND UK an...
3,1,buyer beware,There are companies selling Bosch knock-offs o...
4,2,Great for those cold winters,If you are looking to keep your water liquifie...


In [4]:
def get_vectors(phrase):
    phrase = phrase.split(' ')
    vector = np.zeros((1,100))
    word_count = 0
    for word in phrase:
        try:
            vector += np.asarray(word_vectors[word_to_index[word]])
            word_count += 1
        except:
            pass
    
    if word_count > 0:
        ans = vector/word_count
    else:
        ans = np.zeros((1,100))
    
    return ans
    
def clean_phrase(phrase):
    phrase = phrase.lower()
    phrase = phrase.replace("'", "")
    phrase = phrase.replace(".", "")
    phrase = re.sub(r'([^\s\w]|_)+', '', phrase)
    return phrase

def generate_phrases(dataset):
    phrases = dataset[['title', 'content']].dropna().values.flatten().tolist()
    return phrases
    
def generate_tfidf(phrases):
    tfidf = TfidfVectorizer()
    tfidf.fit(phrases)
    return tfidf

def phrase_cbow(a, b):
    aa = a.reshape(1,-1)
    ba = b.reshape(1,-1)
    
    cos_lib = cosine_similarity(aa, ba)

    return cos_lib

In [5]:
df['sentiment']     = [(a-1) for a in df.sentiment.values]
df['title_clean']   = [clean_phrase(str(a)) for a in df.title]
df['content_clean'] = [clean_phrase(str(a)) for a in df.content]

In [6]:
tfidf = generate_tfidf(generate_phrases(df))

In [7]:
print("Generating clean title")
df['title_rep']     = [tfidf.transform([a]) for a in df.title_clean]
print("Generating clean content")
df['content_rep']   = [tfidf.transform([a]) for a in df.content_clean]

Generating clean title
Generating clean content


In [8]:
print("Generating clean title vector")
df['title_vec']     = [get_vectors(a) for a in df.title_clean]
print("Generating clean content vector")
df['content_vec']   = [get_vectors(a) for a in df.content_clean]

Generating clean title vector
Generating clean content vector


In [9]:
df['cosine']  = [phrase_cbow(get_vectors(a), get_vectors(b)) for a,b in zip(df.title_clean, df.content_clean)]

In [10]:
df['feature'] = [np.hstack((a,b,c)).reshape(1,-1) for a,b,c in df[['title_vec', 'content_vec', 'cosine']].values]

In [11]:
df_set = df.drop(columns = ['title_clean', 'content_clean', 'title_rep', 'content_rep', 'title_vec', 'content_vec', 'cosine'])
df_set.head()

Unnamed: 0,sentiment,title,content,feature
0,1,Right on the money,We are using the this book to get 100+ certifi...,"[[0.11201899999999998, 0.03991249999999999, 0...."
1,1,Serves its Purpose!,Couldn't go without it. My 3 1/2 year still we...,"[[-0.08785333333333334, -0.022782999999999998,..."
2,1,Trailer Park Bwoys!!!,we get to see it on paramount in ol' LND UK an...,"[[0.23044, -0.04867500000000001, 0.431965, -0...."
3,0,buyer beware,There are companies selling Bosch knock-offs o...,"[[0.10213000000000001, 0.18035, 0.790140000000..."
4,1,Great for those cold winters,If you are looking to keep your water liquifie...,"[[-0.3699188, 0.3999688, 0.37708379999999997, ..."


In [12]:
pos_sent = df.loc[df['sentiment'] == 1]
neg_sent = df.loc[df['sentiment'] != 1]

pos_sent['sentiment'] = [[0, 1] for a in pos_sent['sentiment'].values]
neg_sent['sentiment'] = [[1, 0] for a in neg_sent['sentiment'].values]

pos_temp = pos_sent.sample(frac = 1)
neg_temp = neg_sent.sample(frac = 1)

pos_points = len(pos_temp)
neg_points = len(neg_temp)

train_pos = pos_temp.iloc[:int(pos_points*.7)]
train_neg = neg_temp.iloc[:int(neg_points*.7)]
train     = pd.concat([train_pos, train_neg])

valid_pos = pos_temp.iloc[int(pos_points*.7):int(pos_points*.85)]
valid_neg = neg_temp.iloc[int(neg_points*.7):int(neg_points*.85)]
valid     = pd.concat([valid_pos, valid_neg])

test_pos  = pos_temp.iloc[int(pos_points*.85):]
test_neg  = neg_temp.iloc[int(neg_points*.85):]
test      = pd.concat([test_pos, test_neg])

#pd.concat([a, b])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [13]:
np.concatenate(np.asarray(train.feature.values))

array([[ 0.1086565 , -0.2171745 ,  0.4474775 , ...,  0.55243221,
         0.3964709 ,  0.60546891],
       [-0.0880905 ,  0.3306007 ,  0.5111311 , ...,  0.50290921,
         0.38442631,  0.95579906],
       [-0.046564  ,  0.635425  , -0.0936    , ...,  0.53612823,
         0.21545359,  0.75013627],
       ...,
       [-0.218092  ,  0.1300008 , -0.002624  , ...,  0.48524207,
         0.23693102,  0.818071  ],
       [-0.11760667,  0.41518   ,  0.69020667, ...,  0.45763913,
         0.17676815,  0.73917921],
       [ 0.01119   ,  0.02407917,  0.28859146, ...,  0.44743207,
         0.2434048 ,  0.86634793]])

In [14]:
train_tensor = data_utils.TensorDataset(torch.tensor(np.concatenate(np.asarray(train.feature.values), axis = 0)).float(), 
                                        torch.tensor(np.vstack(train.sentiment.values)))
valid_tensor = data_utils.TensorDataset(torch.tensor(np.concatenate(np.asarray(valid.feature.values), axis = 0)).float(), 
                                        torch.tensor(np.vstack(valid.sentiment.values)))
test_tensor  = data_utils.TensorDataset(torch.tensor(np.concatenate(np.asarray(test.feature.values), axis = 0)).float(), 
                                        torch.tensor(np.vstack(test.sentiment.values)))

In [15]:
batch_size = 32

train_loader = data_utils.DataLoader(train_tensor, batch_size = batch_size, shuffle = True)
valid_loader = data_utils.DataLoader(valid_tensor, batch_size = batch_size, shuffle = True)
test_loader = data_utils.DataLoader(test_tensor, batch_size = batch_size, shuffle = True)

In [16]:
train_on_gpu = torch.cuda.is_available()

model = nn.Sequential(nn.Linear(201, 100),
                      nn.ReLU(),
                      nn.Linear(100, 50),
                      nn.ReLU(),
                      nn.Linear(50, 2),
                      nn.Softmax(dim=1))

if train_on_gpu:
    model.cuda()

criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.03)

In [17]:
epochs = 10
valid_min = np.Inf
for e in range(epochs):
    running_loss_train = 0
    model.train
    for features, labels in train_loader:
        if train_on_gpu:
            features, labels = features.cuda(), labels.cuda()
        optimizer.zero_grad()
        output = model(features)
        loss = criterion(output, labels.float())
        loss.backward()
        optimizer.step()
        
        running_loss_train += loss.item()

        
    running_loss_valid = 0
    model.eval
    for features, labels in train_loader:
        if train_on_gpu:
            features, labels = features.cuda(), labels.cuda()
        output = model(features)
        loss = criterion(output, labels.float())
        
        running_loss_valid += loss.item()

        
    print(f"Epoch {e}    Training loss: {running_loss_train/len(train_loader):.6f}\tValidation loss: {running_loss_valid/len(train_loader):.6f}")
    
    if running_loss_valid < valid_min:
        print("Validation loss decreased. Saving model...")
        torch.save(model.state_dict(), 'deepcbow.pt')
        valid_min = running_loss_valid

Epoch 0    Training loss: 0.497394	Validation loss: 0.415503
Validation loss decreased. Saving model...
Epoch 1    Training loss: 0.421199	Validation loss: 0.436148
Epoch 2    Training loss: 0.402765	Validation loss: 0.420230
Epoch 3    Training loss: 0.391127	Validation loss: 0.380647
Validation loss decreased. Saving model...
Epoch 4    Training loss: 0.379337	Validation loss: 0.366653
Validation loss decreased. Saving model...
Epoch 5    Training loss: 0.372418	Validation loss: 0.356158
Validation loss decreased. Saving model...
Epoch 6    Training loss: 0.365166	Validation loss: 0.356578
Epoch 7    Training loss: 0.359796	Validation loss: 0.346152
Validation loss decreased. Saving model...
Epoch 8    Training loss: 0.355292	Validation loss: 0.344206
Validation loss decreased. Saving model...
Epoch 9    Training loss: 0.350383	Validation loss: 0.352823


In [18]:
test_loss = 0.0
model.eval

model.load_state_dict(torch.load('deepcbow.pt'))

correct = 0
total = 0

model.cuda()

for features, labels in test_loader:
    if train_on_gpu:
        features, labels = features.cuda(), labels.cuda()
    output = model(features)
    _, pred = torch.max(output, 1)
    _, true = torch.max(labels, 1)
    loss = criterion(output, labels.float())
    test_loss += loss.item()
    for a, b in zip(pred, true):
        if a.item() == b.item():
            correct += 1
        total += 1

print(f"Model test loss is {test_loss/len(test_loader):.6f}.")
print(f"Model got {correct} out of {total} correct.")
print(f"Model test accuracy: {correct*100/total:.2f}%")

Model test loss is 0.359139.
Model got 12621 out of 15000 correct.
Model test accuracy: 84.14%
