In [110]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import math
import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import common_texts, get_tmpfile

# Movie Review Data Cleaning

In [17]:
def read_raw(file_name, label):
  with open(file_name, 'r') as file:
    data = []
    for line in file:
        processed_line = preprocess_raw(line)
        data.append([processed_line.split(), label])
  return data

def preprocess_raw(text):
  text = text.replace('\u202f', ' ').replace('\xa0', ' ')
  out = ''
  for i, char in enumerate(text.lower()):
    if char.isalpha()==True or char==' ':
      out += char
  out=" ".join(out.split())
  out += ' .'
  return out

def build_data(direc):
  data=[]
  counter=0
  for label in ['pos', 'neg']:
    label_direc = os.path.join(direc, label)
    for filename in os.listdir(label_direc):
      if (label == 'pos'):
        data += read_raw(os.path.join(label_direc,str(filename)), 1)
      else:
        data += read_raw(os.path.join(label_direc,str(filename)), 0)
      counter += 1
      print(counter)
  print('Done')
  return data

In [3]:
data = build_data('./drive/My Drive/txt_sentoken/')

FileNotFoundError: [WinError 3] The system cannot find the path specified: './drive/My Drive/txt_sentoken/pos'

# Data to File

In [0]:
data = np.array(data)
file = open('data', 'wb')
pickle.dump(data, file)
file.close()

# Movie Review Dataset

In [8]:
class MovieReviewDataset(Dataset):
    def __init__(self, dataset, transform=None):
        self.data = dataset
        self.transform = transform
    
    def __getitem__(self, idx):
        temp = self.data[idx]
        if (self.transform):
            temp = self.transform(sample)
        
        sample = {'sentence': temp[0] ,'label': temp[1]}
        return sample
    
    def __len__(self):
        return len(self.data)            

# Load data file

In [97]:
file = open('data', 'rb')
data = pickle.load(file)
file.close()

dataset = []
for i in data:
    dataset.append([i[0], i[1]])
    
mr = MovieReviewDataset(dataset)
print(len(mr))


mr_dataloader = DataLoader(mr, batch_size = 16, shuffle=True)

64720


In [107]:
test_batch = next(iter(mr_dataloader))
print(test_batch)

{'sentence': [('of', 'there', 'although', 'ricky', 'this', 'hartman', 'nope', 'a', 'after', 'as', 'just', 'tour', 'max', 'her', 'she', 'but'), ('course', 'are', 'godzilla', 'is', 'may', '.', '.', 'strong', 'all', 'far', 'in', 'with', 'fischer', 'husband', 'is', 'theyre')], 'label': tensor([1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1])}


In [167]:
class CRAN(nn.Module):
  def __init__(self,embedding_size,cnn_window_length,hidden_units,p_dropout):
    super(CRAN,self).__init__()

    """
    inputs:
      cnn_num_filters: the number of convolutional kernels, represents the number of output channels parameter in Conv2d
      cnn_window_length: in a kernel size of d*l, d represents the embedding size and l represents the window length
      LSTM_hidden_units: number of hidden units in LSTM layer
      dropout: dropout probability for CNN
      embedding_size: length of embedded word vectors 
    """
    self.cnn=torch.nn.Conv2d(1,hidden_units,(cnn_window_length,embedding_size), padding=(1,0))
    self.dropout=torch.nn.Dropout(p_dropout)
    self.rnn=torch.nn.LSTM(embedding_size,hidden_units,1, batch_first=True)
    self.hidden_size = hidden_units
    self.dense = nn.Linear(hidden_units, 2)
  def forward(self,batch,labels):
    (N,T,d)=batch.shape
    #apply convolutional filters to the input sentences
    cnn_output=self.cnn(batch.unsqueeze(1))
    #cnn_output will be of shape (N,cnn_num_filters,H_out from Pytorch documentation,1)
    shape=cnn_output.shape
    cnn_output= F.relu(cnn_output.view(N,shape[1],shape[2]))
    #average across the different filter outputs
    cnn_output=torch.mean(cnn_output,1)

    h = batch.new_zeros(1, N, self.hidden_size)
    c = batch.new_zeros(1, N, self.hidden_size)

    lstm_out, (h,c) = self.rnn(batch, (h, c))

    s = torch.mean(lstm_out * cnn_output.unsqueeze(2), 1)

    pred = F.log_softmax(self.dense(s))
    
    if (self.training):
      loss = F.nll_loss(pred, labels)

      return loss, pred.argmax(dim=-1)
    else:
      return pred
      

  



In [150]:

class GoogleEmbedding(nn.Module):
  def __init__(self, model_path):
    super(GoogleEmbedding, self).__init__()

    # Load Google's pre-trained Word2Vec model.
    self.model = gensim.models.Word2Vec(size=300, min_count=1)

  def forward(self, batch):
    if (self.training):

    out = []
    for sent in batch:
      print(sent)
      sent_embed = []
      for word in sent:
        sent_embed.append(self.model[word])

      out.append(sent_embed)
    return out

IndentationError: expected an indented block (<ipython-input-150-b60318d07542>, line 11)

In [171]:
a=torch.randn(2,4,150)
labels = torch.tensor([1,0])
model.eval()
model = CRAN(150, 3, 100, .5)
#sents, labels = test_batch['sentence'], test_batch['label']
loss,pred = model(a, labels)
print(loss)

tensor(0.6866, grad_fn=<NllLossBackward>)


In [108]:
embed_model = Word2Vec(size=300, min_count=1)
embed_model.build_vocab(data[:,0])
total_examples = embed_model.corpus_count
pretrained = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
embed_model.build_vocab([list(pretrained.vocab.keys())], update=True)
embed_model.intersect_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0)
embed_model.train(data[:,0], total_examples=total_examples, epochs=embed_model.iter)



(5966673, 6767065)

In [113]:
embed_model.save('embedding_model')

In [115]:
a = Word2Vec.load('embedding_model')
print(a['hi'])

KeyboardInterrupt: 

In [116]:
b = KeyedVectors.load('embedding_model.wv.vectors.npy', mmap='r')
b['hi']

UnpicklingError: STACK_GLOBAL requires str