In [0]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import math

# Movie Review Data Cleaning

In [0]:
def read_raw(file_name, label):
  """Read a text file and return one ``[tokens, label]`` entry per line.

  Each line is normalised with ``preprocess_raw`` and then split on whitespace
  into a list of tokens; ``label`` is attached unchanged to every entry.
  """
  with open(file_name, 'r') as handle:
    return [[preprocess_raw(raw_line).split(), label] for raw_line in handle]

def preprocess_raw(text):
  """Normalise one raw line into lower-case, space-separated words ending in ' .'.

  Narrow no-break (U+202F) and no-break (U+00A0) spaces become ordinary spaces,
  the text is lower-cased, every character that is neither alphabetic nor a plain
  space is removed, runs of spaces collapse to one, and ' .' is appended.
  """
  normalised = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
  kept = ''.join(ch for ch in normalised if ch == ' ' or ch.isalpha())
  return ' '.join(kept.split()) + ' .'

def build_data(direc):
  """Collect labelled samples from the 'pos' and 'neg' sub-folders of ``direc``.

  Every file in 'pos' is read with label 1 and every file in 'neg' with label 0
  (via ``read_raw``). A running count of processed files is printed after each
  file, followed by 'Done'. Returns the concatenated list of samples.
  """
  samples = []
  files_seen = 0
  for sentiment, target in (('pos', 1), ('neg', 0)):
    folder = os.path.join(direc, sentiment)
    for entry in os.listdir(folder):
      samples.extend(read_raw(os.path.join(folder, str(entry)), target))
      files_seen += 1
      print(files_seen)
  print('Done')
  return samples

In [0]:
# Parse every file under the 'pos' and 'neg' sub-folders of this directory into
# [tokens, label] samples. The path is relative to the current working directory.
data = build_data('./drive/My Drive/txt_sentoken/')

# Data to File

In [0]:
# Each sample is [tokens, label] and the token lists differ in length, so the
# input is ragged. Without dtype=object NumPy >= 1.24 raises ValueError (earlier
# versions only warned); with it, each sample stays one row holding the token
# list and the label as two object cells.
data = np.array(data, dtype=object)
# The context manager closes the file even if pickle.dump raises.
with open('data', 'wb') as file:
    pickle.dump(data, file)

# Movie Review Dataset

In [0]:
class MovieReviewDataset(Dataset):
    """Map-style dataset over sentences and their labels.

    Args:
        dataset: indexable collection whose items are ``[sentence, label]`` pairs.
        transform: optional callable applied to the fetched pair. It must return
            an object that can still be indexed as ``[sentence, label]``.
    """

    def __init__(self, dataset, transform=None):
        self.data = dataset
        self.transform = transform

    def __getitem__(self, idx):
        """Return ``{'sentence': ..., 'label': ...}`` for the item at ``idx``."""
        pair = self.data[idx]
        if self.transform:
            # Fix: the transform must receive the fetched pair. The original passed
            # `sample`, which is only assigned after this branch, so any transform
            # raised UnboundLocalError.
            pair = self.transform(pair)

        return {'sentence': pair[0], 'label': pair[1]}

    def __len__(self):
        """Number of items in the wrapped collection."""
        return len(self.data)

# Load data file

In [12]:
# NOTE: pickle.load can execute arbitrary code. Only load a 'data' file that was
# produced by this notebook's own dump cell.
with open('data', 'rb') as file:
    data = pickle.load(file)

# Fixed seed so the train/test partition is identical after a kernel restart.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(data[:,0], data[:,1], random_state=SPLIT_SEED)
# Re-pair every training sentence with its label as [tokens, label] samples.
dataset = [[sentence, label] for sentence, label in zip(X_train, y_train)]

mr = MovieReviewDataset(dataset)
print(len(mr))


mr_dataloader = DataLoader(mr, batch_size = 16, shuffle=True)

48540


In [13]:
# Sanity check: pull one batch from the loader and print how many labels it holds
# (expected to equal the loader's batch size for a full batch).
print(len(next(iter(mr_dataloader))['label']))

16


In [0]:
class CRAN(nn.Module):
  """Work-in-progress model pairing a word-window CNN with an LSTM.

  Args:
    embedding_size: length of the embedded word vectors (d).
    cnn_num_filters: number of convolutional kernels, i.e. the Conv2d output
      channels.
    cnn_window_length: window length l; each kernel spans l words by d
      embedding dimensions.
    LSTM_hidden_units: number of hidden units in the single-layer LSTM.
    dropout_p: dropout probability (intended for the CNN per the original
      notes; the dropout layer is not applied in ``forward`` yet).
  """

  def __init__(self,embedding_size,cnn_num_filters,cnn_window_length,LSTM_hidden_units,dropout_p):
    super().__init__()
    # The kernel covers the whole embedding width, so the output width is 1
    # whatever the width stride is. The stride must still be positive: the
    # original (1,0) is rejected by PyTorch.
    self.cnn=torch.nn.Conv2d(1,cnn_num_filters,(cnn_window_length,embedding_size),stride=1)
    # Fix: the constructor argument is `dropout_p`; `p_dropout` was undefined.
    self.dropout=torch.nn.Dropout(dropout_p)
    self.LSTM=torch.nn.LSTM(embedding_size,LSTM_hidden_units,1)

  def forward(self,batch,labels):
    """Run the (currently partial) forward pass.

    Args:
      batch: tensor of shape (N, T, d) — N sentences, T words, d = embedding_size.
      labels: currently unused.

    Returns:
      None. The pipeline is unfinished; see the TODO at the end of the method.
    """
    (N,T,d)=batch.shape
    # Conv2d expects (N, C_in, H, W), so add a singleton channel axis before
    # applying the convolutional filters to the input sentences.
    cnn_output=self.cnn(batch.unsqueeze(1))
    # cnn_output is (N, cnn_num_filters, T - cnn_window_length + 1, 1);
    # drop the trailing width axis of size 1.
    cnn_output=cnn_output.squeeze(-1)
    # Average across the different filter outputs -> (N, T - cnn_window_length + 1).
    cnn_output=torch.mean(cnn_output,1)
    # nn.LSTM defaults to sequence-first input: (T, N, d).
    batch_for_LSTM=batch.permute(1,0,2)
    # TODO: the LSTM, dropout and the combination with cnn_output are not
    # implemented yet, so nothing is returned.
    




In [18]:
# Scratch check: averaging over dim 1 collapses the middle axis, (2, 4, 4) -> (2, 4).
a = torch.randn(2, 4, 4)
print(a)
print(a.mean(dim=1))

tensor([[[-1.2665,  0.3923, -1.1571,  1.8619],
         [-0.1448,  0.0378, -0.5983,  1.1201],
         [ 0.2034,  0.9530,  1.3362, -0.9622],
         [-1.5275, -0.8303, -0.4121,  0.0396]],

        [[-0.6037,  0.1255,  1.1212, -0.4120],
         [ 0.5883,  0.6299,  2.3593, -1.4743],
         [-1.0952,  1.0461, -2.1049, -1.2259],
         [ 1.6651,  1.6533, -0.2928,  3.5429]]])
tensor([[-0.6838,  0.1382, -0.2078,  0.5148],
        [ 0.1386,  0.8637,  0.2707,  0.1077]])
