In [None]:
# Mount Google Drive at /content/drive so the following cells can work with
# files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/NLP/

In [None]:
import pandas as pd

# Path to the raw Yelp review JSON file; no visible cell reads it directly.
INPUT_PATH = './yelp_academic_dataset_review.json'
# CSV file that is loaded with pd.read_csv to build the working DataFrame.
DATASET_CSV_PATH = 'yelp_reviews_gen.csv'
# Alias of INPUT_PATH.
PATH_TO_YELP_REVIEWS = INPUT_PATH

In [None]:
# Downlaoding the CSV files -- sharing the CSV between the CNN and LSTM notebooks
! gdown --id 1tO13aN1nnJqupL-JkydxBg7H5ET4y0HM

In [None]:
# Load the review CSV into the working DataFrame and list its columns.
top_data_df = pd.read_csv(DATASET_CSV_PATH)
print("Columns in the original dataset:\n", top_data_df.columns, sep="\n")

In [None]:
from os import statvfs_result
import matplotlib.pyplot as plt 

# Show how many reviews exist for each star rating.
print("Number of rows per star rating:", top_data_df['stars'].value_counts(), sep="\n")

# Function to assign sentiment
def get_sentiment(stars, bi=True, tri=False):
    """Convert a star rating into a sentiment label.

    Args:
        stars: The star rating of a review.
        bi: When truthy, return a binary label: 0 for ratings <= 2, else 1.
            This flag takes precedence over ``tri``.
        tri: When truthy (and ``bi`` is falsy), return a three-way label:
            -1 for ratings <= 2, 0 for a rating of exactly 3, else 1.

    Returns:
        The sentiment label; when both flags are falsy, ``int(stars)``.
    """
    # Neither scheme requested: hand back the rating itself as an int.
    if not (bi or tri):
        return int(stars)

    is_negative = stars <= 2
    if bi:
        return 0 if is_negative else 1

    # Three-way scheme.
    if is_negative:
        return -1
    return 0 if stars == 3 else 1

# Map every star rating to a binary sentiment label
# (0 = negative for ratings <= 2, 1 = positive otherwise).
top_data_df['sentiment'] = [get_sentiment(x, bi=True) for x in top_data_df['stars']]

# Plot the sentiment distribution.
# Series.value_counts() is used because the top-level pd.value_counts() is deprecated.
plt.figure()
top_data_df['sentiment'].value_counts().plot.bar(title="Sentiment distribution in df")
plt.xlabel("Sentiment")
plt.ylabel("No. of rows in df")
plt.show()

In [None]:
# Function to retrieve top few number of each category
def get_top_data(top_n = 5000):
    """Return the first ``top_n`` rows of each sentiment class from ``top_data_df``.

    Positive rows (sentiment == 1) come first, followed by negative rows
    (sentiment == 0); the original row index is kept.
    """
    per_class_frames = [
        top_data_df[top_data_df['sentiment'] == label].head(top_n)
        for label in (1, 0)
    ]
    return pd.concat(per_class_frames)

# Keep at most 10000 reviews per sentiment class
reduced_dataset = get_top_data(top_n=10000)

# Report the class balance of the reduced dataset, then preview its first rows
print(
    "After segregating and taking equal number of rows for each sentiment:",
    reduced_dataset['sentiment'].value_counts(),
    sep="\n",
)
reduced_dataset.head(10)

In [None]:
# Sample - removing the stop words from one example sentence.
# This is a demonstration only: the result is printed and not stored.
from gensim.parsing.preprocessing import remove_stopwords
print(remove_stopwords("Restaurant had a really good service!!"))

In [None]:
# Build the 'tokenized_text' column: one token list per review in 'text'.
# deacc=True asks gensim to strip accent marks from the tokens.
from gensim.utils import simple_preprocess
reduced_dataset['tokenized_text'] = list(
    map(lambda review: simple_preprocess(review, deacc=True), reduced_dataset['text'])
)
print(reduced_dataset['tokenized_text'].head(10))

In [None]:
from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# Stem each token of each review and store the result in 'stemmed_tokens'
reduced_dataset['stemmed_tokens'] = [
    list(map(porter_stemmer.stem, tokens))
    for tokens in reduced_dataset['tokenized_text']
]
reduced_dataset['stemmed_tokens'].head(10)

In [None]:
from sklearn.model_selection import train_test_split
# Train Test Split Function
def split_train_test(top_data_df_small, test_size=0.3, shuffle_state=True):
    """Split a review DataFrame into train and test features/labels.

    Args:
        top_data_df_small: DataFrame to split. It must contain the feature
            columns listed below plus a 'sentiment' column.
        test_size: Fraction (or count) of rows placed in the test split,
            forwarded to ``train_test_split``.
        shuffle_state: Whether rows are shuffled before splitting.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)`` of DataFrames. Each has
        had ``reset_index()`` applied, so the original row label is kept in an
        'index' column.
    """
    feature_columns = ['business_id', 'cool', 'date', 'funny', 'review_id',
                       'stars', 'text', 'useful', 'user_id', 'stemmed_tokens']
    # Split the frame that was passed in, not a notebook-level global, so the
    # function works on whatever dataset the caller supplies.
    X_train, X_test, Y_train, Y_test = train_test_split(top_data_df_small[feature_columns],
                                                        top_data_df_small['sentiment'],
                                                        shuffle=shuffle_state,
                                                        test_size=test_size,
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    # Labels come back as Series; convert to frames so all four results share
    # the same layout (an 'index' column followed by the data).
    Y_train = Y_train.to_frame().reset_index()
    Y_test = Y_test.to_frame().reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

# Run the train/test split with the function defaults
# (test_size=0.3, shuffling enabled).
X_train, X_test, Y_train, Y_test = split_train_test(reduced_dataset)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running: {}".format(device))

In [None]:
from gensim.models import Word2Vec
# Word2Vec settings, passed to make_word2vec_model() when the model is trained.
# Embedding dimensionality (given to Word2Vec as `vector_size`).
size = 500
# Forwarded as Word2Vec's `window` argument.
window = 3
# Forwarded as Word2Vec's `min_count` argument.
min_count = 1
# Forwarded as Word2Vec's `workers` argument.
workers = 3
# Forwarded as Word2Vec's `sg` argument (1 selects skip-gram per the gensim docs).
sg = 1
# Root folder used when building model and checkpoint file paths.
OUTPUT_FOLDER = 'OpData'
# Function to train word2vec model
def make_word2vec_model(reduced_dataset, padding=True, sg=1, min_count=1, size=500, workers=3, window=3):
    """Train a Word2Vec model on the 'stemmed_tokens' column of a DataFrame.

    Args:
        reduced_dataset: DataFrame with a 'stemmed_tokens' column of token lists.
        padding: When truthy, an extra one-token sentence ``['pad']`` is added
            to the corpus so the vocabulary contains a 'pad' entry.
        sg, min_count, workers, window: Forwarded to ``Word2Vec`` unchanged.
        size: Embedding dimensionality, forwarded as ``vector_size``.

    Returns:
        Tuple ``(model, path)``. ``path`` is the file name built for the model;
        this function does not write the model to disk.
    """
    models_dir = OUTPUT_FOLDER + '/models/'
    if not padding:
        sentences = reduced_dataset['stemmed_tokens']
        word2vec_file = models_dir + 'word2vec_' + str(size) + '.model'
    else:
        print(len(reduced_dataset))
        sentences = list(pd.Series(reduced_dataset['stemmed_tokens']).values)
        sentences.append(['pad'])
        word2vec_file = models_dir + 'word2vec_' + str(size) + '_PAD.model'

    hyper_params = dict(min_count=min_count, vector_size=size, workers=workers,
                        window=window, sg=sg)
    w2v_model = Word2Vec(sentences, **hyper_params)
    return w2v_model, word2vec_file

# Train Word2vec model
w2vmodel, word2vec_file = make_word2vec_model(reduced_dataset, padding=True, sg=sg, min_count=min_count, size=size, workers=workers, window=window)

In [None]:
# Token count of the longest stemmed review; encoded sentences are padded to this length.
max_sen_len = reduced_dataset.stemmed_tokens.map(len).max()
# Vocabulary index of the 'pad' token, used to fill the unused positions.
padding_idx = w2vmodel.wv.key_to_index['pad']
def make_word2vec_vector_cnn(sentence):
    """Encode a tokenized sentence as a fixed-length tensor of vocabulary indices.

    Args:
        sentence: Iterable of tokens.

    Returns:
        A ``torch.long`` tensor of shape ``(1, max_sen_len)`` on ``device``.
        Tokens missing from the word2vec vocabulary are encoded as index 0,
        positions after the last token hold ``padding_idx``, and tokens beyond
        ``max_sen_len`` are dropped.
    """
    vocab = w2vmodel.wv.key_to_index
    width = int(max_sen_len)
    # Truncate first: a sentence longer than the padded width would otherwise
    # index past the end of the buffer and raise IndexError.
    indices = [vocab.get(word, 0) for word in list(sentence)[:width]]
    indices.extend([padding_idx] * (width - len(indices)))
    return torch.tensor(indices, dtype=torch.long, device=device).view(1, -1)

In [None]:
# Function to get the output tensor
def make_target(label):
    """Convert a sentiment label into a one-element class-index tensor.

    A label of -1 becomes class 0, a label of 0 becomes class 1, and any
    other label becomes class 2. The tensor has dtype ``torch.long`` and is
    created on ``device``.
    """
    class_index = 0 if label == -1 else (1 if label == 0 else 2)
    return torch.tensor([class_index], dtype=torch.long, device=device)

In [None]:
import gensim
# NOTE(review): neither constant is referenced by any visible cell; the model
# below receives its dimensions through constructor arguments instead.
EMBEDDING_SIZE = 500
NUM_FILTERS = 10

class LSTMSentimentClassifier(nn.Module):
    """LSTM sentiment classifier built on pretrained word2vec embeddings.

    Pipeline: embedding lookup -> multi-layer LSTM -> dropout -> linear ->
    sigmoid. The embedding is created with ``nn.Embedding.from_pretrained``,
    which keeps the vectors frozen by default.
    """

    def __init__(self, no_layers, output_dim, hidden_dim, embedding_dim, drop_prob=0.3,
                 embedding_weights=None, padding_idx=None):
        """Build the network.

        Args:
            no_layers: Number of stacked LSTM layers.
            output_dim: Size of the linear layer's output.
            hidden_dim: LSTM hidden-state size.
            embedding_dim: Width of the embedding vectors (LSTM input size).
            drop_prob: Dropout probability applied to the LSTM output. The
                default is 0.3, the rate this model has always trained with
                (the argument used to be ignored).
            embedding_weights: Optional 2-D array/tensor of pretrained
                vectors. When omitted, the vectors are loaded from the saved
                word2vec model under ``OUTPUT_FOLDER``.
            padding_idx: Optional row index of the padding token. When
                omitted together with ``embedding_weights``, the index of the
                'pad' token in the loaded vocabulary is used.
        """
        # Required before any submodule is assigned; without it nn.Module
        # refuses the first layer assignment.
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers

        if embedding_weights is None:
            # Same '<folder>/models/<file>' layout that make_word2vec_model builds.
            word2vec_path = OUTPUT_FOLDER + '/models/' + 'word2vec_500_PAD.model'
            keyed_vectors = gensim.models.KeyedVectors.load(word2vec_path).wv
            embedding_weights = keyed_vectors.vectors
            if padding_idx is None:
                padding_idx = keyed_vectors.key_to_index['pad']
        embedding_weights = torch.as_tensor(embedding_weights, dtype=torch.float)

        # With pretrained embeddings
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, padding_idx=padding_idx)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim,
                            num_layers=no_layers, batch_first=True)
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)

        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """Run one batch through the network.

        Args:
            x: Batch of token-index sequences; the first dimension is treated
                as the batch size (the LSTM is built with batch_first=True).
            hidden: ``(h, c)`` state tuple as produced by ``init_hidden``.

        Returns:
            Tuple ``(sig_out, hidden)``. ``sig_out`` has one value per
            sequence: the last sigmoid output of that sequence's flattened
            per-step outputs (with ``output_dim == 1`` this is the score at
            the final time step). ``hidden`` is the updated LSTM state.
        """
        batch_size = x.size(0)
        embeds = self.embedding(x)

        lstm_out, hidden = self.lstm(embeds, hidden)
        # Flatten to (batch * steps, hidden_dim) so the linear layer scores every step.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)

        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)

        # keep only the last value of each row
        sig_out = sig_out[:, -1]

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size, device):
        """Return zero-initialised ``(h0, c0)`` LSTM state on ``device``.

        Both tensors have shape ``(no_layers, batch_size, hidden_dim)``.
        """
        state_shape = (self.no_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(state_shape).to(device)
        c0 = torch.zeros(state_shape).to(device)
        hidden = (h0, c0)
        return hidden


In [None]:
# Create data loaders for training & validation sets
# Imported in this cell because no visible cell brings these names into scope.
from torch.utils.data import DataLoader, TensorDataset

# NOTE(review): x_train_pad, y_train, x_test_pad and y_test are not defined in
# any visible cell; they are expected to be NumPy arrays of padded token
# indices and labels -- confirm where they are produced.
train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))
batch_size = 50

# data shuffling
# drop_last=True for training: the LSTM hidden state is created for exactly
# `batch_size` sequences and carried from batch to batch, so a smaller final
# batch would not match its shape.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

In [None]:
# == sample params ==
# Maximum gradient norm used when clipping gradients during training.
clipping_quotient = 5
num_epochs = 50
# Built-in infinity: needs no NumPy import, and the `np.Inf` alias no longer
# exists in NumPy 2.0.
valid_loss_min = float('inf')
# Constructor arguments for the LSTM classifier.
layers_count = 2
embedding_dimensions = 500
output_dimensions = 1
hidden_dimensions = 256

In [None]:
# Classes = 2 ['positive', 'negative']
# Imported in this cell because no visible cell imports NumPy.
import numpy as np

learning_rate = 0.001
loss_func = nn.BCELoss()
model = LSTMSentimentClassifier(layers_count, output_dimensions, hidden_dimensions, embedding_dimensions)
# The hidden state and the batches live on `device`, so the weights must too.
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Mean loss per epoch, for later inspection/plotting.
epoch_tr_loss = list()
epoch_vl_loss = list()

for epoch in range(num_epochs):
    tr_losses = []
    print("Epoch" + str(epoch + 1))
    model.train()

    h = model.init_hidden(batch_size, device)

    # training data
    for inputs, labels in train_loader:
        # Always move the batch: comparing a torch.device with the string
        # 'cuda' is never true, so a guarded move would silently be skipped.
        inputs, labels = inputs.to(device), labels.to(device)
        # A batch of a different size needs a freshly sized hidden state.
        if inputs.size(0) != h[0].size(1):
            h = model.init_hidden(inputs.size(0), device)
        # Detach so backpropagation stops at the batch boundary.
        h = tuple(each.detach() for each in h)
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        output, h = model(inputs, h)
        loss = loss_func(output.view(-1), labels.float().view(-1))
        loss.backward()
        tr_losses.append(loss.item())
        nn.utils.clip_grad_norm_(model.parameters(), clipping_quotient)
        optimizer.step()

    # validation data
    vl_losses = []
    model.eval()
    with torch.no_grad():  # no gradients are needed for evaluation
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # Each validation batch starts from a zero state of matching size.
            v_h = model.init_hidden(inputs.size(0), device)
            output, v_h = model(inputs, v_h)
            val_loss = loss_func(output.view(-1), labels.float().view(-1))
            vl_losses.append(val_loss.item())

    # Guard against an empty loader so the epoch summary never raises.
    mean_tr_loss = float(np.mean(tr_losses)) if tr_losses else float('nan')
    mean_vl_loss = float(np.mean(vl_losses)) if vl_losses else float('nan')
    epoch_tr_loss.append(mean_tr_loss)
    epoch_vl_loss.append(mean_vl_loss)
    print(f'train_loss : {mean_tr_loss} val_loss : {mean_vl_loss}')

# Persist the whole trained model object under the output folder.
torch.save(model, OUTPUT_FOLDER +'/' +'lstm_500_with_padding.pt')