## Import libraries

This section imports every library used in the rest of the notebook.

In [1]:
#@title Libraries
import pandas as pd
from pandas import DataFrame
import numpy as np
import dateutil
from dateutil.parser import parse
import datetime
from datetime import date, timedelta


#for text pre-processing
import re, string, os, nltk
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem.snowball import SpanishStemmer

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (classification_report,
                             f1_score,
                             accuracy_score,
                             confusion_matrix)
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

!pip install datasets --quiet

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pierredelice/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pierredelice/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pierredelice/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pierredelice/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Helper functions
These helper functions implement the text-cleaning steps (impurity score, normalisation, stopword removal, lemmatisation) used by the rest of the notebook.

In [5]:
#@title Functions
#convert to lowercase, strip and remove punctuations

# Characters counted as "suspicious": & # < > { } [ ] and the backslash.
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

def impurity(text, min_len=10):
  """Return the share of suspicious characters in a text.

  Args:
    text: String to score. ``None`` and any other non-string value (for
      example the float NaN pandas uses for a missing cell) score 0.
    min_len: Texts shorter than this many characters score 0.

  Returns:
    0 for missing, empty or too-short input; otherwise the number of
    ``RE_SUSPICIOUS`` matches divided by ``len(text)``.
  """
  # max(..., 1) also rejects the empty string when min_len <= 0, which would
  # otherwise divide by zero below.
  if not isinstance(text, str) or len(text) < max(min_len, 1):
    return 0
  return len(RE_SUSPICIOUS.findall(text)) / len(text)


def preprocess(text):
    """Normalise raw text into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase and trim;
      2. drop ``<...>`` markup and ``[digits]`` markers;
      3. delete every remaining character that is neither a word character
         nor whitespace;
      4. turn underscores (the one ``string.punctuation`` character that
         survives step 3) and digits into spaces;
      5. drop tokens of one or two characters and re-join with single spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty if no token survives.
    """
    text = text.lower().strip()
    # Markup and bracket markers have to be removed BEFORE the symbol
    # stripping: once '<', '>', '[' and ']' are deleted these patterns can no
    # longer match and tag names get glued onto the neighbouring words.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\d', ' ', text)
    # split() collapses any run of whitespace, so no separate \s+ pass is needed.
    return ' '.join(w for w in text.split() if len(w) > 2)

# STOPWORD REMOVAL
# Built once at cell execution: calling stopwords.words(...) inside the filter
# rebuilds the combined English + Spanish list for every single token, and a
# list lookup is linear where a set lookup is constant time.
_STOPWORDS = frozenset(stopwords.words(['english', 'spanish']))

def stopword(string):
    """Remove English and Spanish stopwords from a whitespace-separated text.

    Args:
        string: Text whose tokens are separated by whitespace.

    Returns:
        The remaining tokens joined by single spaces.
    """
    return ' '.join(tok for tok in string.split() if tok not in _STOPWORDS)


#LEMMATIZATION
# Module-level lemmatizer instance shared by `lemmatizer()` below.
# Disabled alternative kept for reference (Spanish Snowball stemmer):
#from nltk.stem import SnowballStemmer
#wl = SnowballStemmer('spanish')
wl = WordNetLemmatizer()

# Helper that maps NLTK part-of-speech tags onto WordNet POS constants.
def get_wordnet_pos(tag):
    """Translate a POS tag into the WordNet constant used for lemmatising.

    Only the first character of ``tag`` matters: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Every other tag
    (including an empty one) is treated as a noun.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_initial.get(tag[:1], wordnet.NOUN)

# Tokenize, POS-tag and lemmatize a sentence
def lemmatizer(string):
    """Lemmatise every token of ``string`` using its part-of-speech tag.

    The text is tokenised with ``word_tokenize``, tagged with
    ``nltk.pos_tag``, and each token is passed to the shared ``wl``
    lemmatizer together with the WordNet POS derived from its tag.

    Returns:
        The lemmas joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

def finalpreprocess(string):
    """Full cleaning pipeline: normalise, drop stopwords, then lemmatise."""
    normalised = preprocess(string)
    without_stopwords = stopword(normalised)
    return lemmatizer(without_stopwords)

def process(string):
  """Lighter pipeline: normalise and drop stopwords, without lemmatising."""
  normalised = preprocess(string)
  return stopword(normalised)

## Download dataset

In [3]:
# NOTE(review): `list_datasets` and `Dataset` are not used in this cell, and
# the name `Dataset` is re-bound later by `from torch.utils.data import Dataset`.
from datasets import list_datasets
from datasets import Dataset
from datasets import load_dataset # hugging face datasets

# Downloads the dataset on first use; the log below reports that later calls
# reuse the local cache. Two splits are generated ('train' and 'balanced');
# only 'train' is read further down.
dataset = load_dataset("Numind/C4_sentiment-analysis") # Note: this is already a dataset

Downloading readme:   0%|          | 0.00/468 [00:00<?, ?B/s]

Downloading and preparing dataset csv/Numind--C4_sentiment-analysis to /Users/pierredelice/.cache/huggingface/datasets/Numind___csv/Numind--C4_sentiment-analysis-45b98c2ee260be74/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/47.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.67M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating balanced split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/pierredelice/.cache/huggingface/datasets/Numind___csv/Numind--C4_sentiment-analysis-45b98c2ee260be74/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# To DataFrame
# Builds a two-column frame ('text', 'label') from the 'train' split by
# wrapping each underlying column in its own DataFrame and concatenating them
# side by side.
# NOTE(review): the name `data` is re-bound to the `torch.utils.data` module
# by a later import cell, so this cell and the next ones cannot be re-run
# after that import without restarting the kernel.
data = pd.concat(
    [DataFrame(dataset['train'].data['text']).rename(columns={0:'text'}),
     DataFrame(dataset['train'].data['label']).rename(columns={0:'label'})],
    axis=1)

#Calculate impurity
# Scored on the raw text, before any cleaning; texts shorter than 3
# characters get 0.
data['impurity'] = data.text.apply(impurity, min_len=3)

# Optional subsample for faster experiments (disabled).
#data = data[:30000]

In [7]:
from tqdm import tqdm

# Registers `progress_apply` on pandas objects so the cleaning pass below
# reports its progress.
tqdm.pandas()
# Replaces the raw text column with its cleaned, lemmatised version.
data['text'] = data['text'].progress_apply(finalpreprocess)

100%|██████████| 254106/254106 [32:47<00:00, 129.14it/s] 


In [8]:
# Make an independent copy of the data: a shallow copy (deep=False) would
# keep sharing the column data with `data`.
data_clean = data.copy(deep=True)

In [9]:
import torch
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn
import collections

In [24]:
# Features are the cleaned texts, targets the sentiment labels.
X,y = data_clean['text'].values,data_clean['label'].values
# Stratified split; the fixed seed makes the partition identical on every
# Restart & Run All instead of changing with each execution.
x_train,x_test,y_train,y_test = train_test_split(
    X, y, stratify=y, random_state=42)

print(f'train data shape: {x_train.shape}')
print(f'test data shape: {x_test.shape}')
print("---------------------------------")
X[:5], y[:5]

train data shape: (190579,)
test data shape: (63527,)
---------------------------------


(array(['client one uk lead support service construction company employ around people annual revenue around',
        'electrically charge flipped golf cart pro mod drag strip',
        'book air india express airline flight ticket bangalore graz get unbelievable discount yatracom compare airline price get low airfare air india express airline bangalore graz',
        'provide high quality embroidery digitize vector artwork cheap rate customize official personal need charge stitch fast turnaround time',
        'part effort support sport industry alshiaka launch show game digital competition whereby participant ask bounce soccer ball head order win prize'],
       dtype=object),
 array([2, 2, 2, 0, 2]))

In [25]:
def preprocess_string(s):
    """Delete every digit from ``s`` and return the result.

    The removal of non-word characters and of whitespace runs is switched
    off here; digits are the only characters this function strips.
    """
    digits_removed = re.sub(r"\d", '', s)
    return digits_removed

def _encode_sentences(sentences, word_to_id):
    """Encode each sentence as a list of ids, skipping out-of-vocabulary words."""
    # Filled element by element: np.array(list_of_lists, dtype=object) would
    # silently produce a 2-D array whenever all sentences have equal length.
    encoded = np.empty(len(sentences), dtype=object)
    for row, sent in enumerate(sentences):
        encoded[row] = [word_to_id[w] for w in sent.lower().split() if w in word_to_id]
    return encoded


def tokenize(x_train,y_train,x_val,y_val,vocab_size=1000):
    """Build a frequency-ranked vocabulary and encode both splits with it.

    The vocabulary is learned from ``x_train`` only. Words are lowercased both
    when counting and when encoding, so the two steps always agree. Ids start
    at 1 (most frequent word); 0 is left free for padding.

    Args:
        x_train: Sequence of training sentences (whitespace-separated words).
        y_train: Training labels, passed through unchanged.
        x_val: Sequence of validation sentences.
        y_val: Validation labels, passed through unchanged.
        vocab_size: Number of most frequent words to keep (default 1000).

    Returns:
        Tuple ``(train_ids, train_labels, val_ids, val_labels, vocab)``.
        ``train_ids`` / ``val_ids`` are 1-D object arrays holding one list of
        ids per sentence; the label arrays have dtype ``object`` (cast them
        to an integer dtype before handing them to ``torch.from_numpy``);
        ``vocab`` maps word -> id.
    """
    corpus = collections.Counter(
        word for sent in x_train for word in sent.lower().split())
    # most_common keeps first-seen order among equally frequent words.
    onehot_dict = {
        word: idx + 1
        for idx, (word, _count) in enumerate(corpus.most_common(vocab_size))
    }

    return (
        _encode_sentences(x_train, onehot_dict),
        np.array(y_train, dtype=object),
        _encode_sentences(x_val, onehot_dict),
        np.array(y_val, dtype=object),
        onehot_dict,
    )


# Replaces the raw-text splits with their token-id encodings; `vocab` maps
# word -> id.
# NOTE: not idempotent. Re-running this cell passes the already encoded lists
# back into tokenize(), which expects strings.
x_train,y_train,x_test,y_test,vocab = tokenize(x_train,y_train,x_test,y_test)



In [62]:
# Number of in-vocabulary tokens kept for each training sentence.
d = [len(encoded) for encoded in list(x_train)]

In [71]:
# Report which training sentences kept exactly one in-vocabulary token.
# Iterate with enumerate: looping over the lengths and using them as indices
# into `d` inspects unrelated rows, and printing `d` dumps the whole list.
single_token_rows = [row for row, n_tokens in enumerate(d) if n_tokens == 1]
print(f'{len(single_token_rows)} sequences with a single token; first rows: {single_token_rows[:10]}')

In [40]:
def padding_(sentences, seq_len):
    """Left-pad (with zeros) or truncate id sequences to a fixed length.

    Args:
        sentences: Sequence of id lists.
        seq_len: Width of the returned matrix.

    Returns:
        Integer array of shape ``(len(sentences), seq_len)``. Short sequences
        are right-aligned behind leading zeros, long ones keep their first
        ``seq_len`` ids, empty ones stay all zero.
    """
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, token_ids in enumerate(sentences):
        if len(token_ids) == 0:
            continue
        kept = np.array(token_ids)[:seq_len]
        # A negative start beyond the row width simply selects the whole row.
        padded[row, -len(token_ids):] = kept
    return padded


# Bring every encoded sentence to a fixed width of 300 ids.
x_train_pad = padding_(x_train, seq_len=300)
x_test_pad = padding_(x_test, seq_len=300)

In [41]:
# Inspect both padded id matrices and the last padded test row; the zeros on
# the left are padding.
x_train_pad, x_test_pad, x_test_pad[-1]

(array([[  0,   0,   0, ..., 551, 472, 571],
        [  0,   0,   0, ..., 313, 106, 172],
        [  0,   0,   0, ..., 428, 442, 424],
        ...,
        [  0,   0,   0, ..., 256, 497, 123],
        [  0,   0,   0, ..., 144,  18,   9],
        [  0,   0,   0, ..., 148, 257, 945]]),
 array([[  0,   0,   0, ..., 303, 349, 162],
        [  0,   0,   0, ..., 250, 471, 281],
        [  0,   0,   0, ..., 181, 848, 477],
        ...,
        [  0,   0,   0, ..., 784,  10,   3],
        [  0,   0,   0, ..., 140, 378, 101],
        [  0,   0,   0, ...,   0,   0, 460]]),
 array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [42]:
# Cleaned, not yet tokenized texts taken from `data_clean` before the split.
X

array(['client one uk lead support service construction company employ around people annual revenue around',
       'electrically charge flipped golf cart pro mod drag strip',
       'book air india express airline flight ticket bangalore graz get unbelievable discount yatracom compare airline price get low airfare air india express airline bangalore graz',
       ...,
       'quality nice rock collection durable print piece make last packaged grab easygrip handle carry case make usa quality product full money back guarantee',
       'customer demand precise information clothes want know produce numerous scandal work condition fashion industry lead rise collective awareness issue brand choice adapt new expectation',
       'bizjet connectivity big business'], dtype=object)

In [None]:
# create Tensor datasets
# tokenize() returns the labels as an object-dtype array, which
# torch.from_numpy cannot convert; cast them to int64 first.
train_data = TensorDataset(
    torch.from_numpy(x_train_pad),
    torch.from_numpy(np.asarray(y_train, dtype=np.int64)))
valid_data = TensorDataset(
    torch.from_numpy(x_test_pad),
    torch.from_numpy(np.asarray(y_test, dtype=np.int64)))


# dataloaders
batch_size = 200


# make sure to SHUFFLE your data
# drop_last=True keeps every batch at exactly `batch_size` rows, which the
# fixed-size hidden state created by init_hidden(batch_size) relies on.
train_loader = DataLoader(
    train_data, shuffle=True,
    batch_size=batch_size,
    drop_last=True)
valid_loader = DataLoader(
    valid_data,
    shuffle=True,
    batch_size=batch_size,
    drop_last=True)


# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)


print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample output: \n', sample_y)

In [None]:
class SentimentRNN(nn.Module):
    """LSTM text classifier: embedding -> stacked LSTM -> dropout -> linear head.

    Args:
        no_layers: Number of stacked LSTM layers.
        vocab_size: Number of rows in the embedding table.
        hidden_dim: Size of the LSTM hidden state.
        embedding_dim: Size of each token embedding.
        drop_prob: Dropout probability applied before the linear head.
        output_dim: Number of output units. With the default of 1 the model
            returns one sigmoid probability per sample; with more units it
            returns the raw per-class scores of the linear head.
    """
    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5,output_dim=1):
        super(SentimentRNN,self).__init__()

        # Taken from the argument instead of a notebook-level global, so the
        # class no longer depends on a variable defined in a later cell.
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.no_layers = no_layers
        self.vocab_size = vocab_size

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #lstm
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            num_layers=no_layers,
            hidden_size=self.hidden_dim,
            batch_first=True)

        # dropout layer: honours drop_prob instead of a hard-coded 0.3
        self.dropout = nn.Dropout(drop_prob)

        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        """Run a batch of id sequences through the network.

        Args:
            x: Integer tensor of token ids, batch dimension first.
            hidden: ``(h0, c0)`` tuple as returned by ``init_hidden``.

        Returns:
            ``(output, hidden)``. ``output`` has shape ``(batch,)`` holding
            sigmoid probabilities when ``output_dim == 1``, otherwise shape
            ``(batch, output_dim)`` holding raw scores. ``hidden`` is the
            updated LSTM state.
        """
        # embeddings and lstm_out
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step feeds the prediction, so select it before
        # dropout and the linear head instead of processing every step.
        last_step = lstm_out[:, -1, :]
        out = self.fc(self.dropout(last_step))

        if self.output_dim == 1:
            # single-unit head: probability per sample, shape (batch,)
            return self.sig(out).squeeze(1), hidden
        return out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM.
        # They are placed on the device the model's parameters live on rather
        # than on a notebook-level `device` global.
        model_device = next(self.parameters()).device
        state_shape = (self.no_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(state_shape, device=model_device)
        c0 = torch.zeros(state_shape, device=model_device)
        hidden = (h0,c0)
        return hidden

In [None]:
no_layers = 2
vocab_size = len(vocab) + 1 #extra 1 for padding
embedding_dim = 64
# NOTE(review): the label sample printed after the split contains the values
# 0 and 2, so there are more than two classes; a single output unit cannot
# tell them all apart. Confirm the intended number of classes.
output_dim = 1
hidden_dim = 225 #630//2
# Use the first GPU when there is one, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model = SentimentRNN(
    no_layers,
    vocab_size,
    hidden_dim,
    embedding_dim,
    drop_prob=0.5)


#moving to the selected device
model.to(device)
print(model)

In [None]:
# loss and optimization functions


# Learning rate for Adam.
lr=0.001


# NOTE(review): the training loop passes this criterion one value per sample
# together with float labels. CrossEntropyLoss is normally fed one score per
# class and integer class targets; confirm that loss and model head match.
criterion = nn.CrossEntropyLoss()


optimizer = torch.optim.Adam(model.parameters(), lr=lr)


# function to count correct predictions
def acc(pred,label):
    """Return how many predictions in a batch match their labels.

    Args:
        pred: Either one score per sample (shape ``(batch,)`` or
            ``(batch, 1)``), which is rounded to the nearest integer, or a
            ``(batch, n_classes)`` matrix with more than one column, in
            which case the column with the highest score is the prediction.
        label: Tensor of true labels, one per sample.

    Returns:
        The number of matching predictions as a Python int.
    """
    if pred.dim() == 2 and pred.size(1) > 1:
        predicted = pred.argmax(dim=1)
    else:
        predicted = torch.round(pred.squeeze())
    return torch.sum(predicted == label.squeeze()).item()


In [None]:
clip = 5  # maximum gradient norm allowed by clip_grad_norm_
epochs = 5
valid_loss_min = np.inf  # the `np.Inf` alias no longer exists in NumPy 2.0
# per-epoch history
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]


def _batch_loss(output, labels):
    """Compute the criterion with targets that fit the shape of `output`."""
    # A 2-D output holds one score per class and needs integer class targets;
    # one value per sample keeps the float targets used so far.
    if output.dim() > 1:
        return criterion(output, labels.long())
    return criterion(output.squeeze(), labels.float())


for epoch in range(epochs):
    train_losses = []
    train_correct, train_seen = 0, 0
    model.train()
    # initialize hidden state
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:

        inputs, labels = inputs.to(device), labels.to(device)
        # Detach the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple(each.detach() for each in h)

        model.zero_grad()
        output,h = model(inputs,h)

        # calculate the loss and perform backprop
        loss = _batch_loss(output, labels)
        loss.backward()
        train_losses.append(loss.item())
        # calculating accuracy
        train_correct += acc(output,labels)
        train_seen += labels.size(0)
        #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    val_h = model.init_hidden(batch_size)
    val_losses = []
    val_correct, val_seen = 0, 0
    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for inputs, labels in valid_loader:
            val_h = tuple(each.detach() for each in val_h)

            inputs, labels = inputs.to(device), labels.to(device)

            output, val_h = model(inputs, val_h)
            val_loss = _batch_loss(output, labels)

            val_losses.append(val_loss.item())

            val_correct += acc(output,labels)
            val_seen += labels.size(0)

    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    # Divide by the samples actually processed: with drop_last=True the
    # loaders skip the final partial batch, so dividing by the dataset size
    # would understate the accuracy.
    epoch_train_acc = train_correct/max(train_seen, 1)
    epoch_val_acc = val_correct/max(val_seen, 1)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    # keep the weights of the best epoch so far
    if epoch_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), 'state_dict.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,epoch_val_loss))
        valid_loss_min = epoch_val_loss
    print(25*'==')