In [24]:
import pandas as pd 
import numpy as np 
import gensim 
import tensorflow as tf
import re 

### Data preprocessing

We read the input IITB corpus into a list of sentences 

In [25]:
from datasets import load_dataset
corpus_data = load_dataset('cfilt/iitb-english-hindi')
translation_data = corpus_data["train"]["translation"]

Using custom data configuration cfilt--iitb-english-hindi-930ee63dc3ad2bff
Reusing dataset parquet (C:\Users\parth\.cache\huggingface\datasets\parquet\cfilt--iitb-english-hindi-930ee63dc3ad2bff\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)
100%|██████████| 3/3 [00:00<00:00, 11.60it/s]


In [26]:
data = [list_element['en'] for list_element in translation_data]
iitb_data = data

In [27]:
train_data = pd.read_csv('training_data.csv')
X_train = train_data.loc[:, 'text']
y_train = train_data.loc[:, 'airline_sentiment']
split_index = int(len(X_train) * 0.8)
X_test = X_train[split_index:]
y_test = y_train[split_index:]
X_train = X_train[:split_index]
y_train = y_train[:split_index]

Now we remove any unwanted details in the dataset. These details include URLs, Emails, New line characters etc. 

In [28]:
def depure_data(data):
    
    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub(r'', data)

    # Remove Emails
    data = re.sub('\S*@\S*\s?', '', data)

    # Remove new line characters
    data = re.sub('\s+', ' ', data)

    # Remove distracting single quotes
    data = re.sub("\'", "", data)
        
    return data

Moving on, we remove the longest and the shortest words in the dataset since they are mostly useless<br/>
For this, we use gensim library's simple_preprocess function which ignores tokens shorter than 2 and longer than 15 letter in length

In [29]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

Detokenization of all sentences is needed again because we will feed a proper sentence with context to the model as the input for proper sentiment analysis

In [30]:
from nltk.tokenize import TreebankWordDetokenizer

def detokenize(text):
    return TreebankWordDetokenizer().detokenize(text)

In [31]:
from nltk.stem import PorterStemmer 

stemmer = PorterStemmer()

def stem(words):
    stemmed_words = []
    for word in words: 
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words

In [35]:
# tokenize the sentences 
temp = []
for sentence in data:
    temp.append(depure_data(sentence))
tokenized_sentences = list(sent_to_words(temp))

# stem the tokens 
temp = []
for tokenized_sentence in tokenized_sentences:
    temp.append(stem(tokenized_sentence))
tokenized_sentences = temp

# detokenize the stemmed tokens to form sentences with context 
# context is needed for further processing 
d = []
for data_word in tokenized_sentences: 
    d.append(detokenize(data_word))
print(d[:5])

data = d

['give your applic an access workout', 'accercis access explor', 'the default plugin layout for the bottom panel', 'the default plugin layout for the top panel', 'list of plugin that are disabl by default']


In [36]:
word_set = set()
for sentence in data: 
    sent_set = set(sentence.split())
    word_set.update(sent_set)
print(len(word_set))

151588


Now that we have preprocessed the data, we can move onto converting them into vectors 

In [37]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 200000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
data = pad_sequences(sequences, maxlen=max_len)
print(data.shape)

(1659083, 200)


In [39]:
temp = []
for sentence in X_train:
    temp.append(depure_data(sentence))
tokenized_sentences = list(sent_to_words(temp))
temp = []
for tokenized_sentence in tokenized_sentences:
    temp.append(stem(tokenized_sentence))
tokenized_sentences = temp
d = []
for data_word in tokenized_sentences: 
    d.append(detokenize(data_word))
X_train = d

temp = []
for sentence in X_test:
    temp.append(depure_data(sentence))
tokenized_sentences = list(sent_to_words(temp))
temp = []
for tokenized_sentence in tokenized_sentences:
    temp.append(stem(tokenized_sentence))
tokenized_sentences = temp
d = []
for data_word in tokenized_sentences: 
    d.append(detokenize(data_word))
X_test = d

With the corpus cleansed, we now proceed to the word embedding part of the preprocessing

In [40]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(sequences, maxlen=max_len)


tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_test)
sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(sequences, maxlen=max_len)

We convert the training target label from categorical to numerical and encode them using one hot encoding

In [41]:
temp = []
for sentiment in y_train: 
    if sentiment == 'neutral':
        temp.append(0)
    if sentiment == 'negative':
        temp.append(1)
    if sentiment == 'positive':
        temp.append(2)
y_train = np.array(temp)


temp = []
for sentiment in y_test: 
    if sentiment == 'neutral':
        temp.append(0)
    if sentiment == 'negative':
        temp.append(1)
    if sentiment == 'positive':
        temp.append(2)
y_test = np.array(temp)

y_train = tf.keras.utils.to_categorical(y_train, 3, dtype="float32")
y_test = tf.keras.utils.to_categorical(y_test, 3, dtype="float32")

In [42]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(11712, 200)
(11712, 3)
(2928, 200)
(2928, 3)


In [43]:
from keras import layers
from keras.callbacks import ModelCheckpoint
from keras import Sequential

model = Sequential()
model.add(layers.Embedding(max_words, 20, name="embedding_1")) 
model.add(layers.LSTM(units=15, dropout=0.5, name="lstm_2")) 
model.add(layers.Dense(3, activation='softmax', name="dense_3"))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

print(model.summary())

checkpoint = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='auto', save_weights_only=False)
history = model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test),callbacks=[checkpoint])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 20)          4000000   
                                                                 
 lstm_2 (LSTM)               (None, 15)                2160      
                                                                 
 dense_3 (Dense)             (None, 3)                 48        
                                                                 
Total params: 4,002,208
Trainable params: 4,002,208
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.69092, saving model to best_model1.hdf5
Epoch 2/100
Epoch 2: val_accuracy did not improve from 0.69092
Epoch 3/100
Epoch 3: val_accuracy did not improve from 0.69092
Epoch 4/100
Epoch 4: val_accuracy did not improve from 0.69092
Epoch 5/100
Epoc

In [44]:
from keras.models import load_model

best_model = load_model('best_model1.hdf5')
predictions = best_model.predict(data)

In [45]:
labels = ['neutral', 'positive', 'negative']
pred_labels = []

for prediction in predictions: 
    pred_labels.append(labels[prediction.argmax()])

In [46]:
for i in range(30):
    print("sentence: ", iitb_data[i])
    print("sentiment: ", pred_labels[i])
    print("="*10)

sentence:  Give your application an accessibility workout
sentiment:  positive
sentence:  Accerciser Accessibility Explorer
sentiment:  negative
sentence:  The default plugin layout for the bottom panel
sentiment:  negative
sentence:  The default plugin layout for the top panel
sentiment:  neutral
sentence:  A list of plugins that are disabled by default
sentiment:  positive
sentence:  Highlight duration
sentiment:  negative
sentence:  The duration of the highlight box when selecting accessible nodes
sentiment:  neutral
sentence:  Highlight border color
sentiment:  neutral
sentence:  The color and opacity of the highlight border.
sentiment:  negative
sentence:  Highlight fill color
sentiment:  neutral
sentence:  The color and opacity of the highlight fill.
sentiment:  negative
sentence:  API Browser
sentiment:  negative
sentence:  Browse the various methods of the current accessible
sentiment:  negative
sentence:  Hide private attributes
sentiment:  negative
sentence:  Method
sentiment