In [6]:
import csv
import tensorflow as tf
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

print(tf.__version__)

2.4.1


In [2]:
## Hyper parameters

vocab_size = 5000
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

In [3]:
articles = []
labels = []

with open("cleaned.csv", 'r',encoding='utf8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[0])
        article = row[1]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)
print(len(labels))
print(len(articles))

167684
167684


In [4]:
def preprocess_text(sen):
    
    # Removing html tags
    sentence = str(remove_tags(sen))
    # Remove punctuations and numbers

    sentence = str(re.sub('[^a-zA-Z]', ' ', sentence))
    

    # Single character removal
    sentence = str(re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence))

    # Removing multiple spaces
    sentence = str(re.sub(r'\s+', ' ', sentence))

    sentence = str(sentence.lower())


    return sentence

In [7]:
TAG_RE = re.compile(r'<[^>]+>') ##

def remove_tags(text):
    return TAG_RE.sub('', text)

In [10]:
X_articles = []
for sen in articles:
    X_articles.append(preprocess_text(sen))

In [11]:
Y_labels = []
for sen in labels:
    Y_labels.append(preprocess_text(sen))

In [13]:
X_articles[1192]

'great service amazing support '

In [12]:

Y_labels[11192]

'good'

In [14]:
train_size = int(len(X_articles) * training_portion)

train_articles = X_articles[0: train_size]
train_labels = Y_labels[0: train_size]

validation_articles = X_articles[train_size:]
validation_labels = Y_labels[train_size:]

print(train_size)
print(len(train_articles))
print(len(train_labels))
print(len(validation_articles))
print(len(validation_labels))

134147
134147
134147
33537
33537


In [5]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'i': 2,
 'service': 3,
 'call': 4,
 'account': 5,
 'the': 6,
 'get': 7,
 'told': 8,
 'back': 9,
 'dstv': 10}

In [6]:
train_sequences = tokenizer.texts_to_sequences(train_articles)
print(train_sequences[10])

[22, 126, 1, 721, 156, 694, 116, 1410, 246, 3062]


In [7]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))

10
200
114
200
10
200


In [8]:
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

33537
(33537, 200)


In [9]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))
print(training_label_seq[0])
print(training_label_seq[1])
print(training_label_seq[2])
print(training_label_seq.shape)

print(validation_label_seq[0])
print(validation_label_seq[1])
print(validation_label_seq[2])
print(validation_label_seq.shape)

[3]
[2]
[3]
(134147, 1)
[4]
[4]
[2]
(33537, 1)


In [18]:
model = tf.keras.Sequential([
    # Add an Embedding layer expecting input vocab of size 5000, and output embedding dimension of size 64 we set at the top
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.LSTM(128),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Add a Dense layer with 6 units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(6, activation='softmax')
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          320000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 390       
Total params: 427,462
Trainable params: 427,462
Non-trainable params: 0
_________________________________________________________________


In [28]:
print(set(labels))

{'neutral', 'worst', 'great', 'good', 'bad'}


In [17]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 10
history = model.fit(train_padded, training_label_seq, batch_size=128, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Epoch 1/10
1049/1049 - 479s - loss: 0.2625 - accuracy: 0.9047 - val_loss: 2.6471 - val_accuracy: 0.3460
Epoch 2/10
1049/1049 - 417s - loss: 0.2352 - accuracy: 0.9169 - val_loss: 2.8126 - val_accuracy: 0.3518
Epoch 3/10
1049/1049 - 451s - loss: 0.2170 - accuracy: 0.9254 - val_loss: 2.8001 - val_accuracy: 0.3719
Epoch 4/10
1049/1049 - 401s - loss: 0.2001 - accuracy: 0.9328 - val_loss: 3.2743 - val_accuracy: 0.3367
Epoch 5/10
1049/1049 - 488s - loss: 0.1863 - accuracy: 0.9386 - val_loss: 3.1098 - val_accuracy: 0.3689
Epoch 6/10
1049/1049 - 542s - loss: 0.1747 - accuracy: 0.9439 - val_loss: 3.2709 - val_accuracy: 0.3456
Epoch 7/10
1049/1049 - 396s - loss: 0.1646 - accuracy: 0.9480 - val_loss: 3.2616 - val_accuracy: 0.3763
Epoch 8/10
1049/1049 - 418s - loss: 0.1566 - accuracy: 0.9511 - val_loss: 3.3759 - val_accuracy: 0.3725
Epoch 9/10
1049/1049 - 462s - loss: 0.1467 - accuracy: 0.9545 - val_loss: 3.6449 - val_accuracy: 0.3586
Epoch 10/10
1049/1049 - 552s - loss: 0.1419 - accuracy: 0.9562 -

In [16]:
txt = [""]

seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq,max_length)
pred = model.predict(padded)
rating = ['neutral', 'worst', 'great', 'good', 'bad']

print(pred)
print(np.argmax(pred))
print(rating[np.argmax(pred)-1])

[[5.2056485e-08 6.6332418e-01 1.9508837e-01 8.4212855e-02 1.2742748e-02
  4.4631843e-02]]
1
neutral
