In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Load the labelled sentences; later cells read the 'Sentence' and
# 'Sentiment' columns.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path —
# confirm before running in another environment.
df = pd.read_csv(filepath_or_buffer='/content/data.csv')
df.head(n=5)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [None]:
# Randomly reorder every row (frac=1 samples the whole frame); the fixed
# seed makes the permutation repeatable.
df_shuffled = df.sample(random_state=42, frac=1)
df_shuffled.head(n=5)

Unnamed: 0,Sentence,Sentiment
4584,A few employees would remain at the Oulu plant...,neutral
177,Comparable net sales are expected to increase ...,positive
167,"Tesla is recalling 2,700 Model X cars: https:/...",negative
5585,Finnish software developer Done Solutions Oyj ...,positive
2339,Compagnie de Financement Foncier - Is to issue...,neutral


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the shuffled rows for validation. Sentences and labels are
# split together so they stay aligned; the fixed random_state keeps the
# split repeatable.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    df_shuffled['Sentence'].to_numpy(),
    df_shuffled['Sentiment'].to_numpy(),
    random_state=42,
    test_size=0.1,
)

In [None]:
# Binary targets: 1 where the label is 'positive', 0 for every other label.
# NOTE(review): the data preview also contains 'neutral' and 'negative';
# both end up in class 0 here — confirm that merging them is intended.
train_labels_numerical = (train_labels == 'positive').astype(int)
val_labels_numerical = (val_labels == 'positive').astype(int)

In [None]:
# Mean number of whitespace-separated tokens per training sentence,
# rounded to the nearest integer.
round(sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences))

21

In [None]:
from tensorflow.keras.layers import TextVectorization

# Turns raw strings into fixed-length sequences of integer token ids.
# `pad_to_max_tokens` is intentionally not passed: Keras documents it as valid
# only for the 'multi_hot', 'count' and 'tf_idf' output modes, so it has no
# effect with output_mode='int'.
text_vectorizer = TextVectorization(max_tokens=10000,  # upper bound on vocabulary size
                                    standardize='lower_and_strip_punctuation',
                                    split='whitespace',
                                    ngrams=None,
                                    output_mode='int',
                                    # pad/truncate every sentence to the rounded
                                    # mean token count of the training sentences
                                    output_sequence_length=21)


In [None]:
# Build the vectorizer's vocabulary from the training sentences.
text_vectorizer.adapt(data=train_sentences)

### Creating an Embedding Layer

To make our embedding, we're going to use TensorFlow's embedding layer (`tf.keras.layers.Embedding`).

The parameters we care most about for our embedding layer:

* `input_dim` = the size of our vocabulary

* `output_dim` = the size of the output embedding vector

* `input_length` = length of sequences being passed to the embedding layer



In [None]:
from tensorflow.keras import layers

# Trainable lookup table that maps each integer token id to a dense vector.
# `input_length` is not passed: Keras 3 deprecates the argument (it only
# triggers a warning) and the sequence length is taken from the input tensor.
embedding = layers.Embedding(input_dim=10000,   # vocabulary size; same value as the vectorizer's max_tokens
                             output_dim=256,    # length of each embedding vector
                             embeddings_initializer='uniform')



In [None]:
import random
# Pick one training sentence at random and show its embedded form.
random_sentence = random.choice(train_sentences)
# Single-line f-string: a backslash continuation inside the literal would
# carry the next line's indentation into the printed text as stray spaces.
print(f"Original text:\n{random_sentence}\n\nEmbedded version:")

# Vectorize (string -> token ids), then embed (ids -> dense vectors).
sample_embed = embedding(text_vectorizer([random_sentence]))
print(sample_embed)

Original text:
In Finland , the city of Forssa has said it will not pay compensation to food industry companies HK Ruokatalo and Atria for the lye leak into tap water that occurred in March 2008 .      

Embedded version:
tf.Tensor(
[[[-0.0018602   0.01311136 -0.00897826 ...  0.03444784 -0.04165038
   -0.00309082]
  [-0.01605984 -0.03618829 -0.04548918 ...  0.00471977  0.02546114
    0.02999422]
  [-0.00491785  0.02743859  0.00311438 ...  0.01849278 -0.01405418
   -0.01944168]
  ...
  [-0.00337516  0.0351171   0.04791019 ... -0.02901446  0.02557078
    0.00139468]
  [-0.04294826  0.04457084  0.03026109 ...  0.01598178 -0.02568207
   -0.00189756]
  [ 0.03799428  0.00686067  0.04671681 ... -0.01746447 -0.00741916
    0.0153729 ]]], shape=(1, 21, 256), dtype=float32)


## Modelling

* Model 0 : Naive Bayes ( baseline )
* Model 1 : Feed-Forward Neural Network
* Model 2 : LSTM Model
* Model 3 : GRU Model
* Model 4 : Bi-directional LSTM
* Model 5 : 1D Convolution Neural Network
* Model 6 : TensorFlow Hub Pretrained Feature Extractor (using transfer learning for NLP)

* Model 7 : Same as model 6 but with 10% of training data

># MODEL 0

In [None]:
# Model 0

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier.
model_0 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Train on the training split; as the last expression, the fitted pipeline
# is also shown as the cell output.
model_0.fit(X=train_sentences, y=train_labels_numerical)

In [None]:
# Score the baseline on the validation split and report it as a percentage.
baseline_score = model_0.score(X=val_sentences, y=val_labels_numerical)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 69.91%


In [None]:
# Predicted class for every validation sentence; preview the first 20.
baseline_preds = model_0.predict(X=val_sentences)
baseline_preds[:20]

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """Summarise classification quality as a dictionary of four metrics.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, aligned element-wise with `y_true`.

  Returns:
    Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    Accuracy is multiplied by 100 (a percentage); precision, recall and F1
    are the 'weighted' averages from scikit-learn and are left unscaled.
  """
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # Fourth element of the returned tuple (support) is not needed here.
  precision, recall, f1, _ = precision_recall_fscore_support(
      y_true, y_pred, average='weighted')
  return {
      'accuracy': accuracy_pct,
      'precision': precision,
      'recall': recall,
      'f1': f1,
  }

In [None]:
# Metrics of the Naive Bayes baseline on the validation split.
baseline_results = calculate_results(val_labels_numerical, baseline_preds)
baseline_results


{'accuracy': 69.91452991452991,
 'precision': 0.7171756420375205,
 'recall': 0.6991452991452991,
 'f1': 0.6266004962779156}

> # MODEL 1

In [None]:
from tensorflow.keras import layers
# Model 1: averaged token embeddings -> dropout -> sigmoid output.
#
# The model gets its own embedding layer instead of the shared `embedding`
# object: one layer instance wired into several models shares its weights
# between them, so training any other model would silently change this one.
# Dimensions are copied from `embedding` so the configuration stays in one place.
model_1_embedding = layers.Embedding(input_dim=embedding.input_dim,
                                     output_dim=embedding.output_dim,
                                     embeddings_initializer='uniform',
                                     name='model_1_embedding')

# Named `inputs` (not `input`) so the builtin input() is not shadowed.
inputs = layers.Input(shape=(1,), dtype='string')   # one raw string per example
x = text_vectorizer(inputs)                         # string -> integer token ids
x = model_1_embedding(x)                            # ids -> dense vectors
x = layers.GlobalAveragePooling1D()(x)              # average over the token axis
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)  # score for label 1 ('positive')
model_1 = tf.keras.Model(inputs, outputs, name='model_1_dense')

In [None]:
# Print a summary of model_1's architecture.
model_1.summary()

In [None]:
# Configure training: Adam with default settings, binary cross-entropy for
# the single sigmoid output, accuracy as the reported metric.
model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs, evaluating on the validation split after each epoch.
model_1_history = model_1.fit(
    x=train_sentences,
    y=train_labels_numerical,
    epochs=5,
    validation_data=(val_sentences, val_labels_numerical),
)

Epoch 1/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 44ms/step - accuracy: 0.6673 - loss: 0.6383 - val_accuracy: 0.6923 - val_loss: 0.5892
Epoch 2/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.7445 - loss: 0.5173 - val_accuracy: 0.7607 - val_loss: 0.5200
Epoch 3/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - accuracy: 0.8313 - loss: 0.4222 - val_accuracy: 0.8034 - val_loss: 0.4687
Epoch 4/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.8878 - loss: 0.3185 - val_accuracy: 0.8376 - val_loss: 0.4183
Epoch 5/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step - accuracy: 0.9225 - loss: 0.2361 - val_accuracy: 0.8479 - val_loss: 0.4009


In [None]:
# Loss and accuracy of model_1 on the validation split.
model_1.evaluate(x=val_sentences, y=val_labels_numerical)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8650 - loss: 0.3542 


[0.4008576273918152, 0.8478632569313049]

In [None]:
# Sigmoid outputs for the validation sentences, rounded to hard 0/1 labels
# and squeezed to drop size-1 dimensions; preview the first 20.
model_1_pred_probs = model_1.predict(x=val_sentences)
model_1_preds = tf.squeeze(tf.math.round(model_1_pred_probs))
model_1_preds[:20]

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [None]:
# Metrics of model_1 on the validation split.
model_1_results = calculate_results(val_labels_numerical, model_1_preds)
model_1_results

{'accuracy': 84.78632478632478,
 'precision': 0.8460941055985435,
 'recall': 0.8478632478632478,
 'f1': 0.8445995719781693}

> # MODEL 2

In [None]:
# LSTM = long short term memory



In [None]:
# Create  an LSTM Model

from tensorflow.keras import layers
# Model 2: token embeddings -> LSTM -> dense -> sigmoid output.
#
# The model gets its own embedding layer instead of the shared `embedding`
# object: one layer instance wired into several models shares its weights
# between them, so this model would otherwise start from (and keep modifying)
# embeddings already trained by another model.
# Dimensions are copied from `embedding` so the configuration stays in one place.
model_2_embedding = layers.Embedding(input_dim=embedding.input_dim,
                                     output_dim=embedding.output_dim,
                                     embeddings_initializer='uniform',
                                     name='model_2_embedding')

# Named `inputs` (not `input`) so the builtin input() is not shadowed.
inputs = layers.Input(shape=(1,), dtype='string')   # one raw string per example
x = text_vectorizer(inputs)                         # string -> integer token ids
x = model_2_embedding(x)                            # ids -> dense vectors
x = layers.LSTM(64)(x)                              # 64-unit LSTM over the token sequence
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)  # score for label 1 ('positive')
model_2 = tf.keras.Model(inputs, outputs, name='model_2_LSTM')


In [None]:
# Print a summary of model_2's architecture.
model_2.summary()


In [None]:
# Configure training: Adam with default settings, binary cross-entropy for
# the single sigmoid output, accuracy as the reported metric.
model_2.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 15 epochs, evaluating on the validation split after each epoch.
# NOTE(review): the recorded log already shows accuracy 1.0 in epoch 1, which
# suggests the weights were not fresh when this ran (cell re-run or weights
# shared with another model) — re-run from a clean kernel to confirm.
model_2_history = model_2.fit(
    x=train_sentences,
    y=train_labels_numerical,
    epochs=15,
    validation_data=(val_sentences, val_labels_numerical),
)

Epoch 1/15
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 1.0000 - loss: 2.8083e-05 - val_accuracy: 0.8205 - val_loss: 1.8422
Epoch 2/15
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 50ms/step - accuracy: 1.0000 - loss: 1.2325e-05 - val_accuracy: 0.8222 - val_loss: 1.8836
Epoch 3/15
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 34ms/step - accuracy: 1.0000 - loss: 1.0975e-05 - val_accuracy: 0.8188 - val_loss: 1.9165
Epoch 4/15
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - accuracy: 1.0000 - loss: 8.9865e-06 - val_accuracy: 0.8205 - val_loss: 1.9510
Epoch 5/15
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 38ms/step - accuracy: 1.0000 - loss: 9.0284e-06 - val_accuracy: 0.8205 - val_loss: 1.9846
Epoch 6/15
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 32ms/step - accuracy: 1.0000 - loss: 7.3833e-06 - val_accuracy: 0.8205 - val_loss: 2.012

In [None]:
# Make predictions: sigmoid outputs for the validation sentences, rounded to
# hard 0/1 labels and squeezed to drop size-1 dimensions; preview the first 20.
model_2_pred_probs = model_2.predict(x=val_sentences)
model_2_preds = tf.squeeze(tf.math.round(model_2_pred_probs))
model_2_preds[:20]

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [None]:
# Metrics of model_2 on the validation split.
model_2_results = calculate_results(val_labels_numerical, model_2_preds)
model_2_results


{'accuracy': 81.88034188034187,
 'precision': 0.8157137483543351,
 'recall': 0.8188034188034188,
 'f1': 0.8159780804474863}

># MODEL 3

In [None]:
# Model 3 : GRU

from tensorflow.keras import layers
# Model 3: token embeddings -> GRU -> dense -> sigmoid output.
#
# The model gets its own embedding layer instead of the shared `embedding`
# object: one layer instance wired into several models shares its weights
# between them, so this model would otherwise start from (and keep modifying)
# embeddings already trained by another model.
# Dimensions are copied from `embedding` so the configuration stays in one place.
model_3_embedding = layers.Embedding(input_dim=embedding.input_dim,
                                     output_dim=embedding.output_dim,
                                     embeddings_initializer='uniform',
                                     name='model_3_embedding')

# Named `inputs` (not `input`) so the builtin input() is not shadowed.
inputs = layers.Input(shape=(1,), dtype='string')   # one raw string per example
x = text_vectorizer(inputs)                         # string -> integer token ids
x = model_3_embedding(x)                            # ids -> dense vectors
x = layers.GRU(64)(x)                               # 64-unit GRU over the token sequence
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)  # score for label 1 ('positive')
model_3 = tf.keras.Model(inputs, outputs, name='model_3_GRU')

In [None]:
# Print a summary of model_3's architecture.
model_3.summary()

In [None]:
# Configure training: Adam with default settings, binary cross-entropy for
# the single sigmoid output, accuracy as the reported metric.
model_3.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
# NOTE(review): the recorded log already shows accuracy 1.0 in epoch 1, which
# suggests the weights were not fresh when this ran (cell re-run or weights
# shared with another model) — re-run from a clean kernel to confirm.
model_3_history = model_3.fit(
    x=train_sentences,
    y=train_labels_numerical,
    epochs=10,
    validation_data=(val_sentences, val_labels_numerical),
)

Epoch 1/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 37ms/step - accuracy: 1.0000 - loss: 3.7388e-05 - val_accuracy: 0.8359 - val_loss: 1.5419
Epoch 2/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 1.0000 - loss: 2.3397e-05 - val_accuracy: 0.8359 - val_loss: 1.6041
Epoch 3/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 1.0000 - loss: 1.5531e-05 - val_accuracy: 0.8342 - val_loss: 1.6484
Epoch 4/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 1.0000 - loss: 1.6342e-05 - val_accuracy: 0.8342 - val_loss: 1.6887
Epoch 5/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 39ms/step - accuracy: 1.0000 - loss: 1.4312e-05 - val_accuracy: 0.8342 - val_loss: 1.7265
Epoch 6/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 1.0000 - loss: 9.2083e-06 - val_accuracy: 0.8342 - val_loss: 1.764

In [None]:
# Sigmoid outputs for the validation sentences, rounded to hard 0/1 labels
# and squeezed to drop size-1 dimensions; preview the first 20.
model_3_pred_probs = model_3.predict(x=val_sentences)
model_3_preds = tf.squeeze(tf.math.round(model_3_pred_probs))
model_3_preds[:20]

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [None]:
# Metrics of model_3 on the validation split.
model_3_results = calculate_results(val_labels_numerical, model_3_preds)
model_3_results

{'accuracy': 83.07692307692308,
 'precision': 0.8281386609402561,
 'recall': 0.8307692307692308,
 'f1': 0.8282658925979681}

> # MODEL 4

In [None]:
from tensorflow.keras import layers
# Model 4: token embeddings -> bidirectional LSTM -> dense -> sigmoid output.
#
# The model gets its own embedding layer instead of the shared `embedding`
# object: one layer instance wired into several models shares its weights
# between them, so this model would otherwise start from (and keep modifying)
# embeddings already trained by another model.
# Dimensions are copied from `embedding` so the configuration stays in one place.
model_4_embedding = layers.Embedding(input_dim=embedding.input_dim,
                                     output_dim=embedding.output_dim,
                                     embeddings_initializer='uniform',
                                     name='model_4_embedding')

# Named `inputs` (not `input`) so the builtin input() is not shadowed.
inputs = layers.Input(shape=(1,), dtype='string')   # one raw string per example
x = text_vectorizer(inputs)                         # string -> integer token ids
x = model_4_embedding(x)                            # ids -> dense vectors
x = layers.Bidirectional(layers.LSTM(64))(x)        # 64-unit LSTM wrapped to read the sequence in both directions
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)  # score for label 1 ('positive')
model_4 = tf.keras.Model(inputs, outputs, name='model_4_Bidirectional')

In [None]:
# Print a summary of model_4's architecture.
model_4.summary()

In [None]:
# Configure training: Adam with default settings, binary cross-entropy for
# the single sigmoid output, accuracy as the reported metric.
model_4.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
model_4_history = model_4.fit(
    x=train_sentences,
    y=train_labels_numerical,
    epochs=10,
    validation_data=(val_sentences, val_labels_numerical),
)

Epoch 1/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 49ms/step - accuracy: 0.9276 - loss: 0.1798 - val_accuracy: 0.8171 - val_loss: 1.2413
Epoch 2/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 39ms/step - accuracy: 0.9992 - loss: 0.0029 - val_accuracy: 0.8325 - val_loss: 1.1089
Epoch 3/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 49ms/step - accuracy: 0.9996 - loss: 0.0015 - val_accuracy: 0.8085 - val_loss: 1.0645
Epoch 4/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 47ms/step - accuracy: 0.9971 - loss: 0.0089 - val_accuracy: 0.8171 - val_loss: 1.2753
Epoch 5/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 38ms/step - accuracy: 1.0000 - loss: 1.0298e-04 - val_accuracy: 0.8239 - val_loss: 1.4149
Epoch 6/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 49ms/step - accuracy: 1.0000 - loss: 5.5246e-05 - val_accuracy: 0.8239 - val_loss: 1.4826
Epoch 7/10


In [None]:
# Sigmoid outputs for the validation sentences, rounded to hard 0/1 labels
# and squeezed to drop size-1 dimensions; preview the first 20.
model_4_pred_probs = model_4.predict(x=val_sentences)
model_4_preds = tf.squeeze(tf.math.round(model_4_pred_probs))
model_4_preds[:20]

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step


<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [None]:
# Metrics of model_4 on the validation split.
model_4_results = calculate_results(val_labels_numerical, model_4_preds)
model_4_results

{'accuracy': 82.22222222222221,
 'precision': 0.8196820549177869,
 'recall': 0.8222222222222222,
 'f1': 0.8202798134576949}