In [1]:
import tensorflow as tf
import pandas as pd
import string
import numpy as np
import nltk
from nltk import word_tokenize
from tensorflow import keras

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


# Load the Dataset

In [2]:
data = pd.read_csv('data.csv',encoding='utf8')

del data['Unnamed: 0'] # Removing the index column

for i in range(len(data["text"])):
    if data["text"][i] == ' ':
        data.drop(i, inplace = True )
data.head(10)      

Unnamed: 0,text,label
0,"GAZA/CAIRO (Reuters) - Palestinian factions, i...",1
1,HARARE (Reuters) - Zimbabwean police arrested ...,1
2,Ronna Romney McDaniel is the Chairman of the M...,0
3,WASHINGTON (Reuters) - A small group of Republ...,1
4,"THE HUNTINGTON BEACH, CA RALLY WAS PRETTY BIG ...",0
5,BALTIMORE (Reuters) - A Maryland state senator...,1
6,Remember when the left would have been ashamed...,0
7,CLEVELAND (Reuters) - As Republicans spilled i...,1
9,Beware of morning talk show hosts turned news...,0
10,Senator Bernie Sanders had the crowd roaring w...,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44271 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44271 non-null  object
 1   label   44271 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


# Data Preprocessing

### Spliting the data

In [4]:
# Training and Validation Split
validation_split = 0.2
num_validation_samples = int(validation_split * len(data))
train_samples = data["text"][:-num_validation_samples]
val_samples = data["text"][-num_validation_samples:]
train_labels = data["label"][:-num_validation_samples]
val_labels = data["label"][-num_validation_samples:]

### Word Vector Representation

In [16]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

Length_of_input_sentenses=500

# matrix of pretrained weights
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)
vectorizer

<tensorflow.python.keras.layers.preprocessing.text_vectorization.TextVectorization at 0x222184d1400>

In [17]:
# Customize get_vocabulary method
def _get_vocabulary():
    keys, values = vectorizer._index_lookup_layer._table_handler.data()
    return [x.decode('utf8', errors='ignore') for _, x in sorted(zip(values, keys))]

In [18]:
# vectorizer.get_vocabulary()
_get_vocabulary()[:5]

['', 'the', 'to', 'of', 'a']

In [19]:
output = vectorizer([["the cat and dog sat on the mat"]])
output.numpy()[0, :6]

array([   2, 8942,    6, 3408, 3268,    9], dtype=int64)

In [20]:
voc = _get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

embeddings_index = {}
with open('glove.6B.300d.txt', encoding="utf8") as f:
    for line in f:
        
        word, coefs = line.split(maxsplit=1)
        
        coefs = np.fromstring(coefs, "f", sep=" ")
        
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [21]:
num_tokens = len(voc) + 2
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18281 words (1716 misses)


# Embedding Layer

In [22]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

# Model

In [23]:
from tensorflow.keras import layers

In [27]:
model = keras.Sequential()
model.add(embedding_layer)
model.add(layers.Dense(128, activation="relu"))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         6000300   
_________________________________________________________________
dense_6 (Dense)              (None, None, 128)         38528     
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
dense_7 (Dense)              (None, None, 64)          8256      
_________________________________________________________________
dense_8 (Dense)              (None, None, 1)           65        
Total params: 6,047,149
Trainable params: 6,047,149
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Training and Testing split
# validation_split = 0.2
# num_validation_samples = int(validation_split * len(train_samples))
# x_val = np.array(train_samples[:-num_validation_samples])
# x_train = np.array(train_samples[-num_validation_samples:])
# y_val = np.array(train_labels[:-num_validation_samples])
# y_train = np.array(train_labels[-num_validation_samples:])

x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)

In [None]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
model.fit(x_train, y_train, batch_size=128, epochs=10,validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10