In [35]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers

import pandas as pd
import string
import numpy as np
import re

import nltk
from nltk import word_tokenize

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
print("Tensorflow Version",tf.__version__)

Tensorflow Version 2.5.0-rc0


# Loading the raw dataset

In [3]:
data = pd.read_csv('data.csv',encoding='utf8')

# First filter out those rows which
# does not contain any data
data = data.dropna(how = 'all')
  
# Filter all rows having empty text
data.drop(data[data['text'] == ' ' ].index, inplace = True)

data.head(10) 

Unnamed: 0.1,Unnamed: 0,text,label
0,0,"GAZA/CAIRO (Reuters) - Palestinian factions, i...",1
1,1,HARARE (Reuters) - Zimbabwean police arrested ...,1
2,2,Ronna Romney McDaniel is the Chairman of the M...,0
3,3,WASHINGTON (Reuters) - A small group of Republ...,1
4,4,"THE HUNTINGTON BEACH, CA RALLY WAS PRETTY BIG ...",0
5,5,BALTIMORE (Reuters) - A Maryland state senator...,1
6,6,Remember when the left would have been ashamed...,0
7,7,CLEVELAND (Reuters) - As Republicans spilled i...,1
9,9,Beware of morning talk show hosts turned news...,0
10,10,Senator Bernie Sanders had the crowd roaring w...,0


# Data Preprocessing

In [4]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_url(text): 
    url_pattern  = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub(r'', text)
 # converting return value from list to string



def clean_text(text ): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>2))]) 
    
    return text2.lower()

In [5]:
# Spliting data into train and test
split = 0.15
num_testing_samples = int(split * len(data))
train_data = data[:-num_testing_samples].copy()
test_data = data[-num_testing_samples:].copy()

In [6]:
# Training data

train_data['text'] = train_data['text'].apply(remove_emoji)
train_data['text'] = train_data['text'].apply(remove_url)
train_data['text'] = train_data['text'].apply(clean_text)

print("----------Training Data------------")
print(train_data['label'].value_counts())
print("Train data len :",len(train_data))
print("-----------------------------------")

# Creating a cloumn for each row which represent the length of sentence in the text coloumn
train_data['Num_words_text'] = train_data['text'].apply(lambda x:len(str(x).split())) 

# Getting the max sentence length from all the rows
max_train_sentence_length  = train_data['Num_words_text'].max()
print("Max Sentence Length (Train Data):",max_train_sentence_length)

############################################################################################################################

# Testing data

test_data['text'] = test_data['text'].apply(remove_emoji)
test_data['text'] = test_data['text'].apply(remove_url)
test_data['text'] = test_data['text'].apply(clean_text)

print("----------Testing Data------------")
print(test_data['label'].value_counts())
print("Test data len :",len(test_data))
print("-----------------------------------")

# Creating a cloumn for each row which represent the length of sentence in the text coloumn
test_data['Num_words_text'] = test_data['text'].apply(lambda x:len(str(x).split())) 

# Getting the max sentence length from all the rows
max_test_sentence_length  = test_data['Num_words_text'].max()
print("Max Sentence Length (Test Data):",max_test_sentence_length)

----------Training Data------------
0    19455
1    18176
Name: label, dtype: int64
Train data len : 37631
-----------------------------------
Max Sentence Length (Train Data): 6559
----------Testing Data------------
0    3400
1    3240
Name: label, dtype: int64
Test data len : 6640
-----------------------------------
Max Sentence Length (Test Data): 5738


In [7]:
train_data.head(10)

Unnamed: 0.1,Unnamed: 0,text,label,Num_words_text
0,0,gazacairo reuters palestinian factions includi...,1,268
1,1,harare reuters zimbabwean police arrested acti...,1,285
2,2,ronna romney mcdaniel the chairman the michiga...,0,244
3,3,washington reuters small group republican lawm...,1,524
4,4,the huntington beach rally was pretty big with...,0,84
5,5,baltimore reuters maryland state senator backi...,1,329
6,6,remember when the left would have been ashamed...,0,220
7,7,cleveland reuters republicans spilled into cle...,1,635
9,9,beware morning talk show hosts turned news per...,0,253
10,10,senator bernie sanders had the crowd roaring w...,0,307


# Tokenization

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer

# we are considering top 50K unique words from our dataset
num_words = 50000 

# Defining the tokenizer
tokenizer = Tokenizer(num_words=num_words,oov_token="unk")
tokenizer.fit_on_texts(train_data['text'].tolist())

# Working of tokenizer with an exmple
print(str(tokenizer.texts_to_sequences(['xyz how are you'])))

[[1, 79, 16, 26]]


In [18]:
from sklearn.model_selection import train_test_split

# Spliting into training and validation data
X_train, X_valid, y_train, y_valid = train_test_split(train_data['text'].tolist(),\
                                                      train_data['label'].tolist(),\
                                                      test_size=0.1,\
                                                      stratify = train_data['label'].tolist(),\
                                                      random_state=0)


# Tokenization of training testing and validation textual data
x_train = np.array( tokenizer.texts_to_sequences(X_train) ,dtype="object" )
x_valid = np.array( tokenizer.texts_to_sequences(X_valid) ,dtype="object")
x_test  = np.array( tokenizer.texts_to_sequences(test_data['text'].tolist()),dtype="object" )

# Padding the tokens based on the Max Sentence Length mentioned before
x_train = pad_sequences(x_train, padding='post', maxlen=6570)
x_valid = pad_sequences(x_valid, padding='post', maxlen=6570)
x_test = pad_sequences(x_test, padding='post', maxlen=6570)

print(x_train[0])

[ 83  45 973 ...   0   0   0]


In [30]:
train_ds = tf.data.Dataset.from_tensor_slices((x_train,y_train))
valid_ds = tf.data.Dataset.from_tensor_slices((x_valid,y_valid))
test_ds = tf.data.Dataset.from_tensor_slices((x_test,test_data["label"]))

In [31]:
count =0
print('======Train dataset ====')
for value,label in train_ds:
    count += 1
    print(value,label)
    if count==1:
        break
count =0
print('======Validation dataset ====')
for value,label in valid_ds:
    count += 1
    print(value,label)
    if count==1:
        break
count = 0
print('======Test dataset ====')
for value,label in test_ds:
    count += 1
    print(value,label)
    if count==1:
        break  

tf.Tensor([ 83  45 973 ...   0   0   0], shape=(6570,), dtype=int32) tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([ 961 5547 2543 ...    0    0    0], shape=(6570,), dtype=int32) tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor([    2 37539   726 ...     0     0     0], shape=(6570,), dtype=int32) tf.Tensor(0, shape=(), dtype=int64)


# Embedding Layer

In [36]:
max_features = 50000
embedding_dim = 300
sequence_length = 6570

embedding_layer = tf.keras.layers.Embedding(max_features +1, 
                                            embedding_dim, 
                                            input_length=sequence_length,
                                            embeddings_regularizer = regularizers.l2(0.0005)
                                           )

# Model

In [58]:
window_size = 4

model = tf.keras.Sequential()

model.add(embedding_layer)   

model.add(tf.keras.layers.Conv1D(128,window_size, 
                                 activation='relu',
                                 kernel_regularizer = regularizers.l2(0.0005),
                                 bias_regularizer = regularizers.l2(0.0005),))    

model.add(tf.keras.layers.GlobalMaxPooling1D())

model.add(tf.keras.layers.Dense(32, 
                                activation='relu',
                                kernel_regularizer=regularizers.l2(0.001),
                                bias_regularizer=regularizers.l2(0.001),))

model.add(tf.keras.layers.Dropout(0.5))

model.add(tf.keras.layers.Dense(16, 
                                activation='relu',
                                kernel_regularizer=regularizers.l2(0.001),
                                bias_regularizer=regularizers.l2(0.001),))

model.add(tf.keras.layers.Dense(1, 
                                activation='sigmoid',
                                kernel_regularizer=regularizers.l2(0.001),
                                bias_regularizer=regularizers.l2(0.001),))
                               
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6570, 300)         15000300  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 6567, 128)         153728    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 32)                4128      
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_12 (Dense)             (None, 1)                

In [59]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
tf.keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [60]:
epochs = 10
model.fit(train_ds.shuffle(2000).batch(128),
                    epochs= epochs ,
                    validation_data=valid_ds.batch(128),
                    verbose=1)

Epoch 1/10

KeyboardInterrupt: 