In [1]:
!pwd

/content


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#necessary imports
import tensorflow as tf
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string 
import re
import nltk
import os

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
#reading dataframe
df1 = pd.read_json("/content/Sarcasm_Headlines_Dataset.json",lines=True)
df2 = pd.read_json("/content/Sarcasm_Headlines_Dataset_v2.json",lines=True)


In [6]:
df1.count()


article_link    26709
headline        26709
is_sarcastic    26709
dtype: int64

In [7]:
df2.count()

is_sarcastic    28619
headline        28619
article_link    28619
dtype: int64

In [8]:
df = pd.concat([df1,df2])
df.count()

article_link    55328
headline        55328
is_sarcastic    55328
dtype: int64

In [9]:
#removing duplicates
df = df.drop_duplicates().reset_index(drop=True) 

In [10]:
df.count()

article_link    28617
headline        28617
is_sarcastic    28617
dtype: int64

In [11]:
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [12]:
def clean_text(text):

  text = text.lower() #converted to lowercase

  pattern = re.compile('https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') #removing link
  text = pattern.sub('', text) #replacing link with whitespace
  emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # removing emoji, symbols, flags
                           u"\U0001F300-\U0001F5FF"  
                           u"\U0001F680-\U0001F6FF"  
                           u"\U0001F1E0-\U0001F1FF"  
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
  text = emoji.sub('',text)
  text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text) #additional special characters removed
  return text

In [13]:
def token_word(dframe):

  head_line = list() #new list
  lines = dframe['headline'].values.tolist() #df values to list

  for line in lines:
    line = clean_text(line) #passing each insatnce of corpus 
    tokenize = word_tokenize(line) #NLTK tokenize function
    pure_words = [word for word in tokenize if word.isalpha()] #keeping only alphabets
    stop_words = set(stopwords.words("english")) #loading 'English' stopwords
    filtered_words = [ word for word in pure_words if not word in stop_words] #removing all stopwords
    head_line.append(filtered_words) #added to the list

  return head_line

In [14]:
head_lines = token_word(df)
head_lines[:5]

[['former',
  'versace',
  'store',
  'clerk',
  'sues',
  'secret',
  'black',
  'code',
  'minority',
  'shoppers'],
 ['roseanne',
  'revival',
  'catches',
  'thorny',
  'political',
  'mood',
  'better',
  'worse'],
 ['mom',
  'starting',
  'fear',
  'sons',
  'web',
  'series',
  'closest',
  'thing',
  'grandchild'],
 ['boehner',
  'wants',
  'wife',
  'listen',
  'come',
  'alternative',
  'debtreduction',
  'ideas'],
 ['jk', 'rowling', 'wishes', 'snape', 'happy', 'birthday', 'magical', 'way']]

In [15]:
tokenizer_obj = tf.keras.preprocessing.text.Tokenizer() #tokenizer object
tokenizer_obj.fit_on_texts(head_lines) #Tokenizer fit method
word_index = tokenizer_obj.word_index #to count words
print(f'Unique words ', len(word_index))

Unique words  28565


In [16]:
sequences = tokenizer_obj.texts_to_sequences(head_lines)
lines_pad = tf.keras.preprocessing.sequence.pad_sequences(sequences=sequences, maxlen=25,padding='post') #to make sure each input has same length

In [17]:
sentiment = df['is_sarcastic'].values #extract output values

In [18]:
#shuffled to make sure no bias
dimen = np.arange(lines_pad.shape[0]) 
np.random.shuffle(dimen)
lines_pad = lines_pad[dimen]
sentiment = sentiment[dimen]

In [19]:
test_sample = int(0.2 * lines_pad.shape[0])

In [20]:
#train-test split
x_train = lines_pad[:-test_sample]
y_train = sentiment[:-test_sample]
x_test = lines_pad[-test_sample:]
y_test = sentiment[-test_sample:]

In [21]:
print(f' x_train = {x_train.shape} y_train = {y_train.shape} x_test = {x_test.shape} y_test = {y_test.shape} ')

 x_train = (22894, 25) y_train = (22894,) x_test = (5723, 25) y_test = (5723,) 


In [22]:
ls

[0m[01;34mdrive[0m/        Sarcasm_Headlines_Dataset.json
[01;34msample_data[0m/  Sarcasm_Headlines_Dataset_v2.json


In [23]:
os.chdir("drive/My Drive/sarcasm")

In [25]:
#creating dict. with keys:word and values:vector
embeddings_index = {}
embedding_dim = 100
f = open('glove.twitter.27B.100d.txt', encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [26]:
#creating embedding matrix to use in embedding layer
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
c = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        c+=1
        embedding_matrix[i] = embedding_vector
print(c)

24806


In [27]:
type(word_index)

dict

In [64]:
#Model architecture with pre-trained Glove embedding
model = tf.keras.models.Sequential([
                                    tf.keras.layers.Embedding(input_dim=len(word_index)+1,output_dim=embedding_dim,input_length=25,
                                                              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),trainable=False),
                                    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64,dropout=0.3,recurrent_dropout=0.25)),
                                    tf.keras.layers.Dense(units=1,activation='sigmoid')
])

In [65]:
optimizer= tf.keras.optimizers.Adam(learning_rate=0.001)

In [66]:
model.compile(optimizer=optimizer,loss='binary_crossentropy',metrics='acc')

In [67]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 100)           2856600   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               84480     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,941,209
Trainable params: 84,609
Non-trainable params: 2,856,600
_________________________________________________________________


In [68]:
history= model.fit(x=x_train,y=y_train,batch_size=32,epochs=10,validation_data=(x_test,y_test),verbose=2)

Epoch 1/10
716/716 - 58s - loss: 0.5282 - acc: 0.7346 - val_loss: 0.4600 - val_acc: 0.7839
Epoch 2/10
716/716 - 51s - loss: 0.4585 - acc: 0.7815 - val_loss: 0.4352 - val_acc: 0.7999
Epoch 3/10
716/716 - 51s - loss: 0.4182 - acc: 0.8050 - val_loss: 0.4042 - val_acc: 0.8144
Epoch 4/10
716/716 - 51s - loss: 0.3912 - acc: 0.8191 - val_loss: 0.4025 - val_acc: 0.8130
Epoch 5/10
716/716 - 51s - loss: 0.3638 - acc: 0.8376 - val_loss: 0.3888 - val_acc: 0.8209
Epoch 6/10
716/716 - 52s - loss: 0.3467 - acc: 0.8444 - val_loss: 0.4030 - val_acc: 0.8115
Epoch 7/10
716/716 - 51s - loss: 0.3287 - acc: 0.8532 - val_loss: 0.3813 - val_acc: 0.8282
Epoch 8/10
716/716 - 51s - loss: 0.3138 - acc: 0.8630 - val_loss: 0.3772 - val_acc: 0.8295
Epoch 9/10
716/716 - 51s - loss: 0.3011 - acc: 0.8681 - val_loss: 0.3881 - val_acc: 0.8317
Epoch 10/10
716/716 - 51s - loss: 0.2860 - acc: 0.8762 - val_loss: 0.3821 - val_acc: 0.8319


In [69]:
model.save('my_model.h5')