In [10]:
import numpy as np
import pandas as pd
import regex as re
import nltk
import itertools
from collections import Counter

In [3]:
sar_acc = pd.read_json('Sarcasm_Headlines_Dataset.json',lines=True)
sar_acc['source'] = sar_acc['article_link'].apply(lambda x: re.findall(r'\w+', x)[2])
sar_acc.head()

Unnamed: 0,article_link,headline,is_sarcastic,source
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,huffingtonpost
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,huffingtonpost
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,theonion
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,theonion
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,huffingtonpost


In [7]:
# Tokenization
sar_det = sar_acc[sar_acc.is_sarcastic==1]
sar_det.reset_index(drop=True, inplace=True)
acc_det = sar_acc[sar_acc.is_sarcastic==0]
acc_det.reset_index(drop=True, inplace=True)

# Tokenizing the Headlines of Sarcasm
sar_news = []
for rows in range(0, sar_det.shape[0]):
    head_txt = sar_det.headline[rows]
    head_txt = head_txt.split(" ")
    sar_news.append(head_txt)

#Converting into single list for Sarcasm
import itertools
sar_list = list(itertools.chain(*sar_news))

# Tokenizing the Headlines of Acclaim
acc_news = []
for rows in range(0, acc_det.shape[0]):
    head_txt = acc_det.headline[rows]
    head_txt = head_txt.split(" ")
    acc_news.append(head_txt)
    
#Converting into single list for Acclaim
acc_list = list(itertools.chain(*acc_news))

In [8]:
# !python3 -m nltk.downloader stopwords

[nltk_data] Downloading package stopwords to /home/drex/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
from nltk.corpus import stopwords
stopwords = list(stopwords.words("english"))

sar_list_restp = [word for word in sar_list if word.lower() not in stopwords]
acc_list_restp = [word for word in acc_list if word.lower() not in stopwords]

In [14]:
sar_count, acc_count = Counter(sar_list_restp), Counter(acc_list_restp)
most_sar, most_acc = sar_count.most_common(10), acc_count.most_common(10)
print("most common sarcastic words : " + str(most_sar))
print("most common acclaimed words : " + str(most_acc))

most common sarcastic words : [('man', 1021), ('new', 821), ('area', 477), ('report:', 360), ('woman', 290), ('one', 253), ('time', 213), ('still', 208), ('day', 207), ('trump', 200)]
most common acclaimed words : [('trump', 957), ('new', 664), ('donald', 453), ("trump's", 364), ('says', 346), ('women', 240), ('one', 234), ('u.s.', 223), ('first', 220), ('make', 209)]


In [15]:
# Lemmatization

In [38]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import accuracy_score

In [20]:
X = sar_acc.headline
Y = sar_acc.is_sarcastic
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2)

In [27]:
tk = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
tk.fit_on_texts(X_train)
seqs = tk.texts_to_sequences(X_train)
max_len = 100
seqs_mat = tf.keras.preprocessing.sequence.pad_sequences(seqs,maxlen=max_len)

In [35]:
def model_def():
    inputs = tf.keras.layers.Input(name='inputs', shape=[max_len])
    layer = tf.keras.layers.Embedding(1000,50,input_length=max_len)(inputs)
    layer = tf.keras.layers.LSTM(64)(layer)
    layer = tf.keras.layers.Dense(256,name='FC1')(layer)
    layer = tf.keras.layers.Activation('relu')(layer)
    layer = tf.keras.layers.Dropout(0.2)(layer)
    layer = tf.keras.layers.Dense(1,name='out_layer')(layer)
    
    model = tf.keras.models.Model(inputs=inputs,outputs=layer)
    return model

In [36]:
model = model_def()
model.summary()
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=['accuracy'])

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 100)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 50)           50000     
_________________________________________________________________
lstm (LSTM)                  (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257 

In [37]:
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_acc', 
    mode='max',
    patience=6
)
# increase epochs and other steps for better training
model.fit(seqs_mat,Y_train,batch_size=100,epochs=5,
          validation_split=0.1,callbacks=[es])

Train on 19230 samples, validate on 2137 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f7d3d2e0a58>

In [45]:
def predict_sarcasm(user_seq):
#     prediction
    prob = model.predict(user_seq)
    probability = np.mean(prob, axis=0)

    if probability > 0.5:
        return("Sarcastic")
    elif probability < 0.5:
        return("Not Sarcastic")
    elif probability == 0.5:
        return("Neutral")
    

In [47]:
def user_text_processing(user_text):
    user_text = user_text.split()
    user_text = [word.lower() for word in user_text if word not in stopwords]
#     user_text = [lemm.lemmatize(word) for word in user_text]
    user_text
    user_seq = np.array(user_text)
    user_seq = tk.texts_to_sequences(user_seq)
    user_seq = tf.keras.preprocessing.sequence.pad_sequences(user_seq,maxlen=max_len)

    return user_seq

In [50]:
est_sequences = tok.texts_to_sequences(X_test)

accr = model.evaluate(test_sequences_matrix,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

NameError: name 'test_sequences_matrix' is not defined

In [46]:
user_text = 'state population to double by 2040, babies to blame'
user_seq = user_text_processing(user_text)
user_seq
prediction = predict_sarcasm(user_seq)
print(f"Sentence '{user_text}' is of '{prediction}' nature")


Sentence 'state population to double by 2040, babies to blame' is of 'Sarcastic' nature
