In [344]:
import pandas as pd
import numpy as np

In [347]:
#loading the train data
df_train = pd.read_json("train.json", lines = True)
df_train.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [351]:
#To know the percentage of both sarcasm and non-sarcasm headlines

import plotly as py
from plotly import graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

#before cleaning the data
# Making pie chart to compare the numbers of sarcastic and not-sarcastic headlines
labels = ['Sarcastic', 'Not Sarcastic']
count_sarcastic = len(df_train[df_train['is_sarcastic']==1])
count_notsarcastic = len(df_train[df_train['is_sarcastic']==0])
values = [count_sarcastic, count_notsarcastic]
# values = [20,50]

trace = go.Pie(labels=labels,
               values=values,
               textfont=dict(size=19, color='#000000'),
               marker=dict(
                   colors=['#FFFF00', '#2424FF'] 
               )
              )

layout = go.Layout(title = '<b>Sarcastic vs Not Sarcastic</b>')
data = [trace]
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [352]:
#load the test data
df_test = pd.read_json("test.json", lines = True)
df_test.head()

Unnamed: 0,article_link,headline
0,https://www.huffingtonpost.com/entry/trump-org...,trump letter resigning from hundreds of compan...
1,https://www.theonion.com/aquarium-unveils-floa...,aquarium unveils 'floating carcasses of the pa...
2,https://entertainment.theonion.com/early-human...,early humans finally drunk enough to invent da...
3,https://www.huffingtonpost.com/entry/colorados...,colorado's new revenge porn statute is good la...
4,https://www.huffingtonpost.com/entry/donate-eg...,i donated my eggs so i could travel the world


In [353]:
df3 = pd.DataFrame(df_train, columns = ['article_link', 'headline'])
df3.head()

Unnamed: 0,article_link,headline
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...


In [354]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24209 entries, 0 to 24208
Data columns (total 2 columns):
article_link    24209 non-null object
headline        24209 non-null object
dtypes: object(2)
memory usage: 378.3+ KB


In [355]:
#merge both train and test data columns(article_link, headline) 
#so that preprocessing will have to do only one time

frame = [df3, df_test]
df = pd.concat(frame, ignore_index=True)
df.head()

Unnamed: 0,article_link,headline
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up..."
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...


In [289]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26709 entries, 0 to 2499
Data columns (total 2 columns):
article_link    26709 non-null object
headline        26709 non-null object
dtypes: object(2)
memory usage: 626.0+ KB


In [356]:
#check the headline column
for i,headline in enumerate (df['headline'], 1):
    if i > 20:
        break
    else:
        print(i, headline)

1 former versace store clerk sues over secret 'black code' for minority shoppers
2 the 'roseanne' revival catches up to our thorny political mood, for better and worse
3 mom starting to fear son's web series closest thing she will have to grandchild
4 boehner just wants wife to listen, not come up with alternative debt-reduction ideas
5 j.k. rowling wishes snape happy birthday in the most magical way
6 advancing the world's women
7 the fascinating case for eating lab-grown meat
8 this ceo will send your kids to school, if you work for his company
9 top snake handler leaves sinking huckabee campaign
10 friday's morning email: inside trump's presser for the ages
11 airline passengers tackle man who rushes cockpit in bomb threat
12 facebook reportedly working on healthcare features and apps
13 north korea praises trump and urges us voters to reject 'dull hillary'
14 actually, cnn's jeffrey lord has been 'indefensible' for a while
15 barcelona holds huge protest in support of refugees
16 n

In [357]:
#To remove punctuations, digits and numbers
#Text cleaning

import string
from string import digits, punctuation

hl_cleansed = []
for hl in df['headline']:
    #Remove punctuations
    clean = hl.translate(str.maketrans('', '', punctuation))
    #Remove digits/numbers
    clean = clean.translate(str.maketrans('', '', digits))
    hl_cleansed.append(clean)
    
# View comparison
print('Original texts :')
print(df['headline'][1])
print('\nAfter cleansed :')
print(hl_cleansed[1])


Original texts :
the 'roseanne' revival catches up to our thorny political mood, for better and worse

After cleansed :
the roseanne revival catches up to our thorny political mood for better and worse


In [358]:
# Tokenization process
hl_tokens = []
for hl in hl_cleansed:
    hl_tokens.append(hl.split())

# View Comparison
index = 1
print('Before tokenization :')
print(hl_cleansed[index])
print('\nAfter tokenization :')
print(hl_tokens[index])

Before tokenization :
the roseanne revival catches up to our thorny political mood for better and worse

After tokenization :
['the', 'roseanne', 'revival', 'catches', 'up', 'to', 'our', 'thorny', 'political', 'mood', 'for', 'better', 'and', 'worse']


In [359]:
#lemmatization
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

# Init Lemmatizer
lemmatizer = WordNetLemmatizer()

hl_lemmatized = []
for tokens in hl_tokens:
    lemm = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens]
    hl_lemmatized.append(lemm)
    
# Example comparison
word_1 = ['skyrim','dragons', 'are', 'having', 'parties']
word_2 = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_1]
print('Before lemmatization :\t',word_1)
print('After lemmatization :\t',word_2)

Before lemmatization :	 ['skyrim', 'dragons', 'are', 'having', 'parties']
After lemmatization :	 ['skyrim', 'dragon', 'be', 'have', 'party']


In [360]:
#preparing the data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Vectorize and convert text into sequences
max_features = 2000
max_token = len(max(hl_lemmatized))
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(hl_lemmatized)
sequences = tokenizer.texts_to_sequences(hl_lemmatized)
X = pad_sequences(sequences, maxlen=max_token)
print(X)

[[   0    0    0 ... 1890    7 1985]
 [   0    0    0 ...  166    9  145]
 [   0    0  105 ...   41   14    1]
 ...
 [   0    0    0 ...    0    9   73]
 [   0    0    0 ...  686  715  425]
 [   0    0    0 ...  432    7    3]]


In [361]:
#To check the convertion
index = 2
print('Before :')
print(hl_lemmatized[index],'\n')
print('After sequences convertion :')
print(sequences[index],'\n')
print('After padding :')
print(X[index])
X.shape

Before :
['mom', 'start', 'to', 'fear', 'son', 'web', 'series', 'closest', 'thing', 'she', 'will', 'have', 'to', 'grandchild'] 

After sequences convertion :
[105, 219, 1, 504, 235, 1986, 620, 70, 159, 41, 14, 1] 

After padding :
[   0    0  105  219    1  504  235 1986  620   70  159   41   14    1]


(26709, 14)

In [362]:
#Spiliting of X to train the model
#As X was in concat form of both train and test file 
X1 = X[0:24209,:]
X2 = X[24209:,:]

In [363]:
X1.shape

(24209, 14)

In [364]:
#Spliting of the training and testing data to build a the model
from sklearn.model_selection import train_test_split

Y = df_train['is_sarcastic'].values
Y = np.vstack(Y)
X_train,X_test,Y_train,Y_test = train_test_split(X1,Y,test_size=0.3, random_state = 42)

In [365]:
Y.shape

(24209, 1)

In [366]:
X2.shape

(2500, 14)

In [367]:
X_train.shape


(16946, 14)

In [368]:
Y_train.shape

(16946, 1)

In [369]:
#model bulding
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 64

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = max_token))
model.add(LSTM(96, dropout=0.2, recurrent_dropout=0.2, activation='relu'))
# model.add(Dense(128))
# model.add(Activation('relu'))
# model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 14, 64)            128000    
_________________________________________________________________
lstm_10 (LSTM)               (None, 96)                61824     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 97        
_________________________________________________________________
activation_10 (Activation)   (None, 1)                 0         
Total params: 189,921
Trainable params: 189,921
Non-trainable params: 0
_________________________________________________________________
None


In [370]:
#training process
epoch = 8
batch_size = 200
model.fit(X_train, Y_train, epochs = epoch, batch_size=batch_size, verbose = 2)

Epoch 1/8
 - 24s - loss: 0.5963 - acc: 0.6497
Epoch 2/8
 - 4s - loss: 0.3923 - acc: 0.8307
Epoch 3/8
 - 4s - loss: 0.3419 - acc: 0.8516
Epoch 4/8
 - 4s - loss: 0.3199 - acc: 0.8617
Epoch 5/8
 - 4s - loss: 0.3055 - acc: 0.8708
Epoch 6/8
 - 4s - loss: 0.2938 - acc: 0.8778
Epoch 7/8
 - 4s - loss: 0.2865 - acc: 0.8776
Epoch 8/8
 - 4s - loss: 0.2716 - acc: 0.8855


<keras.callbacks.History at 0x7fc5e61335f8>

In [371]:
#test model
loss, acc = model.evaluate(X_test, Y_test, verbose=2)
print("Overall scores")
print("Loss\t\t: ", round(loss, 3))
print("Accuracy\t: ", round(acc, 3))


Overall scores
Loss		:  0.397
Accuracy	:  0.834


In [372]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_test)):
    
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.around(result) == np.around(Y_test[x]):
        if np.around(Y_test[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.around(Y_test[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1

In [373]:
print("Sarcasm accuracy\t: ", round(pos_correct/pos_cnt*100, 3),"%")
print("Non-sarcasm accuracy\t: ", round(neg_correct/neg_cnt*100, 3),"%")

Sarcasm accuracy	:  81.455 %
Non-sarcasm accuracy	:  84.83 %


In [374]:
ypred = model.predict(X2)


In [375]:
ypred1 = pd.DataFrame(ypred)

In [376]:
ypred1.to_csv('pred.csv')