#### Importing required libraries

In [1]:
import numpy as np 
import pandas as pd
import json
import nltk
import tensorflow as tf
import numpy as np
import urllib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import zipfile
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import seaborn as sns



#### Loading Datasets

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

twitter_df = pd.concat([train,test])

twitter_df['class'] = twitter_df['class'].replace(['irony','figurative','regular'],int('0'))
twitter_df['class'] = twitter_df['class'].replace(['sarcasm'],int('1'))


twitter_df = twitter_df.rename(columns={'tweets':'sentence','class':'is_sarcastic'})

twitter_df.head()

Unnamed: 0,sentence,is_sarcastic
0,Be aware dirty step to get money #staylight ...,0.0
1,#sarcasm for #people who don't understand #diy...,0.0
2,@IminworkJeremy @medsingle #DailyMail readers ...,0.0
3,@wilw Why do I get the feeling you like games?...,0.0
4,-@TeacherArthurG @rweingarten You probably jus...,0.0


#### Twitter Data Cleaning

In [3]:
# input dataframe
def keep_uniques(data):
    data = data.drop_duplicates('sentence')
    return data

# input cell (tweet)
def remove_url(text):
    text = str(text)
    url = re.compile(r'[https|http]?://\S+|www\.\S+')
    return url.sub(r'', text) 

# input cell (tweet)
def remove_emoji(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
                   "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

# input cell (tweet)
def remove_tag(text):
    tag = re.compile(r"(@[a-zA-Z0-9._]+)")
    return tag.sub(r'', text)

# input cell (tweet)
def remove_splsymbols(text):
    sym = re.compile(r"[$|%|^|&|*|=|;]+")
    return sym.sub(r'', text)

twitter_df = keep_uniques(twitter_df)
twitter_df['sentence'] = twitter_df['sentence'].apply(lambda x: remove_url(x))
twitter_df['sentence'] = twitter_df['sentence'].apply(lambda x: remove_emoji(x))
twitter_df['sentence'] = twitter_df['sentence'].apply(lambda x: remove_tag(x))
twitter_df['sentence'] = twitter_df['sentence'].apply(lambda x: remove_splsymbols(x))

twitter_df.head()

Unnamed: 0,sentence,is_sarcastic
0,Be aware dirty step to get money #staylight ...,0.0
1,#sarcasm for #people who don't understand #diy...,0.0
2,#DailyMail readers being sensible as always ...,0.0
3,Why do I get the feeling you like games? #sar...,0.0
4,- You probably just missed the text. #sarcastic,0.0


#### Loading Headlines Dataset

In [4]:
def open_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

data = list(open_data('Sarcasm_Headlines_Dataset.json'))

In [5]:
headline_with_stopwords = []
label = []

for line in data:
    headline_with_stopwords.append(line['headline'])
    label.append(line['is_sarcastic'])

In [6]:
headline_with_stopwords[1], label[1]

("the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 0)

#### Removing stopwords

In [7]:
stopwords = nltk.corpus.stopwords.words('english')

In [8]:
headline = []
for line in headline_with_stopwords:
    temp = []
    for word in line.split():
        if word not in stopwords:
            temp.append(word)
    join = ' '.join([str(ele) for ele in temp])
    headline.append(join)

In [9]:
headline[1]

"'roseanne' revival catches thorny political mood, better worse"

In [10]:
data_df = pd.read_json('Sarcasm_Headlines_Dataset.json',lines=True)
data_df = data_df.rename(columns = {'headline': 'sentence'})
data_df.drop('article_link',axis=1,inplace=True)

In [11]:
# Merging two datasets

sarcasm_df = pd.concat([data_df,twitter_df])
sarcasm_df

Unnamed: 0,sentence,is_sarcastic
0,former versace store clerk sues over secret 'b...,0.0
1,the 'roseanne' revival catches up to our thorn...,0.0
2,mom starting to fear son's web series closest ...,1.0
3,"boehner just wants wife to listen, not come up...",1.0
4,j.k. rowling wishes snape happy birthday in th...,0.0
...,...,...
8123,Why yes I will totally submit my photos to a s...,1.0
8124,Test on a Saturday! Thank you uni! #sarcasm @ ...,1.0
8125,Listening to 's Misery isn't at all disconcert...,1.0
8126,There you go being kind again #sarcasm #stand...,1.0


In [12]:
sarcasm_df.to_csv('sarcasm_df.csv',index=True)

#### Data Preprocessing

In [13]:
len(sarcasm_df)

100108

In [14]:
embedding_dim = 100
max_length = 32
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 80000

In [32]:
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(sarcasm_df['sentence'].tolist())
word_index = tokenizer.word_index

sentence_sequences = tokenizer.texts_to_sequences(sarcasm_df['sentence'].tolist())
sentence_padded = pad_sequences(sentence_sequences, maxlen = max_length, padding = padding_type, truncating=trunc_type)

In [34]:
sentence_padded

array([[   18,    39,  2126, ...,     0,     0,     0],
       [  282,   135,  1809, ...,     0,     0,     0],
       [ 1289,     9,     2, ...,     0,     0,     0],
       ...,
       [27748,   893,  3879, ...,     0,     0,     0],
       [  160,  6019,  1895, ...,     0,     0,     0],
       [ 1730,  2298,     6, ...,     0,     0,     0]], dtype=int32)

In [39]:
sentence_padded = np.array(sentence_padded)
print(type(sentence_padded))
import scipy.sparse as sparse

#sarcasm_df = sarcasm_df.drop(['sentence_padded','tokenizd_sentence'],axis=1)
sarcasm_df['tokenized_sentence'] = sparse.coo_matrix(sentence_padded).toarray().tolist()

<class 'numpy.ndarray'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Model - Default Embedding, LSTM

In [18]:
vocab_size = len(word_index)
tf.keras.backend.clear_session()
tf.random.set_seed(51)
np.random.seed(51)

# Defining Early stopping to stop training if validation accuracy does not improve within five epochs
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    #tf.keras.layers.Dropout(0.2),
    #tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences = True)),
    #tf.keras.layers.GlobalAveragePooling1D(),
    #tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

optimizer = tf.keras.optimizers.Adam()
model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 100)           8374600   
_________________________________________________________________
dropout (Dropout)            (None, 32, 100)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 256)           234496    
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 256)           0         
_________________________________________________________________
dense (Dense)                (None, 32, 128)           32896     
_________________________________________________________________
dense_1 (Dense)              (None, 32, 1)             129       
Total params: 8,642,121
Trainable params: 8,642,121
Non-trainable params: 0
______________________________________________

In [20]:
sarcasm_df = sarcasm_df.sample(frac=1)
sarcasm_df

Unnamed: 0,sentence,is_sarcastic,tokenized_sentence
66528,it was obvious they wanted more size at WR wh...,1.0,"[18, 39, 2146, 49, 683, 68, 2439, 23, 10395, 4..."
70824,I've been guilty of posting #sarcastic notes l...,1.0,"[284, 135, 1814, 8, 2183, 96, 3963, 45, 167, 1..."
38987,"#DonaldTrump in the middle of the stage, other...",0.0,"[1297, 9, 2, 605, 8, 2, 1072, 179, 689, 663, 1..."
4180,There is a 100 chance of Pickleman’s Pizza bei...,0.0,"[109, 11, 6, 698, 985, 8, 81832, 1227, 95, 510..."
10276,a look at transgender sex workers living in china,0.0,"[6, 175, 23, 2218, 391, 1268, 579, 9, 768, 0, ..."
...,...,...,...
18117,bush not heard from for over a month,1.0,"[528, 27, 584, 32, 13, 108, 6, 735, 0, 0, 0, 0..."
23165,Pregnant Chicken NAILS IT! #parenthood #irony...,0.0,"[1710, 1395, 4329, 18, 1226, 7, 5, 0, 0, 0, 0,..."
48717,1990s: UN bodies on #drugs amp #crime merge to...,0.0,"[32535, 893, 3875, 14, 58, 60, 1057, 8106, 3, ..."
16869,gop promotes carly fiorina to male candidate a...,1.0,"[160, 5878, 1889, 1428, 3, 1450, 691, 83, 1157..."


In [21]:
sarcasm_df = sarcasm_df.dropna()

In [35]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

k = 10

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

X = sarcasm_df.iloc[:,-1]
y = sarcasm_df['is_sarcastic'].tolist()

X = np.array(X.tolist())
y = np.array([int(i) for i in y])

for train_index, test_index in folds.split(X,y):
    print(train_index,test_index)
    
    X_train, X_test = X[train_index],X[test_index]
    y_train, y_test = y[train_index] , y[test_index]
    
    model.fit(X_train,y_train,batch_size = 128,validation_data=(X_test,y_test))

[     0      1      2 ... 100095 100096 100097] [     4      9     31 ... 100083 100098 100099]
[     0      1      2 ... 100097 100098 100099] [     8     12     27 ... 100047 100084 100093]
[     0      1      2 ... 100097 100098 100099] [    16     23     25 ... 100080 100082 100095]
[     2      3      4 ... 100097 100098 100099] [     0      1      5 ... 100074 100089 100096]
[     0      1      2 ... 100097 100098 100099] [    13     29     42 ... 100070 100079 100091]
[     0      1      2 ... 100097 100098 100099] [    11     48     72 ... 100069 100087 100090]
[     0      1      4 ... 100097 100098 100099] [     2      3     10 ... 100088 100092 100094]
[     0      1      2 ... 100096 100098 100099] [    17     20     24 ... 100072 100086 100097]
[     0      1      2 ... 100097 100098 100099] [    14     15     21 ... 100065 100066 100081]
[     0      1      2 ... 100097 100098 100099] [     7     53     77 ... 100053 100067 100085]
