## Sarcasm detection model using a CNN (Convolutional Neural Network)


In [None]:
!git clone https://github.com/mdt01/nlp_pract5.git

fatal: destination path 'nlp_pract5' already exists and is not an empty directory.


In [None]:
import pandas as pd
import numpy as np
import re
import gensim
import math
import nltk
import json

## Reading data

In [None]:
import json
def parse_data(file):
  '''
  Purpose : Lazily read a JSON-lines file, yielding one decoded value per non-blank line

  Input : 'file' - path of the JSON-lines file to read

  Output : Generator producing the decoded JSON values in file order

  '''
  # The context manager closes the handle once the generator is exhausted
  # (or closed); the bare open() used before never released it explicitly.
  with open(file, 'r', encoding='utf-8') as handle:
    for line in handle:
      # A blank or trailing empty line is not valid JSON, so skip it instead of
      # letting json.loads raise.
      if line.strip():
        yield json.loads(line)

In [None]:
# Materialise every record yielded by parse_data() from the cloned repository's
# JSON-lines file, then build the working DataFrame from that list of dicts.
data = list(parse_data('/content/nlp_pract5/Sarcasm_Headlines_Dataset_v2.json'))
df = pd.DataFrame(data)

In [None]:
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [None]:
df['headline']

0        thirtysomething scientists unveil doomsday clo...
1        dem rep. totally nails why congress is falling...
2        eat your veggies: 9 deliciously different recipes
3        inclement weather prevents liar from getting t...
4        mother comes pretty close to using word 'strea...
                               ...                        
28614         jews to celebrate rosh hashasha or something
28615    internal affairs investigator disappointed con...
28616    the most beautiful acceptance speech this week...
28617    mars probe destroyed by orbiting spielberg-gat...
28618                   dad clarifies this not a food stop
Name: headline, Length: 28619, dtype: object

In [None]:
# Drop the unused URL column. Unlike df.pop(), this does not mutate the frame in
# place and does not raise KeyError when the cell is run a second time.
df = df.drop(columns=['article_link'], errors='ignore')

0        https://www.theonion.com/thirtysomething-scien...
1        https://www.huffingtonpost.com/entry/donna-edw...
2        https://www.huffingtonpost.com/entry/eat-your-...
3        https://local.theonion.com/inclement-weather-p...
4        https://www.theonion.com/mother-comes-pretty-c...
                               ...                        
28614    https://www.theonion.com/jews-to-celebrate-ros...
28615    https://local.theonion.com/internal-affairs-in...
28616    https://www.huffingtonpost.com/entry/andrew-ah...
28617    https://www.theonion.com/mars-probe-destroyed-...
28618    https://www.theonion.com/dad-clarifies-this-no...
Name: article_link, Length: 28619, dtype: object

In [None]:
df.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [None]:
len(df)

28619

In [None]:
df.shape

(28619, 2)

In [None]:
# Distinct label values found in the 'is_sarcastic' column (displayed below).
classes = np.unique(np.array(df['is_sarcastic']))
classes

array([0, 1])

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def text_clean(corpus):
    '''
    Purpose : Function to keep only ASCII letters in every document. Each other character
              (digits, punctuation, question marks etc.) is replaced by a single space,
              whitespace between words collapses to one space, and the text is lower-cased

    Input : 'corpus' - an iterable of strings to be cleaned (e.g. a pandas Series of headlines)

    Output : Returns the cleaned text corpus as a pandas Series of strings, in input order,
             with a default 0..n-1 index

    '''
    non_letters = re.compile('[^a-zA-Z]')

    # Build all rows first and create the Series once. The previous per-row
    # Series.append was quadratic, emitted a FutureWarning for every row, and the
    # method no longer exists in pandas 2.x.
    # Joining the whitespace-split words with single spaces before substituting is
    # equivalent to substituting word by word, because a space is itself replaced
    # by a space.
    cleaned_rows = [
        non_letters.sub(' ', ' '.join(row.split())).lower()
        for row in corpus
    ]

    # dtype=object keeps an empty corpus from defaulting to a float Series.
    return pd.Series(cleaned_rows, dtype=object)

In [None]:
def stopwords_removal(corpus):
    '''
    Purpose : Function to split every document on whitespace and drop the NLTK English
              stopwords, while retaining the wh-words listed below

    Input : 'corpus' - an iterable of strings

    Output : Returns a list with one list of surviving tokens per document

    '''
    wh_words = {'who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom'}

    # Set difference tolerates a wh-word that is absent from the installed stopword
    # list; set.remove() would raise KeyError in that case.
    stop = set(stopwords.words('english')) - wh_words

    return [
        [token for token in document.split() if token not in stop]
        for document in corpus
    ]

In [None]:
def lemmatize(corpus):
    '''
    Purpose : Function to lemmatize every token of every document with NLTK's
              WordNetLemmatizer, always passing pos='v'

    Input : 'corpus' - an iterable of token sequences (one sequence per document)

    Output : Returns a list with one list of lemmas per document

    '''
    wordnet = WordNetLemmatizer()
    lemmatized = []
    for tokens in corpus:
        lemmatized.append([wordnet.lemmatize(token, pos = 'v') for token in tokens])
    return lemmatized

In [None]:
def stem(corpus, stem_type = None):
    '''
    Purpose : Function to stem every token of every document

    Input : 'corpus' - an iterable of token sequences (one sequence per document)
            'stem_type' - 'snowball' selects the English SnowballStemmer; any other value
                          (including the default None) selects the PorterStemmer

    Output : Returns a list with one list of stemmed tokens per document

    '''
    chosen = SnowballStemmer(language = 'english') if stem_type == 'snowball' else PorterStemmer()
    return [[chosen.stem(token) for token in tokens] for tokens in corpus]

In [None]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):

    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stopwords removal, lemmatization,
              stemming), in that order, and re-join the resulting tokens into strings

    Input :
    'corpus' - Iterable of strings on which pre-processing tasks will be performed

    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Flags indicating whether a particular task should
                                                                  be performed or not (evaluated for truthiness)
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus as a list of strings, one per input document, with the
             tokens of each document joined by single spaces

    '''
    if cleaning:
        corpus = text_clean(corpus)

    # Both branches leave `corpus` as a list of token lists, which is what the
    # lemmatizer, the stemmer and the final join expect.
    if remove_stopwords:
        corpus = stopwords_removal(corpus)
    else:
        corpus = [document.split() for document in corpus]

    if lemmatization:
        corpus = lemmatize(corpus)

    if stemming:
        corpus = stem(corpus, stem_type)

    return [' '.join(tokens) for tokens in corpus]

In [None]:
# Clean, remove stopwords from and lemmatize every headline; stemming keeps its
# default (off), following the note in preprocess() about not combining the two.
headlines = preprocess(df['headline'], lemmatization = True, remove_stopwords = True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_c

In [None]:
headlines

['thirtysomething scientists unveil doomsday clock hair loss',
 'dem rep totally nail why congress fall short gender racial equality',
 'eat veggies deliciously different recipes',
 'inclement weather prevent liar get work',
 'mother come pretty close use word stream correctly',
 'white inheritance',
 'ways file tax less stress',
 'richard branson global warm donation nearly much cost fail balloon trip',
 'shadow government get large meet marriott conference room b',
 'lot parent know scenario',
 'lesbian consider father indiana amaze one',
 'amanda peet tell daughter sex special hug',
 'what know regard current treatments ebola',
 'chris christie suggest hillary clinton blame boko haram kidnap hundreds schoolgirls',
 'ford develop new suv run purely gasoline',
 'uber ceo travis kalanick step trump economic advisory council',
 'area boy enter jump touch top doorways phase',
 'area man travel gurney',
 'leave person disabilities behind',
 'lin manuel miranda would like remind put phone 

In [None]:
headlines[0:5]

['thirtysomething scientists unveil doomsday clock hair loss',
 'dem rep totally nail why congress fall short gender racial equality',
 'eat veggies deliciously different recipes',
 'inclement weather prevent liar get work',
 'mother come pretty close use word stream correctly']

In [None]:
import gensim
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import math
import keras
from keras.models import Sequential, Model
from keras import layers
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D
import h5py

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


## Loading word2vec Model

In [None]:
# kv = KeyedVectors.load_word2vec_format(datapath('/content/gdrive/MyDrive/Colab Notebooks/NLP/pr-7/GoogleNews-vectors-negative300.bin'),binary=True)

In [None]:
# Location of the binary word2vec file on the mounted Google Drive.
EMBEDDING_FILE = '/content/gdrive/MyDrive/Colab Notebooks/NLP/pr-7/GoogleNews-vectors-negative300.bin'
# NOTE(review): the name `model` is rebound to a Keras Sequential network in the
# "Defining our CNN+FeedForward Neural Network" cell further down. Once that cell
# has run, vectorize_data() (which reads the global `model`) no longer sees these
# word vectors, so this cell must be re-run before re-vectorizing. Consider giving
# the embeddings their own name.
model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary = True)

## Defining model parameters

In [None]:
# Number of vector slots per headline: vectorize_data() looks at no more than this
# many leading tokens and zero-pads shorter results up to this length.
MAX_LENGTH = 10
# Length of the zero padding vector and of the last input axis of the Conv1D layer.
# Presumably matches the dimensionality of the loaded GoogleNews vectors - confirm.
VECTOR_SIZE = 300

## Data Vectorization and Standardization

In [None]:
def vectorize_data(data, embeddings=None):
  '''
  Purpose : Function to turn every document into exactly MAX_LENGTH embedding vectors

  Input : 'data' - an iterable of whitespace-separated token strings
          'embeddings' - optional lookup object supporting `token in embeddings.key_to_index`
                         and `embeddings[token]`. When omitted, the notebook-level `model`
                         is used, which must still refer to the word2vec vectors (the
                         notebook later rebinds that name to the Keras network)

  Output : Returns a list with one entry per document. Each entry is a list of MAX_LENGTH
           vectors: the embeddings of the known tokens among the first MAX_LENGTH tokens,
           followed by zero vectors of length VECTOR_SIZE as padding

  '''
  if embeddings is None:
    embeddings = model

  padding_vector = [0.0] * VECTOR_SIZE
  vectors = []

  for data_point in data:
    # Only the first MAX_LENGTH tokens are examined; out-of-vocabulary tokens among
    # them are dropped without being replaced by later tokens.
    data_point_vectors = [
      embeddings[token]
      for token in data_point.split()[:MAX_LENGTH]
      if token in embeddings.key_to_index
    ]

    # Pad with the shared zero vector so every document has MAX_LENGTH rows.
    data_point_vectors.extend([padding_vector] * (MAX_LENGTH - len(data_point_vectors)))

    vectors.append(data_point_vectors)

  return vectors

In [None]:
vectorized_headlines = vectorize_data(headlines)

## Data Validation

In [None]:
# Sanity check: print the index of every headline whose vector list is not exactly
# MAX_LENGTH long. No output means all headlines have the expected length.
for i, vec in enumerate(vectorized_headlines):
  if len(vec) != MAX_LENGTH:
    print(i)

In [None]:
len(vectorized_headlines[5])

10

In [None]:
len(vectorized_headlines)

28619

### Train Test Split and Conversion of Data Into the Form Expected by the Convolutional Neural Network

In [None]:
train_div = math.floor(0.9 * len(vectorized_headlines))
train_div

25757

In [None]:
# Sequential (unshuffled) split: the first `train_div` rows (90% of the data, see
# the cell above) are used for training and the remaining rows for testing.
# NOTE(review): this assumes the dataset's row order is not correlated with the
# label - confirm, or shuffle before splitting.
X_train = vectorized_headlines[:train_div]
y_train = df['is_sarcastic'][:train_div]
X_test = vectorized_headlines[train_div:]
y_test = df['is_sarcastic'][train_div:]

# Report the four sizes so that features and labels can be checked for agreement.
print('The size of X_train is:', len(X_train), '\nThe size of y_train is:', len(y_train),
      '\nThe size of X_test is:', len(X_test), '\nThe size y_test is:', len(y_test))

The size of X_train is: 25757 
The size of y_train is: 25757 
The size of X_test is: 2862 
The size y_test is: 2862


In [None]:
# Convert the nested Python lists into arrays shaped
# (samples, MAX_LENGTH, VECTOR_SIZE), matching the input_shape declared on the
# Conv1D layer below.
X_train = np.reshape(X_train, (len(X_train), MAX_LENGTH, VECTOR_SIZE))
X_test = np.reshape(X_test, (len(X_test), MAX_LENGTH, VECTOR_SIZE))
# Labels are converted from pandas Series slices to plain NumPy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)

### Defining Neural Network Model Parameters

In [None]:
# Number of filters passed to the Conv1D layer.
FILTERS=8
# Kernel size passed to the Conv1D layer.
# NOTE(review): the name is misspelled ("KERNAL" for "KERNEL"); renaming it also
# requires updating the Conv1D call that reads it.
KERNAL_SIZE=3
# Unit counts of the two hidden Dense layers.
HIDDEN_LAYER_1_NODES=10
HIDDEN_LAYER_2_NODES=5
# Rate passed to both Dropout layers.
DROPOUT_PROB=0.35
# Epoch count and batch size passed to model.fit().
NUM_EPOCHS=10
BATCH_SIZE=50

### Defining our CNN+FeedForward Neural Network for Detecting Sarcasm

In [None]:
# CNN + feed-forward binary classifier: one Conv1D layer over the
# (MAX_LENGTH, VECTOR_SIZE) embedding matrix, global max pooling, two hidden Dense
# layers each followed by dropout, and a single sigmoid output unit.
# NOTE(review): this rebinds `model`, which until now referred to the word2vec
# KeyedVectors read by vectorize_data(); re-load the embeddings before vectorizing
# again.
model = Sequential()

model.add(Conv1D(FILTERS, KERNAL_SIZE, padding='same', strides=1, activation='relu', input_shape = (MAX_LENGTH, VECTOR_SIZE)))
model.add(GlobalMaxPooling1D())
model.add(Dense(HIDDEN_LAYER_1_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(HIDDEN_LAYER_2_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 10, 8)             7208      
                                                                 
 global_max_pooling1d (Globa  (None, 8)                0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                90        
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 5)                 55        
                                                                 
 dropout_1 (Dropout)         (None, 5)                 0         
                                                      

### Model building and training

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
training_history = model.fit(X_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Model Evaluation

In [None]:
# Score the trained network on the held-out split. The result is unpacked as the
# loss followed by the 'accuracy' metric requested in model.compile().
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.7823
