## Sarcasm Detection using Convolutional Neural Networks

### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import re
import json
import gensim
import math
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import KeyedVectors
import keras 
from keras.models import Sequential, Model 
from keras import layers
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D
import h5py
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Reading data

In [2]:
def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

In [3]:
data = list(parse_data(r"C:\Users\Dell\Downloads\Sarcasm_Headlines_Dataset_v2\Sarcasm_Headlines_Dataset_v2.json"))
df = pd.DataFrame(data)

### Basic Data Understanding

In [4]:
df.head(5)

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


### Sarcastic Headline

In [5]:
df['headline'][4]

"mother comes pretty close to using word 'streaming' correctly"

### Non-sarcastic Headline

In [6]:
df['headline'][1]

'dem rep. totally nails why congress is falling short on gender, racial equality'

In [7]:
df.pop('article_link')

0        https://www.theonion.com/thirtysomething-scien...
1        https://www.huffingtonpost.com/entry/donna-edw...
2        https://www.huffingtonpost.com/entry/eat-your-...
3        https://local.theonion.com/inclement-weather-p...
4        https://www.theonion.com/mother-comes-pretty-c...
                               ...                        
28614    https://www.theonion.com/jews-to-celebrate-ros...
28615    https://local.theonion.com/internal-affairs-in...
28616    https://www.huffingtonpost.com/entry/andrew-ah...
28617    https://www.theonion.com/mars-probe-destroyed-...
28618    https://www.theonion.com/dad-clarifies-this-no...
Name: article_link, Length: 28619, dtype: object

In [8]:
df.head(5)

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [9]:
len(df)

28619

In [10]:
classes = np.unique(np.array(df['is_sarcastic']))
classes

array([0, 1], dtype=int64)

### Data preprocessing

In [11]:
def text_clean(corpus):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs = []
        for word in row.split():
            p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
            p1 = p1.lower()
            qs.append(p1)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [12]:
def stopwords_removal(corpus):
    stop = set(stopwords.words('english'))
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [13]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [14]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [15]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)
        
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
    corpus = [' '.join(x) for x in corpus]
        

    return corpus

In [16]:
df['headline']

0        thirtysomething scientists unveil doomsday clo...
1        dem rep. totally nails why congress is falling...
2        eat your veggies: 9 deliciously different recipes
3        inclement weather prevents liar from getting t...
4        mother comes pretty close to using word 'strea...
                               ...                        
28614         jews to celebrate rosh hashasha or something
28615    internal affairs investigator disappointed con...
28616    the most beautiful acceptance speech this week...
28617    mars probe destroyed by orbiting spielberg-gat...
28618                   dad clarifies this not a food stop
Name: headline, Length: 28619, dtype: object

In [17]:
def text_clean(corpus):
    cleaned_corpus = []  
    for row in corpus:
        qs = []
        for word in row.split():
            p1 = re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=word)  
            p1 = p1.lower()  # Convert to lowercase
            qs.append(p1)  # Append cleaned word to the list
        cleaned_corpus.append(' '.join(qs))  # Join words and append to the cleaned_corpus list
    return pd.Series(cleaned_corpus)  # Convert to a Pandas Series if needed
headlines = preprocess(df['headline'], lemmatization=True, remove_stopwords=True)


In [18]:
headlines[0:5]

['thirtysomething scientists unveil doomsday clock hair loss',
 'dem rep totally nail congress fall short gender racial equality',
 'eat veggies 9 deliciously different recipes',
 'inclement weather prevent liar get work',
 'mother come pretty close use word stream correctly']

### Loading Word2Vec Model

In [19]:
model = KeyedVectors.load_word2vec_format(r"C:\Users\Dell\Downloads\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin", binary=True)

### Defining model parameters

In [20]:
MAX_LENGTH = 10
VECTOR_SIZE = 300

### Data Vectorization and Standardization

In [21]:
def vectorize_data(data):
    
    vectors = []
    
    padding_vector = [0.0] * VECTOR_SIZE
    
    for i, data_point in enumerate(data):
        data_point_vectors = []
        count = 0
        
        tokens = data_point.split()
        
        for token in tokens:
            if count >= MAX_LENGTH:
                break
            if token in model.wv.vocab:
                data_point_vectors.append(model.wv[token])
            count = count + 1
        
        if len(data_point_vectors) < MAX_LENGTH:
            to_fill = MAX_LENGTH - len(data_point_vectors)
            for _ in range(to_fill):
                data_point_vectors.append(padding_vector)
        
        vectors.append(data_point_vectors)
        
    return vectors

In [26]:
def vectorize_data(data):
    vectors = []
    padding_vector = [0.0] * VECTOR_SIZE

    # Tokenize the sentences into words
    tokenized_sentences = [sentence.split() for sentence in data]

    # Train Word2Vec model (skip-gram model)
    model = Word2Vec(sentences=tokenized_sentences, vector_size=VECTOR_SIZE, window=5, min_count=1, workers=4)

    # Loop through each sentence
    for tokens in tokenized_sentences:
        data_point_vectors = []
        
        for count, token in enumerate(tokens):
            if count >= MAX_LENGTH:  # Limit the length to MAX_LENGTH
                break
            if token in model.wv:  # Check if the token is in the model vocabulary
                data_point_vectors.append(model.wv[token])
        
        # Add padding vectors if necessary
        while len(data_point_vectors) < MAX_LENGTH:
            data_point_vectors.append(padding_vector)
        
        vectors.append(data_point_vectors)
    
    # Convert to a NumPy array with a smaller data type to save memory
    return np.array(vectors, dtype=np.float32)

# Apply vectorization
vectorized_headlines = vectorize_data(headlines)

# Check the output shape
print(vectorized_headlines.shape)

(28619, 10, 300)


### Data Validation

In [27]:
for i, vec in enumerate(vectorized_headlines):
    if len(vec) != MAX_LENGTH:
        print(i)

In [28]:
len(vectorized_headlines[1])

10

In [29]:
len(vectorized_headlines)

28619

### Train Test Split and Conversion of Data Into Form expected by Convolutional Neural Network

In [30]:
train_div = math.floor(0.7 * len(vectorized_headlines))
train_div

20033

In [31]:
X_train = vectorized_headlines[:train_div]
y_train = df['is_sarcastic'][:train_div]
X_test = vectorized_headlines[train_div:]
y_test = df['is_sarcastic'][train_div:]

print('The size of X_train is:', len(X_train), '\nThe size of y_train is:', len(y_train),
      '\nThe size of X_test is:', len(X_test), '\nThe size of y_test is:', len(y_test))

The size of X_train is: 20033 
The size of y_train is: 20033 
The size of X_test is: 8586 
The size of y_test is: 8586


In [32]:
X_train = np.reshape(X_train, (len(X_train), MAX_LENGTH, VECTOR_SIZE))
X_test = np.reshape(X_test, (len(X_test), MAX_LENGTH, VECTOR_SIZE))
y_train = np.array(y_train)
y_test = np.array(y_test)

### Defining Neural Network Model Parameters

In [33]:
FILTERS=8
KERNEL_SIZE=3
HIDDEN_LAYER_1_NODES=10
HIDDEN_LAYER_2_NODES=5
DROPOUT_PROB=0.35
NUM_EPOCHS=10
BATCH_SIZE=50

### Defining our CNN+FeedForward Neural Network for Detecting Sarcasm

In [34]:
model = Sequential()

model.add(Conv1D(FILTERS,
                 KERNEL_SIZE,
                 padding='same',
                 strides=1,
                 activation='relu', 
                 input_shape = (MAX_LENGTH, VECTOR_SIZE)))
model.add(GlobalMaxPooling1D())
model.add(Dense(HIDDEN_LAYER_1_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(HIDDEN_LAYER_2_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None


### Model building and training

In [35]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [36]:
training_history = model.fit(X_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/10
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5192 - loss: 0.6913
Epoch 2/10
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5468 - loss: 0.6831
Epoch 3/10
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5599 - loss: 0.6776
Epoch 4/10
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5601 - loss: 0.6765
Epoch 5/10
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5851 - loss: 0.6662
Epoch 6/10
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6053 - loss: 0.6617
Epoch 7/10
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6100 - loss: 0.6580
Epoch 8/10
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6119 - loss: 0.6552
Epoch 9/10
[1m401/401[0m [32m━━━━━━━━

### Model Evaluation

In [37]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.6200


In [44]:
import os

output_dir = r"C:\Users\Dell\Downloads\Output Files"
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it does not exist

# Save the model structure
with open(os.path.join(output_dir, "sarcasm_detection_model_cnn.json"), "w") as json_file:
    json_file.write(model_structure)

# Save the model weights (note the corrected filename)
model.save_weights(os.path.join(output_dir, "sarcasm_detection_model_cnn.weights.h5"))

### Summary
## Libraries Used
1. Pandas: For data loading and manipulation.
2. NumPy: For numerical computations.
3. re: For regular expressions to clean text data.
4. TensorFlow and Keras: For building and training the CNN model.
5. NLTK (Natural Language Toolkit): For text preprocessing, including stop word removal and lemmatization.
6. Scikit-Learn: For splitting the dataset into training and testing sets.


## Procedure Followed
1. Data Loading and Exploration: The dataset is loaded into a DataFrame using Pandas, and a quick exploration is done to understand the data distribution and structure.
2. Text Preprocessing:
            Cleaning the text data by removing non-alphabetic characters, converting text to lowercase, and removing stop words using NLTK.
            Lemmatization is performed to reduce words to their base form.
3. Data Preparation:
            Splitting the dataset into training and testing sets using Scikit-Learn's train_test_split.
            Loading a pre-trained Word2Vec model for word embeddings.
4. Model Building:
            A Convolutional Neural Network (CNN) is built using Keras with layers such as Embedding, Convolutional, MaxPooling, and Dense layers.
            The model is compiled with the Adam optimizer and binary cross-entropy loss.
5. Model Training:
            The CNN model is trained using the training dataset for a specified number of epochs.
            Training progress is monitored for accuracy and loss.
6. Model Evaluation:
            The model is evaluated on the test dataset to determine its accuracy.
7. Model Saving:
The model architecture is saved in JSON format, and the model weights are saved in HDF5 format for future use.