In [24]:
import os
import numpy as np
import pandas as pd
import re, string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM,GRU, Dense, Bidirectional, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load data
data = pd.read_json('/content/Sarcasm_Headlines_Dataset.json', lines=True)

In [3]:
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


In [5]:
data.shape

(28619, 3)

In [6]:
#checking for null values in train data
data.isnull().sum()

is_sarcastic    0
headline        0
article_link    0
dtype: int64

In [7]:
data.is_sarcastic.value_counts()

is_sarcastic
0    14985
1    13634
Name: count, dtype: int64

In [8]:
#checking for duplicate values
data['headline'].duplicated().sum()

116

In [9]:
# Drop duplicate headlines
data = data.drop(data[data['headline'].duplicated()].index, axis=0)

In [10]:
#rechecking for duplicate values
data['headline'].duplicated().sum()

0

In [11]:
# Drop unnecessary columns
data = data.drop(columns=['article_link'])

In [12]:
data.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [13]:
# Preprocessing functions
stop = set(stopwords.words('english'))     #creates a set of common English stopwords that don't add significant meaning to sentences
punctuation = list(string.punctuation)     #creates a list of punctuation characters.
stop.update(punctuation)                   #adds punctuation to the stopwords set, so both stopwords and punctuation can be removed together

def split_into_words(text):
    return text.split()

def to_lower_case(words):
    return [word.lower() for word in words]

def remove_punctuation(words):
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    return [re_punc.sub('', w) for w in words]

def keep_alphabetic(words):                #Filters the list to keep only alphabetic words, removing any tokens that contain numbers or special characters
    return [word for word in words if word.isalpha()]

def remove_stopwords(words):
    return [w for w in words if not w in stop]

def to_sentence(words):                    #Converts a list of words back into a single sentence
    return ' '.join(words)

def denoise_text(text):                    #Removes punctuation, stopwords, and converts text to lowercase
    words = split_into_words(text)
    words = to_lower_case(words)
    words = remove_punctuation(words)
    words = keep_alphabetic(words)
    words = remove_stopwords(words)
    return to_sentence(words)

In [14]:
# Apply text cleaning
data['news_headline'] = data['headline'].apply(denoise_text)

In [15]:
data.head()

Unnamed: 0,is_sarcastic,headline,news_headline
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,dem rep totally nails congress falling short g...
2,0,eat your veggies: 9 deliciously different recipes,eat veggies deliciously different recipes
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,mother comes pretty close using word streaming...


In [16]:
# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(
    data['news_headline'], data['is_sarcastic'], test_size=0.20, random_state=42
)

In [17]:
# Label encoding the target variable
label_encoder = LabelEncoder()                   #convert categorical labels into numerical form.
train_labels_encoded = label_encoder.fit_transform(train_labels)     # fitting and transforming training labels
test_labels_encoded = label_encoder.transform(test_labels)           # transforming testing labels

In [18]:
# Tokenization
tokenizer = Tokenizer()    #Creates an instance of the Tokenizer class from TensorFlow Keras.
tokenizer.fit_on_texts(train_data)
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 25734


In [19]:
# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

In [20]:
# Padding sequences
max_length = max([len(x) for x in train_sequences])  # Find the maximum sequence length
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

In [25]:
# GRU Model
gru_model = Sequential()
gru_model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length))
gru_model.add(SpatialDropout1D(0.2))
gru_model.add(GRU(100, dropout=0.2, recurrent_dropout=0.2))
gru_model.add(Dense(1, activation='sigmoid'))

gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(gru_model.summary())

# Train the model
history_gru = gru_model.fit(
    train_padded, train_labels_encoded,
    epochs=5, batch_size=64, validation_data=(test_padded, test_labels_encoded),
    callbacks=[ReduceLROnPlateau(monitor='val_loss', patience=3, cooldown=0)]
)

# Evaluate the model
gru_predictions = (gru_model.predict(test_padded) > 0.5).astype("int32")
print("GRU Model Accuracy:", accuracy_score(test_labels_encoded, gru_predictions))
print(confusion_matrix(test_labels_encoded, gru_predictions))
print(classification_report(test_labels_encoded, gru_predictions))

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 106, 100)          2573400   
                                                                 
 spatial_dropout1d_3 (Spati  (None, 106, 100)          0         
 alDropout1D)                                                    
                                                                 
 gru (GRU)                   (None, 100)               60600     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 2634101 (10.05 MB)
Trainable params: 2634101 (10.05 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
GRU Model Accuracy: 0.534

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
