<a href="https://colab.research.google.com/gist/priyanshusharma16/8938e9de73a0a570030a80e1bddb89ed/label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import json
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import ReduceLROnPlateau

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [40]:
# Load data
data = pd.read_json('/content/Sarcasm_Headlines_Dataset.json', lines=True)

In [41]:
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


In [43]:
data.shape

(28619, 3)

In [44]:
#checking for null values in data
data.isnull().sum()

is_sarcastic    0
headline        0
article_link    0
dtype: int64

In [45]:
data.is_sarcastic.value_counts()

is_sarcastic
0    14985
1    13634
Name: count, dtype: int64

In [46]:
#checking for duplicate values
data['headline'].duplicated().sum()

116

In [47]:
# Drop duplicate headlines
data = data.drop(data[data['headline'].duplicated()].index, axis=0)

In [48]:
#rechecking for duplicate values
data['headline'].duplicated().sum()

0

In [49]:
# Drop unnecessary columns
data = data.drop(columns=['article_link'])

In [50]:
data.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [51]:
# Preprocessing functions
stop_words = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop_words.update(punctuation)

lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

def clean_text(text):
    words = text.split()
    words = [word.lower() for word in words]
    words = [re.sub(r'[^\w\s]', '', word) for word in words]
    words = [word for word in words if word.isalpha()]
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

data['clean_headline'] = data['headline'].apply(clean_text)


In [52]:
data.head()

Unnamed: 0,is_sarcastic,headline,clean_headline
0,1,thirtysomething scientists unveil doomsday clo...,thirtysometh scientist unveil doomsday clock h...
1,0,dem rep. totally nails why congress is falling...,dem rep total nail congress fall short gender ...
2,0,eat your veggies: 9 deliciously different recipes,eat veggi delici differ recip
3,1,inclement weather prevents liar from getting t...,inclement weather prevent liar get work
4,1,mother comes pretty close to using word 'strea...,mother come pretti close use word stream correct


In [53]:
# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(
    data['clean_headline'], data['is_sarcastic'], test_size=0.20, random_state=42
)

In [54]:
# Label encoding the target variable
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

print("Label Encoding - Sample labels:")
print("Original:", train_labels[:5].values)
print("Encoded:", train_labels_encoded[:5])

Label Encoding - Sample labels:
Original: [1 0 0 1 0]
Encoded: [1 0 0 1 0]


In [55]:
# Tokenization and One-Hot Encoding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)
vocab_size = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

max_length = max([len(seq) for seq in train_sequences])
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

print("One-Hot Encoding - Sample padded sequence:")
print(train_padded[0])


One-Hot Encoding - Sample padded sequence:
[   2  178 1320 3511 3240 4349 1657 1214  554  701   37  116    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [56]:
# Building the model using One-Hot Encoding
model_one_hot = Sequential()
model_one_hot.add(Embedding(vocab_size, 100, input_length=max_length))
model_one_hot.add(SpatialDropout1D(0.2))
model_one_hot.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_one_hot.add(Dense(1, activation='sigmoid'))

model_one_hot.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.001)

history_one_hot = model_one_hot.fit(train_padded, train_labels_encoded, epochs=5, batch_size=64,
                                    validation_data=(test_padded, test_labels_encoded), callbacks=[reduce_lr])

# Evaluate the model
accuracy_one_hot = model_one_hot.evaluate(test_padded, test_labels_encoded)
print("One-Hot Encoding Model Accuracy:", accuracy_one_hot[1])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
One-Hot Encoding Model Accuracy: 0.5346430540084839
