In [None]:
import os
import numpy as np
import pandas as pd
import re, string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense, Bidirectional, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Download necessary NLTK data
# 'stopwords' backs the stopwords.words('english') lookup used by the
# preprocessing cell further down.
# NOTE(review): nothing in the visible cells reads the 'wordnet' corpus —
# confirm a later cell needs it before keeping this download.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load data
# The dataset location can be overridden with the SARCASM_DATA_PATH
# environment variable so the notebook also runs outside Colab; when the
# variable is unset the original Colab upload path is used unchanged.
DATA_PATH = os.environ.get('SARCASM_DATA_PATH', '/content/Sarcasm_Headlines_Dataset.json')
# lines=True: the file is parsed as one JSON object per line.
data = pd.read_json(DATA_PATH, lines=True)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [None]:
# Summarize column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


In [None]:
# (rows, columns) of the loaded frame.
data.shape

(28619, 3)

In [None]:
# Count missing values per column. This runs on the full dataset: no
# train/test split has been made at this point.
data.isnull().sum()

is_sarcastic    0
headline        0
article_link    0
dtype: int64

In [None]:
# Class balance: number of rows for each is_sarcastic value.
data.is_sarcastic.value_counts()

is_sarcastic
0    14985
1    13634
Name: count, dtype: int64

In [None]:
# Count rows whose headline text repeats an earlier row's headline
# (the first occurrence of each headline is not counted).
data['headline'].duplicated().sum()

116

In [None]:
# Drop duplicate headlines
# Collect the index labels of every repeated headline (the first occurrence
# is kept) and remove those rows by label.
data = data.drop(index=data.index[data['headline'].duplicated()])

In [None]:
# Re-run the duplicate count after the drop; the result should now be 0.
data['headline'].duplicated().sum()

0

In [None]:
# Drop unnecessary columns
# errors='ignore' keeps this cell re-runnable: once 'article_link' has been
# removed, executing the cell again is a no-op instead of raising KeyError.
data = data.drop(columns=['article_link'], errors='ignore')

In [None]:
# Preview the frame after the article_link column has been dropped.
data.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [None]:
# Preprocessing functions
# `stop` is the filter vocabulary: NLTK's English stopwords plus every
# single ASCII punctuation character from string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

def split_into_words(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

def to_lower_case(words):
    """Return a new list holding the lower-cased form of each item in ``words``."""
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered

def remove_punctuation(words):
    """Strip every ``string.punctuation`` character from each item in ``words``.

    The list keeps its length: an item made up only of punctuation becomes
    an empty string rather than being removed.
    """
    # Deletion-only translation table: each punctuation character maps to None.
    strip_table = str.maketrans('', '', string.punctuation)
    cleaned = []
    for token in words:
        cleaned.append(token.translate(strip_table))
    return cleaned

def keep_alphabetic(words):
    """Return only the items of ``words`` for which ``str.isalpha()`` is true.

    Empty strings and items containing digits or other non-letters are dropped.
    """
    alphabetic = []
    for token in words:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

def remove_stopwords(words):
    """Return the items of ``words`` that are not members of the module-level ``stop`` set."""
    kept = []
    for token in words:
        if token not in stop:
            kept.append(token)
    return kept

def to_sentence(words):
    """Join the items of ``words`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(words)

def denoise_text(text):
    """Clean one headline and return it as a single space-joined string.

    The text is split on whitespace, then each cleaning step below is applied
    to the token list in order: lower-casing, punctuation stripping, dropping
    non-alphabetic tokens, and stopword removal.
    """
    # NOTE(review): punctuation is stripped before the stopword filter runs,
    # so a contraction such as "don't" reaches the filter as "dont" — confirm
    # that form is still matched by `stop`.
    cleaning_steps = (
        to_lower_case,
        remove_punctuation,
        keep_alphabetic,
        remove_stopwords,
    )
    tokens = split_into_words(text)
    for step in cleaning_steps:
        tokens = step(tokens)
    return to_sentence(tokens)

In [None]:
# Apply text cleaning
# Run denoise_text over every headline; the raw 'headline' column is kept
# and the cleaned text is stored alongside it as 'news_headline'.
data['news_headline'] = data['headline'].map(denoise_text)

In [None]:
# Compare the raw 'headline' column with the cleaned 'news_headline' column.
data.head()

Unnamed: 0,is_sarcastic,headline,news_headline
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,dem rep totally nails congress falling short g...
2,0,eat your veggies: 9 deliciously different recipes,eat veggies deliciously different recipes
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,mother comes pretty close using word streaming...


In [None]:
# Split the data into training and testing sets
# Features are the cleaned headlines, targets are the is_sarcastic labels.
# 20% of the rows are held out for testing; the fixed seed makes the split repeatable.
train_data, test_data, train_labels, test_labels = train_test_split(
    data.news_headline,
    data.is_sarcastic,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Label encoding the target variable
# The encoder learns its classes from the training labels only and is then
# used to transform both splits.
label_encoder = LabelEncoder().fit(train_labels)
train_labels_encoded = label_encoder.transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

In [None]:
# TF-IDF Vectorization
# The vocabulary (capped at 12000 features) and IDF weights are learned from
# the training headlines only; the test headlines are projected onto them.
tfidf_vectorizer = TfidfVectorizer(max_features=12000).fit(train_data)
train_tfidf = tfidf_vectorizer.transform(train_data)
test_tfidf = tfidf_vectorizer.transform(test_data)

In [None]:
# Naive Bayes
# Fit on the training TF-IDF matrix, then score on the held-out split.
nb_model = MultinomialNB().fit(train_tfidf, train_labels_encoded)
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(y_true=test_labels_encoded, y_pred=nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.8014383441501491


In [None]:
# Logistic Regression
# max_iter is raised to 1000 iterations for the solver.
# Fit on the training TF-IDF matrix, then score on the held-out split.
lr_model = LogisticRegression(max_iter=1000).fit(train_tfidf, train_labels_encoded)
lr_predictions = lr_model.predict(test_tfidf)
lr_accuracy = accuracy_score(y_true=test_labels_encoded, y_pred=lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.7988072268023154


In [None]:
# Random Forest Classifier
# random_state pins the forest's internal randomness so the reported
# accuracy is the same on every run; 42 matches the seed already used for
# the train/test split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_tfidf, train_labels_encoded)
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_labels_encoded, rf_predictions)
print("Random Forest Classifier Accuracy:", rf_accuracy)

Random Forest Classifier Accuracy: 0.7742501315558674


In [None]:
# Decision Tree Classifier
# random_state pins the tree's internal randomness so the reported accuracy
# is the same on every run; 42 matches the seed already used for the
# train/test split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_tfidf, train_labels_encoded)
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_labels_encoded, dt_predictions)
print("Decision Tree Classifier Accuracy:", dt_accuracy)

Decision Tree Classifier Accuracy: 0.7182950359586038


In [None]:
# Support Vector Machine (SVM)
# Linear-kernel SVC fitted on the training TF-IDF matrix, scored on the held-out split.
svm_model = SVC(kernel='linear').fit(train_tfidf, train_labels_encoded)
svm_predictions = svm_model.predict(test_tfidf)
svm_accuracy = accuracy_score(y_true=test_labels_encoded, y_pred=svm_predictions)
print("Support Vector Machine Accuracy:", svm_accuracy)

Support Vector Machine Accuracy: 0.7982810033327487


In [None]:
# Gradient Boosting Classifier
# random_state seeds the tree estimators built at each boosting iteration so
# the reported accuracy is the same on every run; 42 matches the seed
# already used for the train/test split.
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(train_tfidf, train_labels_encoded)
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_labels_encoded, gb_predictions)
print("Gradient Boosting Classifier Accuracy:", gb_accuracy)

Gradient Boosting Classifier Accuracy: 0.6797053148570427
