### Dataset source: https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection

In [1]:
# Importing Libraries
import pandas as pd
import re

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

#import nltk
#nltk.download('wordnet')

In [3]:
# Load the headline corpus. `lines=True` makes pandas parse the file as one
# JSON record per line rather than as a single JSON document.
DATASET_PATH = 'Sarcasm_Headlines_Dataset.json'
data = pd.read_json(DATASET_PATH, lines=True)
print(data.head())  # preview of article_link / headline / is_sarcastic

                                        article_link  \
0  https://www.huffingtonpost.com/entry/versace-b...   
1  https://www.huffingtonpost.com/entry/roseanne-...   
2  https://local.theonion.com/mom-starting-to-fea...   
3  https://politics.theonion.com/boehner-just-wan...   
4  https://www.huffingtonpost.com/entry/jk-rowlin...   

                                            headline  is_sarcastic  
0  former versace store clerk sues over secret 'b...             0  
1  the 'roseanne' revival catches up to our thorn...             0  
2  mom starting to fear son's web series closest ...             1  
3  boehner just wants wife to listen, not come up...             1  
4  j.k. rowling wishes snape happy birthday in th...             0  


In [3]:
# Separate the model input (headline text) from the target (is_sarcastic flag).
sentences, labels = data['headline'], data['is_sarcastic']
# Both columns come from the same frame, so the two shapes printed must match.
for series in (sentences, labels):
    print(series.shape)

(26709,)
(26709,)


In [4]:
# Data Pre-Processing
# Keep only the letters a-z; every other character (punctuation, digits,
# apostrophes, ...) is replaced by a space so neighbouring tokens stay apart.
# Lower-case first, the same way the single-headline prediction code does:
# otherwise an upper-case letter would be blanked out by the pattern instead
# of being kept, and training/prediction text would be cleaned differently.
NON_LETTERS = re.compile('[^a-z]')
print(sentences[13])  # sample headline before cleaning
sentences = [NON_LETTERS.sub(' ', sentence.lower()) for sentence in sentences]
print(sentences[13])  # the same headline after cleaning

actually, cnn's jeffrey lord has been 'indefensible' for a while
actually  cnn s jeffrey lord has been  indefensible  for a while


In [5]:
# Tokenise each cleaned headline on whitespace, lemmatise every token, and
# glue the tokens back into one space-separated string per headline.
print(sentences[:2])  # cleaned strings
tokens_per_headline = [text.split() for text in sentences]
sentences = tokens_per_headline
print(sentences[:2])  # lists of tokens
# The lemmatizer is created once and reused later for single-headline prediction.
wnl = WordNetLemmatizer()
sentences = [
    ' '.join(map(wnl.lemmatize, tokens))  # lemmatize() called with its defaults
    for tokens in tokens_per_headline
]
print(sentences[:2])  # lemmatised strings

['former versace store clerk sues over secret  black code  for minority shoppers', 'the  roseanne  revival catches up to our thorny political mood  for better and worse']
[['former', 'versace', 'store', 'clerk', 'sues', 'over', 'secret', 'black', 'code', 'for', 'minority', 'shoppers'], ['the', 'roseanne', 'revival', 'catches', 'up', 'to', 'our', 'thorny', 'political', 'mood', 'for', 'better', 'and', 'worse']]
['former versace store clerk sue over secret black code for minority shopper', 'the roseanne revival catch up to our thorny political mood for better and worse']


In [6]:
# TF-IDF Vectorizer
# Cap the vocabulary at 5000 terms, learn it from the lemmatised headlines and
# turn every headline into one row of TF-IDF weights.
tv = TfidfVectorizer(max_features=5000)
tfidf_matrix = tv.fit_transform(sentences)
# NOTE: toarray() materialises the full dense (n_headlines, 5000) matrix.
sentences = tfidf_matrix.toarray()
print(sentences.shape)
print(tv.vocabulary_)  # term -> column index mapping learned by the vectorizer

(26709, 5000)


In [8]:
# Hold out 5% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    sentences,
    labels,
    test_size=0.05,
    random_state=42,
)

In [9]:
# Model Training
# Pin the seed: LinearSVC's dual solver uses a random number generator, so
# without a fixed random_state the fitted model and both scores below can
# change slightly from one run to the next. 42 matches the seed of the split.
lsvc = LinearSVC(random_state=42)
lsvc.fit(X_train, Y_train)
# score() is evaluated on the rows the model was fitted on and on the held-out
# rows; comparing the two shows how well the model generalises.
print("Training Score:", lsvc.score(X_train, Y_train)) # Training Score
print("Testing Score:", lsvc.score(X_test, Y_test)) # Testing Score

Training Score: 0.9129389508532693
Testing Score: 0.8330838323353293


In [11]:
# Testing Model
def vectorize_headline(text):
    """Turn one raw headline into the feature row the trained model expects.

    Applies the same steps used on the training headlines: lower-case, replace
    every character outside a-z with a space, split on whitespace, lemmatise
    each token with the notebook's `wnl`, re-join, and transform with the
    already-fitted `tv` vectorizer.

    Parameters
    ----------
    text : str
        The raw headline, in any letter case.

    Returns
    -------
    Dense array with a single row, as produced by `tv.transform(...).toarray()`.
    """
    cleaned = re.sub('[^a-z]', ' ', text.lower())
    lemmas = [wnl.lemmatize(word) for word in cleaned.split()]
    # transform() expects an iterable of documents, hence the one-element list.
    return tv.transform([' '.join(lemmas)]).toarray()


# https://www.theonion.com/coronavirus-assumed-white-house-would-be-bigger-in-pers-1843412682
headline_text = 'Coronavirus Assumed White House Would Be Bigger In Person'
headline = vectorize_headline(headline_text)
predicted = lsvc.predict(headline)
# predict() returns one label per input row; there is exactly one row here, so
# read that label explicitly instead of relying on the truth value of an array.
if predicted[0] == 1:
    print("Sarcasm")
else:
    print("Not Sarcasm")

Sarcasm
