Add all necessary imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import re
import nltk
import json
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer



Download the required NLTK resources

In [2]:
# Resources used further down: the punkt / punkt_tab tokenizer data, the
# English stop-word list, WordNet (lemmatizer) and the VADER sentiment lexicon.
NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet', 'vader_lexicon', 'punkt_tab')

# nltk.download() signals failure through its boolean return value instead of
# raising, so collect the failures rather than silently discarding them.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_downloads:
    print(f"WARNING: could not download NLTK resources: {failed_downloads}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

Read data from file

In [3]:
# The file holds one JSON record per line, hence lines=True.
DATASET_PATH = '/content/Sarcasm_Headlines_Dataset.json'
df = pd.read_json(DATASET_PATH, lines=True)
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


Extract headlines and labels

In [4]:
# Pull the headline text and the is_sarcastic label column out of the frame
# as plain arrays.
sentences, labels = (df[column].values for column in ('headline', 'is_sarcastic'))

Declare the models and NLP helpers

In [5]:
# TF-IDF over 1- to 3-grams, capped at 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

# Random forest in which the sarcastic class (1) weighs twice as much as the
# non-sarcastic class (0); the fixed seed keeps runs reproducible.
classifier = RandomForestClassifier(
    class_weight={0: 1, 1: 2},
    random_state=42,
    n_estimators=100,
)

# Shared NLP helpers: lemmatizer, English stop-word set, sentiment analyzer.
lemmatizer = WordNetLemmatizer()
nltk_stopwords = set(stopwords.words('english'))
sia = SentimentIntensityAnalyzer()

In [6]:
# Lexical sarcasm cues grouped by category. Each cue maps to the weight that
# is added to the sarcasm score when the cue is found in a text.
sarcasm_patterns = {
    # Single-word emphasis markers.
    'intensifiers': dict(
        absolutely=0.6,
        totally=0.6,
        completely=0.6,
        literally=0.6,
        obviously=0.7,
        clearly=0.6,
        exactly=0.6,
    ),
    # Multi-word stock phrases.
    'phrases': {
        'yeah right': 0.8, 'oh really': 0.7, 'sure sure': 0.7,
        'oh great': 0.7, 'just what i need': 0.8, 'how wonderful': 0.7,
        'thanks a lot': 0.6, 'perfect timing': 0.7, 'story of my life': 0.6,
        'what a surprise': 0.6, 'just perfect': 0.7, 'cant wait': 0.5,
    },
    # Short exclamations; the very short ones carry the lowest weight.
    'interjections': dict(wow=0.5, yay=0.5, hurray=0.5, oh=0.3, ah=0.3),
}

Calculate the sarcasm score

In [7]:
def calculate_sarcasm_score(text, original_text):
    """Calculate a weighted sarcasm score from several surface indicators.

    Args:
        text: Text used for cue matching and sentiment analysis (``predict``
            passes the preprocessed text here).
        original_text: The raw input string; used for the casing and
            punctuation checks.

    Returns:
        The sum of the weights of every indicator that fired (0 if none did).
    """
    score = 0
    text_lower = text.lower()

    # Cue words/phrases must match whole words: a plain substring test would
    # let 'ah' fire inside "yeah" and 'oh' inside "john".
    for pattern_dict in sarcasm_patterns.values():
        for phrase, weight in pattern_dict.items():
            if re.search(r'\b' + re.escape(phrase) + r'\b', text_lower):
                score += weight

    # Fully upper-case words; single letters such as "I" or "A" do not count.
    uppercase_words = [word for word in original_text.split()
                       if word.isupper() and len(word) > 1]
    score += len(uppercase_words) * 0.4

    # Punctuation cues are read from the raw text. The tokenize-and-rejoin
    # step in preprocessing may separate runs such as "!!" or "??" with
    # spaces, which would hide them from this check.
    if re.search(r'(!{2,}|\?{2,}|\.{3,})', original_text):
        score += 0.5

    # Analyze sentiment
    sentiment = sia.polarity_scores(text)

    # Sentiment contrast: strongly positive wording mixed with negative wording.
    if sentiment['pos'] > 0.5 and sentiment['neg'] > 0.2:
        score += 0.6

    # Extreme positive sentiment (often indicates sarcasm).
    if sentiment['pos'] > 0.8:
        score += 0.4

    # Exclamation mark combined with shouting. Reuses uppercase_words so that a
    # lone "I" does not count as shouting, consistent with the rule above.
    if '!' in original_text and uppercase_words:
        score += 0.5

    if '...' in original_text:
        score += 0.3

    # Leading interjection, matched as a whole word so that e.g. "ohio" is
    # not mistaken for "oh".
    if re.match(r'(?:oh|ah|wow)\b', text_lower):
        score += 0.3

    return score

In [8]:
def predict(text):
    """Predict whether text is sarcastic using the weighted scoring system.

    Args:
        text: The input to classify. It is coerced to ``str`` by
            ``preprocess_text``, so non-string values are accepted.

    Returns:
        dict with keys:
            'is_sarcastic': True when the sarcasm score is at least 1.0.
            'confidence': half the score, capped at 0.99.
            'sarcasm_score': the raw score from ``calculate_sarcasm_score``.
            'features': sentiment scores of the processed text, the count of
                upper-case words, ellipsis / exclamation flags and the list
                of cue patterns found in the processed text.
    """
    processed_text, original_text = preprocess_text(text)

    # Calculate sarcasm score
    sarcasm_score = calculate_sarcasm_score(processed_text, original_text)

    # Determine if sarcastic based on score threshold
    is_sarcastic = sarcasm_score >= 1.0  # Adjusted threshold
    confidence = min(sarcasm_score / 2, 0.99)  # Normalize confidence

    # Lower-cased once here instead of once per pattern.
    processed_lower = processed_text.lower()

    return {
        'is_sarcastic': is_sarcastic,
        'confidence': confidence,
        'sarcasm_score': sarcasm_score,
        'features': {
            'sentiment': sia.polarity_scores(processed_text),
            'uppercase_words': len([word for word in original_text.split()
                                    if word.isupper() and len(word) > 1]),
            # Use the str()-coerced original: `in` on the raw argument raises
            # TypeError when the caller passes a non-string value.
            'has_ellipsis': '...' in original_text,
            'has_exclamation': '!' in original_text,
            # Whole-word matching, so 'ah' is not reported for "yeah".
            'pattern_matches': [pattern for pattern_dict in sarcasm_patterns.values()
                                for pattern in pattern_dict
                                if re.search(r'\b' + re.escape(pattern) + r'\b',
                                             processed_lower)]
        }
    }

In [9]:
def preprocess_text(text):
    """Normalize raw text for sarcasm scoring.

    Lower-cases the input, strips URLs and @mentions, replaces every character
    other than a-z, whitespace, '!', '?' and '.' with a space, tokenizes,
    drops English stop words and lemmatizes the remaining tokens.

    Args:
        text: The input to normalize; coerced with ``str()``.

    Returns:
        A ``(processed_text, original_text)`` tuple: the space-joined
        processed tokens, and ``str(text)`` with casing and punctuation intact.
    """
    original_text = str(text)
    text = original_text.lower()
    # The dot after "www" is escaped. Unescaped it matches any character, so
    # "awww so cute" lost "www so" as if it were a URL.
    text = re.sub(r'http\S+|www\.\S+|@\w+', '', text)
    # Text is already lower-cased, so a-z covers all letters; '.', '!' and '?'
    # are kept because the scoring step looks at punctuation.
    text = re.sub(r'[^a-z\s!?.]', ' ', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens
              if token not in nltk_stopwords]
    return ' '.join(tokens), original_text

In [10]:
# Hand-written samples: the first three are meant to be sarcastic, the last
# three are meant to be literal.
test_texts = [
    "Oh great, another meeting. Just what I needed to make my day even better.",
    "Wow, I’ve always wanted to spend my weekend doing absolutely nothing productive.",
    "Oh, absolutely, I'm sure your expertise on this subject is unparalleled. Who needs actual facts?",
    # Non-sarcastic sentences
    "I’m looking forward to our meeting today, it should be productive.",
    "I have some exciting plans for the weekend, I can’t wait to get started.",
    "I think the idea has potential and could really succeed if we focus on the details.",
]

print("Testing with example texts:")
# Only the verdict is printed; each result dict also carries 'confidence',
# 'sarcasm_score' and 'features' for closer inspection.
for text, result in zip(test_texts, map(predict, test_texts)):
    print(f"\nText: {text}")
    print(f"Is sarcastic: {result['is_sarcastic']}")

Testing with example texts:

Text: Oh great, another meeting. Just what I needed to make my day even better.
Is sarcastic: True

Text: Wow, I’ve always wanted to spend my weekend doing absolutely nothing productive.
Is sarcastic: True

Text: Oh, absolutely, I'm sure your expertise on this subject is unparalleled. Who needs actual facts?
Is sarcastic: True

Text: I’m looking forward to our meeting today, it should be productive.
Is sarcastic: False

Text: I have some exciting plans for the weekend, I can’t wait to get started.
Is sarcastic: False

Text: I think the idea has potential and could really succeed if we focus on the details.
Is sarcastic: False
