Add all necessary imports

In [6]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
!pip install google-generativeai
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import google.generativeai as genai
import os
import re
import nltk
import json
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer





Download nltk modules

In [35]:
# used to get code recommendations in colab
os.environ['colab_genai']="AIzaSyA-S6QatpzHMXGXErBkXSGMvTAxaZI3rMU"
genai.configure(api_key=os.environ['colab_genai'])

In [9]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

Read data from file

In [10]:
df = pd.read_json('/content/Sarcasm_Headlines_Dataset_v2.json', lines = True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


Output data

In [11]:
sentences = df['headline'].values
labels = df['is_sarcastic'].values
print(len(labels))
print(sentences)

28619
['thirtysomething scientists unveil doomsday clock of hair loss'
 'dem rep. totally nails why congress is falling short on gender, racial equality'
 'eat your veggies: 9 deliciously different recipes' ...
 'the most beautiful acceptance speech this week came from a queer korean'
 'mars probe destroyed by orbiting spielberg-gates space palace'
 'dad clarifies this not a food stop']


In [12]:
print("Total number of non sarcastic sentences " + str(len(df[df['is_sarcastic'] == 0])))

Total number of non sarcastic sentences 14985


In [13]:
print("Total number of sarcastic sentences " + str(len(df[df['is_sarcastic'] == 1])))

Total number of sarcastic sentences 13634


Process data

In [14]:
punctuations = string.punctuation

In [15]:
processed_x = []
for sentence in sentences:
  new_sentence = ''
  for character in sentence:
    if character not in punctuations:
      character = character.lower()
      new_sentence += character
  processed_x.append(new_sentence)


Remove stop words

In [16]:
nltk_stopwords = set(stopwords.words('english'))
processed_X = []
for sentence in processed_x:
  new_sentence_processed = ''
  for word in sentence.split():
    if word not in nltk_stopwords:
      new_sentence_processed += word + ' '
  processed_X.append(new_sentence)

declare models

In [17]:
# Extended sarcasm indicators with weights
sarcasm_patterns = {
            'intensifiers': {
                'absolutely': 0.6,
                'totally': 0.6,
                'completely': 0.6,
                'literally': 0.6,
                'obviously': 0.7,
                'clearly': 0.6,
                'exactly': 0.6
            },
            'phrases': {
                'yeah right': 0.8,
                'oh really': 0.7,
                'sure sure': 0.7,
                'oh great': 0.7,
                'just what i need': 0.8,
                'how wonderful': 0.7,
                'thanks a lot': 0.6,
                'perfect timing': 0.7,
                'story of my life': 0.6,
                'what a surprise': 0.6,
                'just perfect': 0.7,
                'cant wait': 0.5
            },
            'interjections': {
                'wow': 0.5,
                'yay': 0.5,
                'hurray': 0.5,
                'oh': 0.3,
                'ah': 0.3
            }
        }

Bag of words

In [18]:
vectorizer = CountVectorizer(max_features = 1000)
vectorizer.fit(processed_X)

In [19]:
x_split = vectorizer.transform(processed_x)

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x_split, labels)

In [21]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(21464, 6) (7155, 6) (21464,) (7155,)


In [23]:
classifier = RandomForestClassifier(
            n_estimators=500, random_state = 42
        )
classifier.fit(x_train, y_train)

In [33]:
def is_sarcastic(text):
    model = genai.GenerativeModel('gemini-pro')
    prompt = f"""Analyze the given text and determine whether it expresses sarcasm. Consider elements like tone, contradiction, and implied meaning. Respond with either "Sarcastic" or "Not Sarcastic."\n\nText: {text}"""
    response = model.generate_content(prompt)
    return response.text.split("\n")[0]

In [26]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, x_train, y_train, cv=10)
print("Cross-validation scores:", scores)

Cross-validation scores: [0.52678156 0.52631579 0.53097345 0.52165813 0.52423113 0.52562908
 0.52050326 0.52935694 0.53541473 0.52656104]


In [28]:
prediction = classifier.predict(x_test)

In [30]:
print("accuracy " + str(accuracy_score(y_test, prediction)))
print(confusion_matrix(y_test, prediction))

accuracy 0.5298392732354996
[[3643  129]
 [3235  148]]


In [31]:
def process_sentence(sentence, stopwords_set):
  processed_sentence = ''
  for character in sentence:
    if character not in string.punctuation:
      character = character.lower()
      processed_sentence += character
  new_sentence = ''
  for word in processed_sentence.split():
    if word not in stopwords_set:
      new_sentence += word + ' '

  return new_sentence

def predict_sentence(sentence, classifier, vectorizer, stopwords_set):
    processed_sentence = process_sentence(sentence, stopwords_set)
    vector_sentence = vectorizer.transform([processed_sentence])
    prediction = classifier.predict(vector_sentence)

    if is_sarcastic(sentence) == "Sarcastic":
      return "Sarcastic"
    else:
      return "Not Sarcastic"




In [36]:
test_texts = [
        "Oh great, another meeting. Just what I needed to make my day even better.",
        "Wow, I’ve always wanted to spend my weekend doing absolutely nothing productive.",
        "Oh, absolutely, I'm sure your expertise on this subject is unparalleled. Who needs actual facts?",
        #Non-Sarcastic Sentences
        "I’m looking forward to our meeting today, it should be productive.",
        "I have some exciting plans for the weekend, I can’t wait to get started.",
        "I think the idea has potential and could really succeed if we focus on the details."
]

In [41]:
for test_text in test_texts:
  result = predict_sentence(test_text, classifier, vectorizer, set(stopwords.words('english')))
  print(test_text)
  print(result)

Oh great, another meeting. Just what I needed to make my day even better.
Sarcastic
Wow, I’ve always wanted to spend my weekend doing absolutely nothing productive.
Sarcastic
Oh, absolutely, I'm sure your expertise on this subject is unparalleled. Who needs actual facts?
Sarcastic
I’m looking forward to our meeting today, it should be productive.
Not Sarcastic
I have some exciting plans for the weekend, I can’t wait to get started.
Not Sarcastic
I think the idea has potential and could really succeed if we focus on the details.
Not Sarcastic
