In [None]:
%pip install pandas
%pip install stanfordnlp
%pip install senticnet
%pip install sentistrength
%pip install nltk
%pip install spacy
%pip install sklearn
%pip install numpy

# run this in the terminal
# python -m spacy download en_core_web_sm

In [1]:
import json
import pandas as pd
from stanfordcorenlp import StanfordCoreNLP
import requests
from senticnet.senticnet import SenticNet
from sentistrength import PySentiStr
import nltk
import spacy
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


In [2]:
nltk.download('punkt')
# stanfordNLP = StanfordCoreNLP("http://localhost", port=8000, timeout=30000)
spacyNLP = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# download the zip file from the link https://drive.google.com/file/d/1yvCpB2URy0iFjQPn3RmidNOryTlo6vHG/view?usp=share_link
# extract the zip file and place the folder in the same directory as this file then cd into the folder
# run the following command in the terminal to start the server
# java -mx4g -cp "*" edu.stanford.stanfordNLP.pipeline.StanfordCoreNLPServer -port {8000 or any port} -timeout 30000
# can speed it up by replace 4g with 8g (it represents the ram being used in gigs)
def lemmatize(text):
    # perform lemmatization
    lemmas = []
    output = stanfordNLP.annotate(text, properties={'annotators': 'tokenize,lemma', 'outputFormat': 'json'})
    output_dict = json.loads(output)
    tokens = output_dict['sentences'][0]['tokens']
    for token in tokens:
        lemmas.append(token['lemma'])
   
    return lemmas  

#### Converting the given JSON file into actual JSON format for easier readbility

In [None]:
writeFile = open("Sarcasm_Headlines.json", "w")
writeFile.write("{ \"headlines\": [")
with open("Sarcasm_Headlines_Dataset.json") as readFile:
  for item in readFile:
    writeFile.write(item + ",")
# removed the final comma manually
writeFile.write("]}")
readFile.close()
writeFile.close()

# Preprocessing Stage

#### Reading the dataset and removing all article links as our goal is to analyze the headlines for sarcasm

In [None]:
dataset = json.load(open("Sarcasm_Headlines.json"))
df = pd.DataFrame(dataset["headlines"])
df.drop(["article_link"], axis = 1, inplace = True)
df.head()

#### lemmatizing the dataset

In [None]:
def lemmatizeDataset():
    for index, row in df.iterrows():
        sentence = row['headline']
        row['headline'] = lemmatize(sentence)

lemmatizeDataset()
df.head()

#### writing to a csv file to avoid having to perform pre-processing again

In [None]:
df.to_csv('lemmatized.csv', index=False)

In [3]:
df = pd.read_csv("lemmatized.csv")
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


# Module 1 => Concept Level and Common Sense Knowledge
### ConceptNet
ConceptNet is a semantic network consisting of common-sense knowledge and concepts, represented<br> in the form of nodes (words or
short phrases) and labeled edges (relationships) between them.

In [5]:

# set the API endpoint and parameters
endpoint = 'http://api.conceptnet.io/c/en/'
params = {
    'filter': 'core',
    'limit': 1000
}
def conceptNet(sentence):
    # send a GET request to the API endpoint
    response = requests.get(endpoint + sentence, params=params)

    # parse the JSON response
    data = json.loads(response.text)
    edges = data['edges']
    edges.sort(key=lambda x: x['weight'], reverse=True)

    return edges

# Module 2 => Sentiment Score
### SentiStrength
SentiStrength is a sentiment lexicon that uses linguistic information and rules to detect<br>
sentiment strength in English text. SentiStrength provides positive and negative sentiment<br>
scores for each word. Both scores are integers from 1 to 5, where 1 signifies weak sentiment<br>
and 5 signifies strong sentiment.
<br>
polarity = positiveSentiment - negativeSentiment

### SenticNet
SenticNet is a resource for opinion mining that aims to create a collection of commonly<br> 
used common-sense concepts  with positive and negative sentiment scores. The sentiment <br>
score for each word is scaled from -1 to 1, where -1 signifies strongly negative sentiment,<br>
0 signifies neutral sentiment and 1 signifies strong positive sentiment.
<br> sentiment = score * 5 (in-order to keep it with sentiStrength)

### Rules of w_score (sentiment score) selection:
- if word belongs to SentiStrength || SenticNet => pick the score whichever exists
- if word belongs to SentiStrength && SenticNet => avg score of the lexicons
- else get the concepts from concept net to expand the meaning => select top 5 ranked and calculate the avg sentiment score

### Final Calculation
sum_pos_score = sum of all positive sentiment scores<br>
sum_neg_score = sum of all negative sentiment scores<br>
if sum_pos_score && sum_neg_score > 0, there is a contradiction in the sentence

In [6]:
sn = SenticNet()
def senticNetScore(word):
    try:
        polarityValue = sn.polarity_value(word)
        return float(polarityValue) * 5
    except KeyError:
        return None

In [7]:
senti = PySentiStr()
# got the jar file and data folder from the author (also reverse engineered the pysenti package to extract the jar file)
senti.setSentiStrengthPath('D:/Sarcasm_Detection-Feature_Selection/SentiStrengthCom.jar')
senti.setSentiStrengthLanguageFolderPath('D:/Sarcasm_Detection-Feature_Selection/SentStrength_Data')
def sentiStrengthScore(word):
    result = senti.getSentiment(word)
    return result

In [27]:
def wScore(word):
    senticNet = senticNetScore(word)
    sentiStrength = sentiStrengthScore(word)[0]
    if senticNet == None and sentiStrength == None:
        expansion = conceptNet(word)
        if len(expansion) == 0:
            return 0
        else:
            score = 0
            expansion = expansion[:5]
            for edge in expansion:
                score += wScore(edge['end']['label'])
            return score / 5
    elif senticNet == None:
        return sentiStrength
    elif sentiStrength == None:
        return senticNet
    else:
        return (senticNet + sentiStrength) / 2

In [9]:
def positiveScore(results):
    score = 0
    for result in results:
        if result > 0:
            score += result
    return score
def negativeScore(results):
    score = 0
    for result in results:
        if result < 0:
            score += result
    return score

# Module 3 => Sentence Coherence
Checking the coreference between subjects or objects of a sentence
<br> for two subjects w1 and w2, sentence is coherent if
- if w1 is antecedent of w2
- if w1 and w2 are identical pronouns
- if w1 and w2 are identical subjects
- w2 starts with the word "the" (Definite Noun Phrase)
- w2 starts with "this", "that", "these", "those" (Demonstrative Noun Phrases)
- if w1 and w2 are proper nouns

In [10]:
def extractSubject(sentence):
    doc = spacyNLP(sentence)
    subject = None
    for token in doc:
        if token.dep_ == "nsubj":
            subject = token.text
    return subject

In [11]:
def hasAntecedents(text):
    doc = spacyNLP(text)
    antecedents = []
    for token in doc:
        if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
            for mention in doc.ents:
                if mention.start <= token.i < mention.end:
                    antecedents.append(mention.text)
    return True if len(antecedents) > 0 else False


In [12]:
pronounLemmatizer = WordNetLemmatizer()
def identicalPronouns(w1, w2):
    lemma1 = pronounLemmatizer.lemmatize(w1, 'n')
    lemma2 = pronounLemmatizer.lemmatize(w2, 'n')
    if lemma1 == lemma2:
        return True
    else:
        return False
    

In [13]:
def identicalSubjects(w1,w2):
    cleanedSubject1 = re.sub(r'[^a-zA-Z]', '', w1)
    cleanedSubject2 = re.sub(r'[^a-zA-Z]', '', w2)
    if cleanedSubject1 == cleanedSubject2:
        return True
    else:
        return False

In [14]:
def definiteNounPhraseFeature(text,w2):
    doc = nltk.word_tokenize(text)
    for i in range(len(doc)):
        if i-1 >= 0 and doc[i] == w2:
            if doc[i-1] == 'the':
                return True
    return False

In [15]:
def demonstrativeNounPhraseFeature(text,w2):
    doc = nltk.word_tokenize(text)
    for i in range(len(doc)):
        if doc[i] == w2:
            if i-1 >= 0 and doc[i-1] == 'this' or doc[i-1] == 'that' or doc[i-1] == 'these' or doc[i-1] == 'those':
                return True
    return False

In [16]:
def properNameFeature(w1,w2):
    taggedWords = nltk.pos_tag([w1,w2])
    proper = False
    for word, tag in taggedWords:
        if tag in ['NNP', 'NNPS']:
            proper = True
        else:
            proper = False
            break
    return proper

# Module 4 => Creation of Feature Vector

### N-Grams Features


In [17]:
# headlines = df.drop('is_sarcastic',axis='columns')
# sentences = df['headline']
def remove_symbols(line):
    return ''.join(ch for ch in line if ch.isalnum() or ch == " ")

# def createNgrams(sentence):
#     ngrams = []
#     sentence = remove_symbols(sentence)
#     tokens = nltk.word_tokenize(sentence)
#     bigrams = list(nltk.bigrams(tokens))
#     trigrams = list(nltk.trigrams(tokens))
#     return tokens, bigrams, trigrams

### Creating Feature Space

In [26]:
# def createFeatureSpaces():
#     uni = []
#     bi = []
#     tri = []
#     for sentence in sentences:
#         unigram, bigrams, trigrams = createNgrams(sentence)
#         for word in unigram:
#             uni.append(word)
#         for word in bigrams:
#             bi.append("{} {}".format(word[0], word[1]))
#         for word in trigrams:
#             tri.append("{} {} {}".format(word[0], word[1], word[2]))
#     uni = list(set(uni))
#     bi = list(set(bi))
#     tri = list(set(tri))
#     print("Length of unigrams: ",len(uni))
#     print("Length of bigrams: ",len(bi))
#     print("Length of trigrams: ",len(tri))
#     data = {col: np.zeros(len(df)) for col in uni}
#     print("Created Unigrams")
#     unigramSpace = pd.DataFrame(data)
#     print("Created Unigrams")
#     data = {col: np.zeros(len(df)) for col in bi}
#     print("Created Bigrams")
#     bigramSpace = pd.DataFrame(data)
#     print("Created Bigrams")
#     data = {col: np.zeros(len(df)) for col in tri}
#     print("Created Bigrams")
#     trigramSpace = pd.DataFrame(data)
#     print("Created Trigrams")
#     return unigramSpace, bigramSpace, trigramSpace
# # unigramSpace, bigramSpace, trigramSpace = createFeatureSpaces()
# vectorizer = CountVectorizer(ngram_range=(1,3))
# res = vectorizer.fit_transform(sentences)
# print(res.toarray())

MemoryError: Unable to allocate 74.9 GiB for an array with shape (26709, 376536) and data type int64

- Contradiction Feature: <br>
<emsp>We use two binary features Contra and Contra_Coher<br>
<emsp>Contra if headline has one sentence and contradiction in sentiment score occur
<br>
<emsp>Contra_Coher if headline has more than one sentence, contradiction of polarity and the headline is judged coherent<br>
- Sentiment Feature <br>
<emsp>Calculates the +ve and -ve score of the headline and then classify it as low/med/high
- Punctuation <br>
<emsp>We use 7 indicators<br><br>
    <emsp><emsp>1. Number of emoticons <br>
    <emsp><emsp>2. Number of repetitive sequence of punctuations<br>
    <emsp><emsp>3. Number of repetitive sequence of characters<br>
    <emsp><emsp>4. Number of capitalized word<br>
    <emsp><emsp>5. Number of slang and booster words<br>
    <emsp><emsp>6. Number of exclamation marks<br>
    <emsp><emsp>7. Number of idioms<br

In [29]:
df["CONTRA"] = np.zeros(len(df))
df["CONTRA_PLUS_COHER"] = np.zeros(len(df))
df["pos_low"] = np.zeros(len(df))
df["pos_med"] = np.zeros(len(df))
df["pos_high"] = np.zeros(len(df))
df["neg_low"] = np.zeros(len(df))
df["neg_med"] = np.zeros(len(df))
df["neg_high"] = np.zeros(len(df))


In [28]:
def calculate_scores(sentence):
    print("Sentence: ",sentence)
    score=[]
    results = []
    for word in nltk.word_tokenize(sentence):
        results.append(wScore(word))
    positiveSum = positiveScore(results)
    negativeSum = negativeScore(results)
    score.append(positiveSum)
    score.append(negativeSum)
    print("positiveScore: ",positiveSum)
    print("negativeScore: ",negativeSum)
    return score

def isContradiction(scores):
    if scores[0]!=0 and scores[1]!=0:
        return True
    return False

def checkCoherence(sentence):
    tokens = nltk.sent_tokenize(sentence)
    if len(tokens) > 1:
        if hasAntecedents(sentence):
            return True
        w1 = extractSubject(tokens[0])
        w2 = extractSubject(tokens[1])
        if identicalPronouns(w1,w2) or identicalSubjects(w1,w2) or definiteNounPhraseFeature(tokens[1],w2) or demonstrativeNounPhraseFeature(tokens[1],w2) or properNameFeature(w1,w2):
            return True   
    return False

def assignSentimentFeature(headline,scores):
    positiveScore = scores[0]
    negativeScore = scores[1]
    if positiveScore <= -1:
        df.loc[df["headline"] == headline, "pos_low"] = 1
    elif positiveScore >= 0 and positiveScore <= 1:
        df.loc[df["headline"] == headline, "pos_med"] = 1
    elif positiveScore >= 2:
        df.loc[df["headline"] == headline, "pos_high"] = 1
    if negativeScore >= 1:
        df.loc[df["headline"] == headline, "neg_low"] = 1
    elif negativeScore >= 0 and negativeScore <= 1:
        df.loc[df["headline"] == headline, "neg_med"] = 1
    elif negativeScore <= -2:
        df.loc[df["headline"] == headline, "neg_high"] = 1
def contradictionFeature():
    for headline in df["headline"]:
        headline = remove_symbols(headline)
        sentences = nltk.sent_tokenize(headline)
        scores = calculate_scores(headline)
        assignSentimentFeature(headline,scores)
        if len(sentences) > 1:
            print("CONTRA_PLUS_COHER")
            if isContradiction(scores) and checkCoherence(headline):
                df.loc[df["headline"] == headline, "CONTRA_PLUS_COHER"] = 1
            else:
                df.loc[df["headline"] == headline, "CONTRA_PLUS_COHER"] = 0
        else:
            print("CONTRA")
            if isContradiction(scores):
                df.loc[df["headline"] == headline, "CONTRA"] = 1
            else:
                df.loc[df["headline"] == headline, "CONTRA"] = 0
contradictionFeature()
print("success")


Sentence:  former versace store clerk sues over secret black code for minority shoppers
positiveScore:  2.1725000000000003
negativeScore:  -1.6
CONTRA
Sentence:  the roseanne revival catches up to our thorny political mood for better and worse
positiveScore:  7.704999999999999
negativeScore:  -8.399999999999999
CONTRA
Sentence:  mom starting to fear sons web series closest thing she will have to grandchild
positiveScore:  4.290000000000001
negativeScore:  -3.1500000000000004
CONTRA
Sentence:  boehner just wants wife to listen not come up with alternative debtreduction ideas
positiveScore:  4.2425
negativeScore:  0
CONTRA
Sentence:  jk rowling wishes snape happy birthday in the most magical way
positiveScore:  9.002500000000001
negativeScore:  0
CONTRA
Sentence:  advancing the worlds women
positiveScore:  0
negativeScore:  0
CONTRA
Sentence:  the fascinating case for eating labgrown meat
positiveScore:  5.905
negativeScore:  0
CONTRA
Sentence:  this ceo will send your kids to school if 

KeyboardInterrupt: 