In [7]:
%pip install pandas
%pip install stanfordnlp
%pip install senticnet
%pip install sentistrength

In [51]:
import json
import pandas as pd
from stanfordcorenlp import StanfordCoreNLP
import requests
from senticnet.senticnet import SenticNet
from sentistrength import PySentiStr

In [9]:
# download the zip file from the link https://drive.google.com/file/d/1yvCpB2URy0iFjQPn3RmidNOryTlo6vHG/view?usp=share_link
# extract the zip file and place the folder in the same directory as this file then cd into the folder
# run the following command in the terminal to start the server
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {8000 or any port} -timeout 30000
# can speed it up by replace 4g with 8g (it represents the ram being used in gigs)
nlp = StanfordCoreNLP("http://localhost", port=8000, timeout=30000)
def lemmatize(text):
    # perform lemmatization
    lemmas = []
    output = nlp.annotate(text, properties={'annotators': 'tokenize,lemma', 'outputFormat': 'json'})
    output_dict = json.loads(output)
    tokens = output_dict['sentences'][0]['tokens']
    for token in tokens:
        lemmas.append(token['lemma'])
   
    return lemmas  

#### Converting the given JSON file into actual JSON format for easier readbility

In [4]:
writeFile = open("Sarcasm_Headlines.json", "w")
writeFile.write("{ \"headlines\": [")
with open("Sarcasm_Headlines_Dataset.json") as readFile:
  for item in readFile:
    writeFile.write(item + ",")
# removed the final comma manually
writeFile.write("]}")
readFile.close()
writeFile.close()

# Preprocessing Stage

#### Reading the dataset and removing all article links as our goal is to analyze the headlines for sarcasm

In [2]:
dataset = json.load(open("Sarcasm_Headlines.json"))
df = pd.DataFrame(dataset["headlines"])
df.drop(["article_link"], axis = 1, inplace = True)
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


#### lemmatizing the dataset

In [12]:
def lemmatizeDataset():
    for index, row in df.iterrows():
        sentence = row['headline']
        row['headline'] = lemmatize(sentence)

lemmatizeDataset()
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


#### writing to a csv file to avoid having to perform pre-processing again

In [13]:
df.to_csv('lemmatized.csv', index=False)

In [3]:
df = pd.read_csv("lemmatized.csv")
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


# Module 1 => Concept Level and Common Sense Knowledge

In [62]:

# set the API endpoint and parameters
endpoint = 'http://api.conceptnet.io/c/en/'
params = {
    'filter': 'core',
    'limit': 1000
}
def conceptNet(sentence):
    # send a GET request to the API endpoint
    response = requests.get(endpoint + sentence, params=params)

    # parse the JSON response
    data = json.loads(response.text)
    edges = data['edges']
    edges.sort(key=lambda x: x['weight'], reverse=True)

    return edges

# Module 2 => Sentiment Score
### SentiStrength
SentiStrength is a sentiment lexicon that uses linguistic information and rules to detect<br>
sentiment strength in English text. SentiStrength provides positive and negative sentiment<br>
scores for each word. Both scores are integers from 1 to 5, where 1 signifies weak sentiment<br>
and 5 signifies strong sentiment.
<br>
polarity = positiveSentiment - negativeSentiment

### SenticNet
SenticNet is a resource for opinion mining that aims to create a collection of commonly<br> 
used common-sense concepts  with positive and negative sentiment scores. The sentiment <br>
score for each word is scaled from -1 to 1, where -1 signifies strongly negative sentiment,<br>
0 signifies neutral sentiment and 1 signifies strong positive sentiment.
<br> sentiment = score * 5 (in-order to keep it with sentiStrength)

### Rules of w_score (sentiment score) selection:
- if word belongs to SentiStrength || SenticNet => pick the score whichever exists
- if word belongs to SentiStrength && SenticNet => avg score of the lexicons
- else get the concepts from concept net to expand the meaning => select top 5 ranked and calculate the avg sentiment score

### Final Calculation
sum_pos_score = sum of all positive sentiment scores<br>
sum_neg_score = sum of all negative sentiment scores<br>
if sum_pos_score && sum_neg_score > 0, there is a contradiction in the sentence

In [73]:
sn = SenticNet()
senti = PySentiStr()
senti.setSentiStrengthPath('D:/Sarcasm_Detection-Feature_Selection/SentiStrength.jar')
senti.setSentiStrengthLanguageFolderPath('D:/Sarcasm_Detection-Feature_Selection/SentStrength_Data/')
def senticNetScore(word):
    try:
        polarityValue = sn.polarity_value(word)
        return float(polarityValue) * 5
    except KeyError:
        return None
def sentiStrengthScore(word):
    result = senti.getSentiment(word)
    return result
def wScore(word):
    senticNet = senticNetScore(word)
    sentiStrength = sentiStrengthScore(word)[0]
    if senticNet == None and sentiStrength == None:
        expansion = conceptNet(word)
        if len(expansion) == 0:
            return 0
        else:
            score = 0
            expansion = expansion[:5]
            for edge in expansion:
                score += wScore(edge['end']['label'])
            return score / 5
    elif senticNet == None:
        return sentiStrength
    elif sentiStrength == None:
        return senticNet
    else:
        return (senticNet + sentiStrength) / 2
def positiveScore(results):
    score = 0
    for result in results:
        if result > 0:
            score += result
    return score
def negativeScore(results):
    score = 0
    for result in results:
        if result < 0:
            score += result
    return score

Positive Score: 1.9700000000000002
Negative Score: -1.6
Contrast Score: 0.3700000000000001
