Data source: [News Headlines Dataset for Sarcasm Detection](https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection) (Kaggle)

# Import Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Show at most three columns so wide frames stay readable in cell output.
pd.options.display.max_columns = 3
# None disables column-width truncation so long headlines and links are shown
# in full; the legacy sentinel -1 is deprecated since pandas 1.0 in favour of None.
pd.options.display.max_colwidth = None

In [17]:
import pickle

# Import Data

In [3]:
# Forward slashes resolve on Windows as well as POSIX systems, whereas a
# backslash separator only works on Windows.
# lines=True: the file is read as one JSON object per line.
dataset = pd.read_json('Data/Dataset.json', lines=True)

In [4]:
# Preview the first five records of the loaded frame.
dataset.head(n=5)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5,former versace store clerk sues over secret 'black code' for minority shoppers,0
1,https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365,"the 'roseanne' revival catches up to our thorny political mood, for better and worse",0
2,https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697,mom starting to fear son's web series closest thing she will have to grandchild,1
3,https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302,"boehner just wants wife to listen, not come up with alternative debt-reduction ideas",1
4,https://www.huffingtonpost.com/entry/jk-rowling-wishes-snape-happy-birthday_us_569117c4e4b0cad15e64fdcb,j.k. rowling wishes snape happy birthday in the most magical way,0


In [5]:
# Count the sarcastic rows once (vectorized) and reuse the value for both the
# absolute count and the percentage.
n_samples = len(dataset)
n_sarcastic = dataset['is_sarcastic'].sum()
# The percentage is parenthesised and suffixed with '%' so it no longer runs
# straight into the count in the printed sentence.
print('There are {} samples out of which {} ({:.2f}%) are sarcastic'.format(
    n_samples, n_sarcastic, 100 * n_sarcastic / n_samples))

There are 26709 samples out of which 11724 43.90 are sarcastic


# Preprocessing

In [12]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Patterns are compiled once and shared by the helpers below.
_NON_WORD_RE = re.compile(r"\W")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def remove_punc(x):
    """Replace every non-word character of ``x`` with a single space."""
    return _NON_WORD_RE.sub(' ', x)


def remove_extra_spaces(x):
    """Collapse each run of whitespace in ``x`` into one space (no stripping)."""
    return _WHITESPACE_RUN_RE.sub(' ', x)


def lower_case(x):
    """Return ``x`` converted to lower case."""
    return x.lower()

# English stop words from the NLTK corpus, kept in a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))


def remove_stopwords(x):
    """Drop every whitespace-separated token of ``x`` found in ``stop_words``.

    The surviving tokens are re-joined with single spaces. The comparison is
    case-sensitive, so the text is expected to be lower-cased beforehand.
    """
    kept_tokens = [token for token in x.split() if token not in stop_words]
    return ' '.join(kept_tokens)

# Single shared Porter stemmer instance.
ps = PorterStemmer()


def ps_stem(x):
    """Stem each whitespace-separated token of ``x`` and re-join with spaces."""
    return ' '.join(map(ps.stem, x.split()))

# Single shared WordNet lemmatizer instance.
wnl = WordNetLemmatizer()


def wnl_lemmatize(x):
    """Lemmatize each whitespace-separated token of ``x`` and re-join with spaces."""
    return ' '.join(map(wnl.lemmatize, x.split()))

def tag_pos(x):
    """Annotate every token of ``x`` with its part-of-speech tag.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each (token, tag) pair is rendered as ``token(TAG)``
    followed by one space, so a non-empty result ends with a trailing space.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(x))
    return ''.join('{}({}) '.format(token, tag) for token, tag in tagged_tokens)

def cleanText(x, rsw=False, stm=False, lem=False, tgps=False):
    """Normalize a piece of text and optionally apply further NLP steps.

    The base normalization always runs: non-word characters become spaces,
    whitespace runs collapse to one space, surrounding whitespace is removed,
    and the text is lower-cased. The optional steps then run in the fixed order
    stop-word removal, stemming, lemmatization, POS tagging.

    Parameters
    ----------
    x : str
        Raw text to clean.
    rsw : bool, default False
        Remove stop words via ``remove_stopwords``.
    stm : bool, default False
        Stem each token via ``ps_stem``.
    lem : bool, default False
        Lemmatize each token via ``wnl_lemmatize``.
    tgps : bool, default False
        Append a part-of-speech tag to each token via ``tag_pos``.

    Returns
    -------
    str
        The processed text.
    """
    x = remove_punc(x)
    # Punctuation at either end of the text turns into a blank; strip it so the
    # plain-cleaning result carries no leading/trailing whitespace.
    x = remove_extra_spaces(x).strip()
    x = lower_case(x)
    if rsw:
        x = remove_stopwords(x)
    if stm:
        x = ps_stem(x)
    if lem:
        x = wnl_lemmatize(x)
    if tgps:
        x = tag_pos(x)
    return x

#### Cleaning

In [13]:
# Apply only the base normalization to every headline; all four optional
# steps (stop words, stemming, lemmatization, POS tagging) are switched off.
X = pd.DataFrame({
    'headline': dataset['headline'].apply(
        lambda headline: cleanText(headline, False, False, False, False)
    )
})
X.head()

Unnamed: 0,headline
0,former versace store clerk sues over secret black code for minority shoppers
1,the roseanne revival catches up to our thorny political mood for better and worse
2,mom starting to fear son s web series closest thing she will have to grandchild
3,boehner just wants wife to listen not come up with alternative debt reduction ideas
4,j k rowling wishes snape happy birthday in the most magical way


#### Tagging

In [16]:
from IPython.display import clear_output

# Collect the tagged strings in a plain list and build the frame once at the
# end. Growing a DataFrame row by row copies the whole frame on every
# iteration, and DataFrame.append no longer exists in pandas 2.x.
n = len(X)
tagged_headlines = []
for i, headline in enumerate(X['headline']):
    tagged_headlines.append(tag_pos(str(headline)))
    # Redraw a single progress line instead of flooding the cell output.
    clear_output(wait=True)
    print('Current Progress', i, '/', n)

X_tagged = pd.DataFrame({'headline': tagged_headlines})
X_tagged.head()

Current Progress 26708 / 26709


Unnamed: 0,headline
0,former(JJ) versace(NN) store(NN) clerk(NN) sues(NNS) over(IN) secret(JJ) black(JJ) code(NN) for(IN) minority(NN) shoppers(NNS)
1,the(DT) roseanne(NN) revival(NN) catches(VBZ) up(RP) to(TO) our(PRP$) thorny(JJ) political(JJ) mood(NN) for(IN) better(JJR) and(CC) worse(JJR)
2,mom(NN) starting(VBG) to(TO) fear(VB) son(NN) s(NN) web(NN) series(NN) closest(VBP) thing(NN) she(PRP) will(MD) have(VB) to(TO) grandchild(VB)
3,boehner(NN) just(RB) wants(VBZ) wife(NN) to(TO) listen(VB) not(RB) come(VBN) up(RP) with(IN) alternative(JJ) debt(NN) reduction(NN) ideas(NNS)
4,j(NN) k(NN) rowling(VBG) wishes(NNS) snape(VBP) happy(JJ) birthday(NN) in(IN) the(DT) most(RBS) magical(JJ) way(NN)


In [18]:
# Persist the tagged headlines without the index column. Forward slashes keep
# the path valid on both Windows and POSIX systems.
X_tagged.to_csv('Data/tagged.csv', index=False)

#### Vectorizing

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [21]:
cVect = CountVectorizer()
# Fit on the headline column, not the frame: iterating a DataFrame yields its
# column labels, so fitting on X_tagged itself would learn a vocabulary from
# the single string "headline" instead of from the tagged headlines.
cVect.fit(X_tagged['headline'])
# The context manager guarantees the pickle file is flushed and closed.
with open('v1/cVect.pickle', 'wb') as vect_file:
    pickle.dump(cVect, vect_file)

In [22]:
tVect = TfidfVectorizer()
# Fit on the headline column, not the frame: iterating a DataFrame yields its
# column labels, so fitting on X_tagged itself would learn a vocabulary from
# the single string "headline" instead of from the tagged headlines.
tVect.fit(X_tagged['headline'])
# The context manager guarantees the pickle file is flushed and closed.
with open('v1/tVect.pickle', 'wb') as vect_file:
    pickle.dump(tVect, vect_file)