In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
data = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines = True)
data

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [3]:
data = data.drop(['article_link'], axis = 1)
data

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...
...,...,...
28614,1,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...


In [4]:
data.shape
data.isnull().sum()

is_sarcastic    0
headline        0
dtype: int64

In [5]:
import os
def remove_punctuation(headline):
    '''a function for removing punctuation'''
    import string
    translator = str.maketrans('', '', string.punctuation)
    return headline.translate(translator)

data['headline'] = data['headline'].apply(remove_punctuation)
data.head(10)

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep totally nails why congress is falling ...
2,0,eat your veggies 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word stream...
5,0,my white inheritance
6,0,5 ways to file your taxes with less stress
7,1,richard bransons globalwarming donation nearly...
8,1,shadow government getting too large to meet in...
9,0,lots of parents know this scenario


In [6]:
data['headline'] = data['headline'].str.lower()

In [7]:
import nltk

In [8]:
def headline_token(row):
    headline = row['headline']
    token = nltk.word_tokenize(headline)
    token_words =[w for w in token if w.isalpha()]
    return token_words

data['headline_token'] = data.apply(headline_token, axis = 1)

In [9]:
data

Unnamed: 0,is_sarcastic,headline,headline_token
0,1,thirtysomething scientists unveil doomsday clo...,"[thirtysomething, scientists, unveil, doomsday..."
1,0,dem rep totally nails why congress is falling ...,"[dem, rep, totally, nails, why, congress, is, ..."
2,0,eat your veggies 9 deliciously different recipes,"[eat, your, veggies, deliciously, different, r..."
3,1,inclement weather prevents liar from getting t...,"[inclement, weather, prevents, liar, from, get..."
4,1,mother comes pretty close to using word stream...,"[mother, comes, pretty, close, to, using, word..."
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,"[jews, to, celebrate, rosh, hashasha, or, some..."
28615,1,internal affairs investigator disappointed con...,"[internal, affairs, investigator, disappointed..."
28616,0,the most beautiful acceptance speech this week...,"[the, most, beautiful, acceptance, speech, thi..."
28617,1,mars probe destroyed by orbiting spielberggate...,"[mars, probe, destroyed, by, orbiting, spielbe..."


In [10]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

In [11]:
def remove_stop(row):
    my_list = row['headline_token']
    meaningful_words = [w for w in my_list if not w in stops ]
    return (meaningful_words)

data['token_meaningful'] = data.apply(remove_stop, axis = 1)

In [12]:
data

Unnamed: 0,is_sarcastic,headline,headline_token,token_meaningful
0,1,thirtysomething scientists unveil doomsday clo...,"[thirtysomething, scientists, unveil, doomsday...","[thirtysomething, scientists, unveil, doomsday..."
1,0,dem rep totally nails why congress is falling ...,"[dem, rep, totally, nails, why, congress, is, ...","[dem, rep, totally, nails, congress, falling, ..."
2,0,eat your veggies 9 deliciously different recipes,"[eat, your, veggies, deliciously, different, r...","[eat, veggies, deliciously, different, recipes]"
3,1,inclement weather prevents liar from getting t...,"[inclement, weather, prevents, liar, from, get...","[inclement, weather, prevents, liar, getting, ..."
4,1,mother comes pretty close to using word stream...,"[mother, comes, pretty, close, to, using, word...","[mother, comes, pretty, close, using, word, st..."
...,...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,"[jews, to, celebrate, rosh, hashasha, or, some...","[jews, celebrate, rosh, hashasha, something]"
28615,1,internal affairs investigator disappointed con...,"[internal, affairs, investigator, disappointed...","[internal, affairs, investigator, disappointed..."
28616,0,the most beautiful acceptance speech this week...,"[the, most, beautiful, acceptance, speech, thi...","[beautiful, acceptance, speech, week, came, qu..."
28617,1,mars probe destroyed by orbiting spielberggate...,"[mars, probe, destroyed, by, orbiting, spielbe...","[mars, probe, destroyed, orbiting, spielbergga..."


In [13]:
def rejoin_words(row):
    my_list = row['token_meaningful']
    joined_words = ( " ".join(my_list))
    return joined_words

data['processed'] = data.apply(rejoin_words, axis=1)

In [14]:
data

Unnamed: 0,is_sarcastic,headline,headline_token,token_meaningful,processed
0,1,thirtysomething scientists unveil doomsday clo...,"[thirtysomething, scientists, unveil, doomsday...","[thirtysomething, scientists, unveil, doomsday...",thirtysomething scientists unveil doomsday clo...
1,0,dem rep totally nails why congress is falling ...,"[dem, rep, totally, nails, why, congress, is, ...","[dem, rep, totally, nails, congress, falling, ...",dem rep totally nails congress falling short g...
2,0,eat your veggies 9 deliciously different recipes,"[eat, your, veggies, deliciously, different, r...","[eat, veggies, deliciously, different, recipes]",eat veggies deliciously different recipes
3,1,inclement weather prevents liar from getting t...,"[inclement, weather, prevents, liar, from, get...","[inclement, weather, prevents, liar, getting, ...",inclement weather prevents liar getting work
4,1,mother comes pretty close to using word stream...,"[mother, comes, pretty, close, to, using, word...","[mother, comes, pretty, close, using, word, st...",mother comes pretty close using word streaming...
...,...,...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,"[jews, to, celebrate, rosh, hashasha, or, some...","[jews, celebrate, rosh, hashasha, something]",jews celebrate rosh hashasha something
28615,1,internal affairs investigator disappointed con...,"[internal, affairs, investigator, disappointed...","[internal, affairs, investigator, disappointed...",internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...,"[the, most, beautiful, acceptance, speech, thi...","[beautiful, acceptance, speech, week, came, qu...",beautiful acceptance speech week came queer ko...
28617,1,mars probe destroyed by orbiting spielberggate...,"[mars, probe, destroyed, by, orbiting, spielbe...","[mars, probe, destroyed, orbiting, spielbergga...",mars probe destroyed orbiting spielberggates s...


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
features = data['processed']
labels = data['is_sarcastic']

In [17]:
tv = TfidfVectorizer(max_features=5000)
features = list(features)
features = tv.fit_transform(features).toarray()

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.5)

In [21]:
from sklearn.svm import LinearSVC
model=LinearSVC()
model.fit(features_train, labels_train)
print("train score for lsvc",model.score(features_train, labels_train))
print("test score for lsvc",model.score(features_test, labels_test))

train score for lsvc 0.9127122789852541
test score for lsvc 0.7650593990216632
