In [None]:
# Standard library / third-party imports used throughout the notebook.
import json
import nltk
import pandas as pd
import random

# NLTK resources. Each corpus is downloaded right before/after the import
# that relies on it so the cell works on a fresh runtime.
from nltk.corpus import stopwords
nltk.download('stopwords')  # corpus read by stopwords.words('english')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # presumably the data backing WordNetLemmatizer
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Attach Google Drive to the Colab runtime so the dataset file can be read
# from it. force_remount=True is passed so the mount is redone even when a
# previous run of this cell already mounted the drive.
from google.colab import drive

drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/gdrive


In [None]:
# Lazily-built cache of the NLTK English stop-word list. Building it reads the
# corpus, so it is done once instead of once per call.
_ENGLISH_STOP_WORDS = None


def filter_stop_words(words, stop_words=None):
    """Return the words that are not stop words, preserving order.

    The comparison is case-insensitive on the word side: each word is
    lower-cased before the membership test, but it is returned with its
    original casing.

    Args:
        words: Iterable of string tokens.
        stop_words: Optional collection of lower-case stop words to filter
            against. When omitted, the NLTK English stop-word list is used
            (loaded on first use and cached for later calls).

    Returns:
        A new list with every stop word removed.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return [word for word in words if word.lower() not in stop_words]

In [None]:
def lemmatize_words(words, lemmatizer=None):
    """Lemmatize every word in ``words`` and return the results as a list.

    Args:
        words: Iterable of string tokens.
        lemmatizer: Optional object exposing ``lemmatize(word)``. Passing one
            lets a caller reuse a single instance across many calls (or swap
            in a different implementation). When omitted, a fresh
            ``WordNetLemmatizer`` is created, matching the previous behavior.

    Returns:
        A new list with ``lemmatizer.lemmatize(word)`` applied to each word,
        in the original order.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

In [None]:
# Location of the headlines dataset on the mounted drive. The file is read
# with lines=True, i.e. as one JSON record per line.
path = '/content/gdrive/MyDrive/Sarcasm_Headlines_Dataset.json'
raw_data_json = pd.read_json(path, lines=True)

# Show the label and headline columns for the first rows as a quick check.
preview_columns = ['is_sarcastic', 'headline']
print("Raw data is :\n\n", raw_data_json.loc[:20, preview_columns])

Raw data is :

     is_sarcastic                                           headline
0              1  thirtysomething scientists unveil doomsday clo...
1              0  dem rep. totally nails why congress is falling...
2              0  eat your veggies: 9 deliciously different recipes
3              1  inclement weather prevents liar from getting t...
4              1  mother comes pretty close to using word 'strea...
5              0                               my white inheritance
6              0         5 ways to file your taxes with less stress
7              1  richard branson's global-warming donation near...
8              1  shadow government getting too large to meet in...
9              0                 lots of parents know this scenario
10             0  this lesbian is considered a father in indiana...
11             0  amanda peet told her daughter sex is 'a specia...
12             0  what to know regarding current treatments for ...
13             0  chris christie

In [None]:
X = []
Y = []
special_characters = "!@#$%^&*()-_+=[]{}|:;\"'<>,.?/"

# Characters that end a word, in addition to a plain space.
word_separators = "-_"

# Translation table for str.translate: punctuation is deleted (mapped to
# None) and word separators become a space. The separators are written last
# because '-' and '_' also appear in special_characters and must split words
# rather than vanish.
char_table = {ord(char): None for char in special_characters}
char_table.update({ord(char): ' ' for char in word_separators})


def tokenize_headline(text):
    """Split a headline into punctuation-free word tokens.

    Words are split on spaces, '-' and '_'; every other character listed in
    ``special_characters`` is removed. Empty tokens (produced by consecutive
    separators, leading/trailing spaces, or words made only of punctuation)
    are dropped so they never reach the dataset.

    Args:
        text: The headline string.

    Returns:
        List of non-empty tokens in their original order and casing.
    """
    return [token for token in text.translate(char_table).split(' ') if token]


# Build the tokenized dataset: X holds the lemmatized, stop-word-free tokens
# of each headline and Y the matching is_sarcastic label. Headlines with no
# tokens left after stop-word removal are skipped, which keeps X and Y aligned.
for text, outcome in zip(raw_data_json['headline'], raw_data_json['is_sarcastic']):
    filtered_x = filter_stop_words(tokenize_headline(text))
    if not filtered_x:
        continue
    X.append(lemmatize_words(filtered_x))
    Y.append(outcome)


In [None]:
assert (len(X) == len(Y))

# Partition the tokenized headlines by class label (1 = sarcastic).
sarcastic_data = [tokens for tokens, label in zip(X, Y) if label == 1]
non_sarcastic_data = [tokens for tokens, label in zip(X, Y) if label != 1]

n_X = len(X)

print("Number of sarcastic samples : " + str(len(sarcastic_data)) + "\nNumber of non sarcastic samples : " + str(len(non_sarcastic_data)))


seed_value = 42
split_at = 9544 # decides number of sarcastic and non-sarcastic data in train set
n_sarcastic = len(sarcastic_data)
n_non_sarcastic = len(non_sarcastic_data)
random.seed(seed_value)

# The first `split_at` samples of each class go to the training set and the
# remainder to the test set.
# NOTE(review): membership of each split follows the dataset's original order;
# the shuffles below only reorder samples inside each split.
sarcastic_train, sarcastic_test = sarcastic_data[:split_at], sarcastic_data[split_at:]
non_sarcastic_train, non_sarcastic_test = non_sarcastic_data[:split_at], non_sarcastic_data[split_at:]

# Labels are sized from the actual slices rather than from split_at, so data
# and labels stay aligned even when a class has fewer than split_at samples.
train_data = sarcastic_train + non_sarcastic_train
train_labels = ([1] * len(sarcastic_train)) + ([0] * len(non_sarcastic_train))
test_data = sarcastic_test + non_sarcastic_test
test_labels = ([1] * len(sarcastic_test)) + ([0] * len(non_sarcastic_test))

n_train = len(train_data)
n_test = len(test_data)

assert (len(train_data) == len(train_labels))
assert (len(test_data) == len(test_labels))

# Shuffle samples together with their labels so the pairs stay matched.
combined_train_data = list(zip(train_data, train_labels))
combined_test_data = list(zip(test_data, test_labels))
random.shuffle(combined_train_data)
random.shuffle(combined_test_data)

X_train, Y_train = zip(*combined_train_data) # Training set
X_test, Y_test = zip(*combined_test_data) # Test set

print("Length of training set is : ", n_train)
print("Length of test set is : ", n_test)

# Preview up to 100 training samples; the bound avoids an IndexError when
# the training set is smaller than that.
for i in range(min(100, n_train)):
  print(X_train[i], Y_train[i])

Number of sarcastic samples : 13634
Number of non sarcastic samples : 14984
Length of training set is :  19088
Length of test set is :  9530
['christmas', 'pageant', 'enters', 'pre', 'production'] 1
['unidentified', 'yowling', 'animal', 'carrier', 'apparently', 'named', 'kiwi'] 1
['study', 'every', '10', 'second', 'skyscraper', 'window', 'washer', 'fall', 'death'] 1
['paranoid', 'oscar', 'pistorius', 'still', 'think', 'burglar'] 1
['new', 'war', 'enables', 'mankind', 'resolve', 'disagreement'] 1
['strange', 'looking', 'pup', 'find', 'perfect', 'family'] 0
['mlb', 'player', 'yordano', 'ventura', 'andy', 'marte', 'die', 'separate', 'car', 'crash'] 0
['science', 'behind', 'celebrity', 'like', 'ryan', 'lochte', 'tell', 'fib'] 0
['voice', 'recognition', 'software', 'yelled'] 1
['flooding', 'texas', 'lead', 'mosquito', 'borne', 'illness'] 0
['ebola', 'sierra', 'leone', 'reminded', 'conflict', 'zone'] 0
['podiatrist', 'recommend', 'getting', 'foot', 'rotated', 'every', '6', 'month'] 1
['new',