In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json
/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json


In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# TOKENISATION - ENCODING WORDS TO NUMBERS
# Tiny corpus used only to build the demo vocabulary.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog',
    'Dont you think my dog is amazing',
]

# Without an oov_token, words that the tokenizer has never seen are dropped
# and the encoded sentence becomes shorter than the original. With
# oov_token='<OOV>' every unseen word is encoded as the <OOV> index instead,
# so the sequence keeps one number per word.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)  # keep the 100 most frequent words
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# Convert new sentences (containing words missing from the vocabulary)
# into sequences of numbers.
test_sentence = [
    "Mahika's Dog is amazing",
    "I wanna have a dog",
    "My cat love dog , Doesnt it ?",
]
sequences = tokenizer.texts_to_sequences(test_sentence)
print(sequences)

# PADDING lets us work with sentences of different lengths: shorter
# sequences are filled with zeros so every row has the same length.
padded = pad_sequences(sequences)
print(padded)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'dont': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[1, 4, 10, 11], [5, 1, 1, 1, 4], [2, 7, 3, 4, 1, 1]]
[[ 0  0  1  4 10 11]
 [ 0  5  1  1  1  4]
 [ 2  7  3  4  1  1]]


In [4]:
# LOADING DATA
import json

# The dataset is a JSON Lines file: one JSON object per line, each with the
# keys 'headline', 'is_sarcastic' and 'article_link'.
DATASET_PATH = '/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json'
TRAINING_SIZE = 10000  # first TRAINING_SIZE records train the model, the rest test it

# Use a context manager so the file handle is closed even if parsing fails,
# and skip blank lines so a trailing newline cannot break json.loads.
with open(DATASET_PATH, 'r', encoding='utf-8') as dataset_file:
    datastore = [json.loads(line) for line in dataset_file if line.strip()]

# Split each record into parallel lists (same order as the file).
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

# Sequential (unshuffled) train/test split.
training_sentences = sentences[:TRAINING_SIZE]
testing_sentences = sentences[TRAINING_SIZE:]
training_labels = labels[:TRAINING_SIZE]
testing_labels = labels[TRAINING_SIZE:]

In [5]:
# TOKENISING DATA
VOCAB_SIZE = 10000  # number of most frequent words kept when encoding
MAX_LENGTH = 100    # fixed sequence length; equals the Embedding layer's input_length

# Fit the vocabulary on the training headlines only, so words that appear
# solely in the test set are encoded as '<OOV>'.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(training_sentences)

# Pad (or truncate) both splits to the same fixed length. Without maxlen each
# call pads to the longest sequence of its own input, so the training and
# testing matrices could end up with different widths.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=MAX_LENGTH)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=MAX_LENGTH)

In [6]:
# Required for TensorFlow 2.x: hand the model NumPy arrays rather than
# plain Python lists (the labels were collected into lists when loading).
import numpy as np

training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

In [7]:
# Neural network: embedding -> average over the sequence -> dense classifier.
model = tf.keras.Sequential()
# Map each of the 10000 word indices to a 16-dimensional vector.
model.add(tf.keras.layers.Embedding(10000, 16, input_length=100))
# Average the word vectors of a headline into a single vector.
model.add(tf.keras.layers.GlobalAveragePooling1D())
# Stack of fully connected hidden layers.
model.add(tf.keras.layers.Dense(54, activation='relu'))
model.add(tf.keras.layers.Dense(44, activation='relu'))
model.add(tf.keras.layers.Dense(24, activation='relu'))
# Two softmax outputs, one per class; paired with the sparse categorical
# loss below, which takes integer class labels.
model.add(tf.keras.layers.Dense(2, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

2023-01-03 11:03:51.236856: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [8]:
# Train for 30 epochs, evaluating on the held-out headlines after each epoch.
# The returned History object is kept in `history`.
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=30,
    validation_data=(testing_padded, testing_labels),
)

2023-01-03 11:03:51.472408: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [9]:
# testing with new sentences
sentences = ["My name is amita Bansal"]

# Encode with the tokenizer fitted on the training headlines.
test_seq = tokenizer.texts_to_sequences(sentences)
# Pad to the same width as the training matrix. Without maxlen the input is
# only as long as the longest new sentence, which does not match the shape
# the model was trained on.
padded = pad_sequences(test_seq, maxlen=training_padded.shape[1])

# One row per sentence with two softmax outputs; column i is the predicted
# probability of label i (the labels come from 'is_sarcastic').
print(model.predict(padded))

[[1.000000e+00 1.412738e-11]]
