In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming each archive.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<url-encoded download URL>" entries, parsed by the
# download loop below. The URL is a signed link carrying X-Goog-Date and
# X-Goog-Expires=259200, so it is presumably only valid for a limited time.
DATA_SOURCE_MAPPING = 'news-headlines-dataset-for-sarcasm-detection:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F30764%2F533474%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240410%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240410T074355Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0180fb9481deb22c5a33ecdd69543d19c59d23b093c7a8e9878e5384f8b5cfd366fde45c7676380c62f5d2bc86872fded80cf64b578ca82bf0e66e99fb298be55c595102ba95430fd176b63af7ebc8212dc3307ad13130e1670df585de4cc3e7af938f4978446e3ccf7419033aa3de93197f356ef162f2ad6d0efdef8435055c5f35300cef0f37bc7b65fbd63d1ae7883d16aa6e428e40312f67151162e0f0f676bb18fca77d38a190de25b607e2360887babdf67b708be6bfe4b256de95227087c05dcfa0aad1cc4939adfdb9680d9958af7fdfd69b15868406db3966247feafc7adf7a80a025cfe1262b93dec1ab36847475ed1c44547bbd43d9e522a09a95'

# Directories created by the setup code below.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded URL>"; split on the first colon
    # only so a stray ':' in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length may be absent or non-numeric; in that case only the
            # running byte count is shown instead of a progress bar.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length and content_length.isdigit() else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name through a second handle, so
            # buffered bytes must reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # rebind the imported module for the rest of the notebook.
                # NOTE(review): extractall on a downloaded archive trusts its
                # member paths; only use with sources you trust.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# Main Library

In [None]:
# Reading Data
import json, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Preprocessing
import re, nltk, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Building Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, GlobalAveragePooling1D
import tensorflow.keras as k

# Download Model
import pickle

# Reading Data

In [None]:
# with open(r"D:\Courses language programming\Natural Language Processing\Projects For NLP\Data\News Headlines Dataset For Sarcasm Detection\Sarcasm_Headlines_Dataset.json", 'r') as f:
#     datastore = json.load(f)

# sentence = []
# urls = []
# labels = []

# for item in datastore:
#     sentence.append(item["headline"])
#     urls.append(item["article_link"])
#     labels.append(item["is_sarcastic"])


# sentence[:5]

In [None]:
# Prefer the copy unpacked under /kaggle/input by the data-source import cell and
# fall back to the original local path when that file is not present.
# NOTE(review): assumes the archive contains a file named like the local copy
# (Sarcasm_Headlines_Dataset.json) — confirm against the extracted directory.
KAGGLE_DATA_FILE = os.path.join(
    "/kaggle/input",
    "news-headlines-dataset-for-sarcasm-detection",
    "Sarcasm_Headlines_Dataset.json",
)
LOCAL_DATA_FILE = r"D:\Courses language programming\Natural Language Processing\Projects For NLP\Data\News Headlines Dataset For Sarcasm Detection\Sarcasm_Headlines_Dataset.json"
data_file = KAGGLE_DATA_FILE if os.path.exists(KAGGLE_DATA_FILE) else LOCAL_DATA_FILE

# The file is read as JSON Lines (one record per line).
data = pd.read_json(data_file, lines=True)
data.head()

In [None]:
# Drop the link column by rebinding instead of mutating in place;
# errors="ignore" keeps the cell re-runnable once the column is already gone
# (the in-place version raised KeyError on a second run).
data = data.drop(columns="article_link", errors="ignore")
data.head(5)

In [None]:
# Peek at the headline stored under index label 0.
data.loc[0, "headline"]

In [None]:
# Print a summary of the frame (columns, dtypes and non-null counts).
data.info()

## The data has no null values

# Data Preprocessing

## Removing Stop Words and Punctuation, and Applying Lemmatization

## 1 - Stop Words
## 2 - Punctuation
## 3 - Lemmatization

In [None]:
# Make sure the NLTK resources used by the preprocessing cells are available;
# without them stopwords / word_tokenize / the lemmatizer raise LookupError on a
# fresh environment. Names unknown to the installed NLTK version are reported
# by the downloader and skipped.
for nltk_resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource, quiet=True)

# Lookup tables shared by the cleaning steps below.
punc = list(string.punctuation)
stop_words = stopwords.words("english")
lemma = WordNetLemmatizer()

In [None]:
def Process(data):
    """Clean one headline for tokenization.

    Lower-cases the text, tokenizes it with ``word_tokenize``, drops tokens
    found in ``punc`` or ``stop_words``, lemmatizes the remaining tokens with
    ``lemma`` and finally replaces every character outside ``a-z`` with a space.

    Args:
        data: The raw headline string.

    Returns:
        The cleaned headline as a single string.
    """
    # str.lower() returns a new string; the result must be re-assigned, otherwise
    # the [^a-z] substitution below blanks out every capital letter.
    data = data.lower()

    data = " ".join([lemma.lemmatize(word) for word in word_tokenize(data) if ((word not in punc) and (word not in stop_words))])

    # Anything that is not a lowercase ASCII letter (digits, leftover symbols,
    # and the separating spaces themselves) becomes a space.
    data = re.sub("[^a-z]", " ", data)

    return data

In [None]:
# Run every headline through Process and store the cleaned text back in place.
data["headline"] = data["headline"].map(Process)
data.head()

# To Categorical
## Transform the single output label into a one-hot vector with one column per class (One-Hot Encoding)

In [None]:
# One-hot encode the binary target into two columns.
label = to_categorical(data["is_sarcastic"], num_classes=2)
label[:5]

# Tokenization

In [None]:
# Features are the cleaned headlines; targets are the one-hot labels.
X, Y = data["headline"], label
print(Y[:2])

In [None]:
# Fit the tokenizer on every headline; "<oov>" is the placeholder token passed
# for words outside the fitted vocabulary.
tokenize = Tokenizer(oov_token="<oov>")
tokenize.fit_on_texts(X)
word_idx = tokenize.word_index

# Integer-encode each headline, then pad at the front ("pre"). No maxlen is
# given here, so the padded width is decided by pad_sequences itself.
data_seqence = tokenize.texts_to_sequences(X)
pad_seq = pad_sequences(data_seqence, padding="pre", truncating="pre")

print("The Padding Sequance Shape is  --> ", pad_seq.shape)

In [None]:
# Length of the longest encoded headline.
input_length = max(map(len, data_seqence))

# One more than the number of known words — presumably to leave room for the
# index 0 used by padding.
vocabulary_size = 1 + len(word_idx)

input_length, vocabulary_size

# Splitting Data To Training And Testing

In [None]:
# 70/30 train/test split. A fixed random_state makes the split (and therefore
# the reported metrics) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    pad_seq, label, train_size=0.7, random_state=42
)

# Building Deep Learning Model

In [None]:
# Embedding -> average over the sequence -> small dense head with a 2-way softmax.
# The sequence length is declared with an explicit Input layer because the
# Embedding `input_length` argument is deprecated in recent Keras releases.
model = k.models.Sequential([
    k.layers.Input(shape=(input_length,)),
    Embedding(vocabulary_size, 50),
    GlobalAveragePooling1D(),
    Dense(48, activation="relu"),
    Dense(2, activation="softmax")
])

# The targets are one-hot vectors and the output is a softmax, so categorical
# cross-entropy is the matching loss.
model.compile(optimizer="adam", loss=k.losses.CategoricalCrossentropy(), metrics=["accuracy"])

In [None]:
# Train for 20 epochs; the test split doubles as the validation set.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=20,
    verbose=2,
)

In [None]:
# Training vs validation loss per epoch, drawn on the current axes.
ax = plt.gca()
for history_key, curve_label in (("loss", "Loss"), ("val_loss", "Val_Loss")):
    ax.plot(history.history[history_key], label=curve_label)

ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Loss Vs Epochs")

ax.legend()
ax.grid()

In [None]:
# Training vs validation accuracy per epoch, drawn on the current axes.
ax = plt.gca()
for history_key in ("accuracy", "val_accuracy"):
    ax.plot(history.history[history_key], label=history_key)

ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy Vs Epochs")

ax.legend()
ax.grid()

# System Prediction

In [None]:
# Clean the typed headline with the same Process pipeline used on the training
# data. The text is lower-cased first so capital letters survive the [^a-z]
# filter inside Process.
new_text = Process(input().lower())

print(new_text)
test_sequace = tokenize.texts_to_sequences([new_text])
# Pad to the sequence length computed from the training data instead of a
# hard-coded 31, so the input always matches what the model was built for.
test_padding = pad_sequences(test_sequace, maxlen=input_length, padding="pre", truncating="pre")


# test_sequace
prediction = model.predict(test_padding)

print(prediction[0])
# Labels were one-hot encoded from is_sarcastic, so index 1 is the sarcastic
# class and index 0 the non-sarcastic one.
if np.argmax(prediction) == 1: print("This Massage is -->  is_sarcastic ")
else: print("This Massage is -->  not is_sarcastic ")

# Save Model

In [None]:
# Serialize the trained model. The context manager closes the file handle even
# if dumping fails (the bare open(...) call never closed it).
# NOTE(review): pickling a Keras model depends on the installed Keras version
# supporting it; model.save(...) is the documented way to persist a model.
with open(r"D:\Pycharm\model_pickle\NLP - Models\\Sarcasm Detection.bin", "wb") as model_file:
    pickle.dump(model, model_file)