In [32]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

### READING THE DATA FROM THE DATSET AND SPLITTING IT


In [33]:
# Reading the dataset
dataset = pd.read_csv("..\Headline_Dataset.csv")

# Extracting sentences and labels?
dataset = dataset.dropna()
dataset.isnull().sum()


sentence = dataset['Sentence']
label = dataset['is_Scarcastic']



### SPLITTING THE TEST AND TRAIN DATA

In [34]:
# Splitting train and test data
train_sentence, test_sentence, train_label, test_label = train_test_split(
    sentence, label, test_size=0.2, random_state=42
)

In [35]:
train_sentence.head()

24708         david cameron scottish people ill kill leave
5395     princeton student confront university presiden...
20900    nancy pelosi suggests donald trump get mental ...
3430     south korea working formally end korean war ye...
8733                                 fergusons easy answer
Name: Sentence, dtype: object

In [36]:
train_label.head()

24708    1
5395     0
20900    0
3430     0
8733     0
Name: is_Scarcastic, dtype: int64

### TOKENIZATION

In [37]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentence)
word_index = tokenizer.word_index


train_sequences = tokenizer.texts_to_sequences(train_sentence)
test_sequences = tokenizer.texts_to_sequences(test_sentence)



In [38]:
word_index

{'<OOV>': 1,
 'trump': 2,
 'new': 3,
 'man': 4,
 'woman': 5,
 'say': 6,
 'report': 7,
 'get': 8,
 'u': 9,
 'one': 10,
 'year': 11,
 'day': 12,
 'american': 13,
 'make': 14,
 'time': 15,
 'area': 16,
 'donald': 17,
 'like': 18,
 'people': 19,
 'life': 20,
 'nation': 21,
 'way': 22,
 'first': 23,
 'world': 24,
 'thing': 25,
 'obama': 26,
 'show': 27,
 'house': 28,
 'family': 29,
 'still': 30,
 'find': 31,
 'white': 32,
 'clinton': 33,
 'back': 34,
 'take': 35,
 'child': 36,
 'want': 37,
 'call': 38,
 'right': 39,
 'know': 40,
 'state': 41,
 'need': 42,
 'could': 43,
 'black': 44,
 'school': 45,
 'mom': 46,
 'kid': 47,
 'study': 48,
 'gop': 49,
 'go': 50,
 'bill': 51,
 'yearold': 52,
 'president': 53,
 'love': 54,
 'home': 55,
 'watch': 56,
 'death': 57,
 'friend': 58,
 'would': 59,
 'best': 60,
 'police': 61,
 'look': 62,
 'america': 63,
 'cant': 64,
 'parent': 65,
 'really': 66,
 'good': 67,
 'plan': 68,
 'star': 69,
 'student': 70,
 'change': 71,
 'going': 72,
 'may': 73,
 'last': 74,


In [39]:
train_sequences

[[541, 4460, 3527, 19, 1448, 171, 514],
 [3528, 70, 4461, 568, 53, 11078, 2322, 1449],
 [3208, 3931, 959, 17, 2, 8, 913, 90, 3529],
 [461, 339, 387, 7934, 141, 1693, 83, 1086, 1693, 83],
 [7935, 914, 1134],
 [146, 324, 1450, 6214, 3530, 1342, 6215, 355],
 [6216, 7936, 542, 294, 621, 3932, 3933],
 [5, 325, 240, 718, 689, 5182, 622, 333],
 [1343, 7937, 11079, 3934, 219],
 [6217, 1826, 1827, 879, 2323],
 [5183, 622, 179, 841, 3935, 299, 4462, 1389],
 [121, 109, 806, 842, 487, 960, 299, 137],
 [5, 294, 174, 284, 165, 11080, 11081],
 [11082, 499, 200, 1283, 5184, 2711, 1610, 569],
 [11083, 11084, 2934, 11085, 1828, 93, 74, 158],
 [2712, 22, 2935, 2169],
 [3936, 748, 2936, 7938, 270, 19, 11086],
 [1344, 807, 130, 11, 1135, 2499, 325, 447],
 [142, 1611, 267, 6, 2, 1612, 59, 1284, 9, 24, 1087, 1390],
 [11, 690, 32, 28, 2500, 961, 13, 3209, 11087],
 [7939, 100, 5185, 234, 1391, 2501, 994, 5186],
 [6218, 719, 1694, 7940, 1829],
 [543, 388, 42, 40, 434],
 [130, 2713, 195, 1451, 2170, 62, 462],
 [

### PADDING 


In [40]:
# Find the length of the longest sequence
max_len = max([len(seq) for seq in train_sequences])

# Pad sequences to the maximum length
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_len)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_len)

In [41]:
train_padded[:5]

array([[  541,  4460,  3527,    19,  1448,   171,   514,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [ 3528,    70,  4461,   568,    53, 11078,  2322,  1449,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [ 3208,  3931,   959,    17,     2,     8,   913,    90,  3529,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [  461,   339,   387,  7934,   141,  1693,    83,  1086,  1693,
           83,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [ 7935,   914,  1134,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,  

In [42]:
train_labels = train_label.values
test_labels = test_label.values

In [43]:
# Building the LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_index) + 1, output_dim=128, input_length=max_len),
    tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification
])

# Compiling the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Summary of the model
model.summary()

# Training the model
history = model.fit(
    train_padded, train_labels,
    validation_data=(test_padded, test_labels),
    epochs=10,
    batch_size=32
)

# Evaluating the model
loss, accuracy = model.evaluate(test_padded, test_labels)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - accuracy: 0.6573 - loss: 0.6108 - val_accuracy: 0.7857 - val_loss: 0.4434
Epoch 2/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.8743 - loss: 0.3187 - val_accuracy: 0.8048 - val_loss: 0.4367
Epoch 3/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9285 - loss: 0.2036 - val_accuracy: 0.7971 - val_loss: 0.5160
Epoch 4/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9508 - loss: 0.1424 - val_accuracy: 0.7948 - val_loss: 0.6091
Epoch 5/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9640 - loss: 0.1054 - val_accuracy: 0.7924 - val_loss: 0.8136
Epoch 6/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9766 - loss: 0.0685 - val_accuracy: 0.7930 - val_loss: 0.8036
Epoch 7/10
[1m668/668[0m [3

In [46]:
new_sentence = [ "nuclear bomb detonates during rehearsal for 'spider-man' musical"]
new_sequence = tokenizer.texts_to_sequences(new_sentence)
new_padded = pad_sequences(new_sequence, padding='post', maxlen=max_len)

prediction = model.predict(new_padded)[0][0]
print(f"Prediction: {'Sarcastic' if prediction > 0.5 else 'Not Sarcastic'} (Confidence: {prediction:.4f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Prediction: Not Sarcastic (Confidence: 0.0036)
