Exploring Different Machine Learning Models

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('t_dataset.csv', index_col=False)

1. Long Short-Term Memory (LSTM) Networks:



In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5)

model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Hyperparameters (adjust as needed)
max_len = 100  # Maximum sequence length
vocab_size = 10000  # Limit on the number of words in vocabulary
embedding_dim = 128  # Dimensionality of word embeddings

# Load preprocessed data
texts = df['Tweet']  # List to store your text data (sarcastic and non-sarcastic)
labels = df['Sarcasm']  # List to store labels (1 for sarcastic, 0 for non-sarcastic)


# Tokenization
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Padding sequences to a fixed length
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Define the Bidirectional LSTM model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=True)))  # Use Bidirectional LSTM
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model (replace with desired evaluation metrics)
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

# Make predictions on new data (optional)
new_text = "This new restaurant is a real gem, NOT."  # Replace with your text
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=max_len)
prediction = model.predict(padded_sequence)
if prediction > 0.5:
    print("This text is predicted to be sarcastic.")
else:
    print("This text is predicted to be non-sarcastic.")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.2059, Accuracy: 0.7521
This text is predicted to be non-sarcastic.


2. Gated Recurrent Units (GRUs):

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# Hyperparameters (adjust as needed)
max_len = 100  # Maximum sequence length
vocab_size = 10000  # Limit on the number of words in vocabulary
embedding_dim = 128  # Dimensionality of word embeddings

# Load preprocessed data (replace with your actual data)
texts = df['Tweet']  # List to store your text data (sarcastic and non-sarcastic)
labels = df['Sarcasm']  # List to store labels (1 for sarcastic, 0 for non-sarcastic)

# Tokenization
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Padding sequences to a fixed length
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Define the GRU model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(GRU(64, return_sequences=True))  # GRU layer with return_sequences=True
model.add(Dropout(0.2))
model.add(GRU(32))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model (replace with desired evaluation metrics)
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

# Make predictions on new data (optional)
new_text = "This new restaurant is a real gem, NOT."  # Replace with your text
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=max_len)
prediction = model.predict(padded_sequence)
if prediction > 0.5:
    print("This text is predicted to be sarcastic.")
else:
    print("This text is predicted to be non-sarcastic.")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.4554, Accuracy: 0.7370
This text is predicted to be non-sarcastic.


 3)Convolutional Neural Networks (CNNs) with Gated Convolutions

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Hyperparameters (adjust as needed)
max_len = 100  # Maximum sequence length
vocab_size = 10000  # Limit on the number of words in vocabulary
embedding_dim = 128  # Dimensionality of word embeddings
filter_sizes = [3, 4, 5]  # Kernel window sizes for Gated Convolutions
num_filters = 64  # Number of filters in the convolutional layers

# Load preprocessed data (replace with your actual data)
texts = df['Tweet']  # List to store your text data (sarcastic and non-sarcastic)
labels = df['Sarcasm']  # List to store labels (1 for sarcastic, 0 for non-sarcastic)

# Tokenization
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Padding sequences to a fixed length
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Embedding layer (convert words to vectors)
embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_len)

# Define the CNN model
model = Sequential()
model.add(embedding_layer)

# Gated Convolutional layers (with different kernel window sizes)
for filter_size in filter_sizes:
    conv_layer = Conv1D(num_filters, filter_size, activation='tanh', padding='same')
    gated_conv_layer = Conv1D(num_filters, filter_size, activation='sigmoid', padding='same')
    model.add(conv_layer)
    model.add(gated_conv_layer)
    model.add(Dropout(0.2))

# Global Max Pooling layer
model.add(GlobalMaxPooling1D())

# Dense layers for classification
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model (replace with desired evaluation metrics)
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

# Make predictions on new data (optional)
new_text = "This new restaurant is a real gem, NOT."  # Replace with your text
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=max_len)
prediction = model.predict(padded_sequence)
if prediction > 0.5:
    print("This text is predicted to be sarcastic.")
else:
    print("This text is predicted to be non-sarcastic.")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.8678, Accuracy: 0.7253
This text is predicted to be non-sarcastic.


RNN (Recurrent Neural Networks)

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping  # For early stopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

max_len = 100
vocab_size = 10000
embedding_dim = 128
texts = df['Tweet']

# Handle potential NaN values and ensure labels are within range
labels = df['Sarcasm'].map({'sarcasm': 1, 'non-sarcasm': 0}).fillna(0).astype(int)

# One-hot encode labels for categorical crossentropy
labels = to_categorical(labels, num_classes=2)  # 2 classes (sarcastic, non-sarcastic)

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Define the RNN model using SimpleRNN
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(SimpleRNN(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(SimpleRNN(32))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))  # Output layer with softmax for multiple classes

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

# Example prediction
new_text = "This new restaurant is a real gem, NOT."
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=max_len)
prediction = model.predict(padded_sequence)
predicted_class = prediction.argmax(axis=1)[0]

if predicted_class == 1:
    print("This text is predicted to be sarcastic.")
else:
    print("This text is predicted to be non-sarcastic.")


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 0.0000, Accuracy: 1.0000
This text is predicted to be non-sarcastic.
