In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download NLTK data for tokenization
nltk.download('punkt')

# Mount Google Drive to access the dataset
from google.colab import drive
drive.mount('/content/drive')

# Define the file path to the dataset in Google Drive
file_path = '/content/drive/My Drive/train-balanced-sarcasm.csv'

# Load the dataset into a pandas DataFrame
df = pd.read_csv(file_path)

# Reduce the dataset size to 1% for faster processing
df = df.sample(frac=0.01, random_state=42)

# Remove rows with NaN values in the 'comment' column
df = df.dropna(subset=['comment'])

# Ensure all comments are strings and handle NaN values
df['comment'] = df['comment'].astype(str)
df['comment'] = df['comment'].apply(lambda x: x if pd.notnull(x) else '')

# Preprocessing function to clean and tokenize text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and non-alphanumeric characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Join tokens back into a single string
    return ' '.join(tokens)

# Apply preprocessing to the 'comment' column
df['processed_comment'] = df['comment'].apply(preprocess_text)

# TF-IDF vectorization to convert text data into numerical features
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_comment']).toarray()

# Encode labels into binary values (0 or 1)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['label'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

# Reshape data to fit the input shape required by RNN
max_sequence_length = X_tfidf.shape[1]
X_train_reshaped = X_train.reshape((X_train.shape[0], max_sequence_length, 1))
X_test_reshaped = X_test.reshape((X_test.shape[0], max_sequence_length, 1))

# Build the RNN model
model = Sequential()
model.add(SimpleRNN(units=32, input_shape=(max_sequence_length, 1), return_sequences=False))  # RNN layer with 32 units
model.add(Dropout(0.5))  # Dropout layer to prevent overfitting
model.add(Dense(10, activation='relu'))  # Fully connected layer with ReLU activation
model.add(Dropout(0.5))  # Another dropout layer
model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid activation for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train, epochs=2, batch_size=32, validation_data=(X_test_reshaped, y_test))

# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f'Accuracy: {accuracy:.4f}')

# Generate a classification report
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test_reshaped) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1/2
Epoch 2/2
Accuracy: 0.4763
              precision    recall  f1-score   support

           0       0.48      1.00      0.64       963
           1       0.50      0.00      0.01      1059

    accuracy                           0.48      2022
   macro avg       0.49      0.50      0.33      2022
weighted avg       0.49      0.48      0.31      2022

