### Loading the dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split


In [2]:
# Load the preprocessed comments CSV into a DataFrame.
# NOTE(review): '/content/...' is an absolute path — presumably this was run on Colab; adjust it when running elsewhere.
data_frame = pd.read_csv('/content/preprocessed_dataset.csv')

In [3]:
# Display the loaded frame (long frames are truncated in the rendered output)
data_frame

Unnamed: 0,labels,cleaned_comment
0,1,user thanks showing appointment today
1,1,haha lol
2,1,love waiting num min cab shortage user please ...
3,1,22 super funny quote funnyquotes funnysayings ...
4,1,goog morning sorrynotsorry morning
...,...,...
19415,0,good read
19416,0,people living condition always remember blesse...
19417,0,winner seungyoon simple rt u save repost edit ...
19418,0,ok concept kravitz passing note magnus julia p...


### Partitioning the data into training and validation sets

In [4]:
# Hold out 20% of the rows for validation; the fixed seed makes the partition repeatable
train_df, val_df = train_test_split(
    data_frame,
    random_state=42,
    test_size=0.2,
)



In [5]:
# Give both partitions a fresh 0..n-1 index, discarding the shuffled original row labels
train_df, val_df = (
    part.reset_index(drop=True) for part in (train_df, val_df)
)

In [6]:
# Peek at the first rows of the training partition
train_df.head()

Unnamed: 0,labels,cleaned_comment
0,0,regret slightest
1,1,former homeowner send back got back great cust...
2,0,lmao made
3,0,black
4,0,end dog fighting


In [7]:
# Peek at the first rows of the validation partition
val_df.head()

Unnamed: 0,labels,cleaned_comment
0,0,took 2 year put gladly spend next 2 colombia l...
1,0,spent 10 min playing dog looked like happy hum...
2,1,download problem tag theamazinggag theamazinga...
3,1,truth toofunny willandgrace handersen 79
4,1,editing b bad il justify later life chat emoti...


In [8]:
# Save the validation partition to 'validation.csv' in the working directory.
# NOTE(review): to_csv writes the index as an extra unnamed first column by default;
# pass index=False if readers of validation.csv do not need it — confirm against the consumer.
val_df.to_csv('validation.csv')

In [9]:
# Work on a copy so the text cleaning applied to df below leaves train_df untouched
df = train_df.copy()

In [10]:
# Peek at the first rows of the working copy
df.head()

Unnamed: 0,labels,cleaned_comment
0,0,regret slightest
1,1,former homeowner send back got back great cust...
2,0,lmao made
3,0,black
4,0,end dog fighting



### Tokenization and Lemmatization:
**Tokenization:** Splitting text into individual words or tokens.





**Lemmatization:** Converting tokens to their base or root forms. For example, "running" becomes "run" after lemmatization.







In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Fetch the NLTK resources used below: tokenizer models, WordNet and the stop-word lists.
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt' — confirm against the installed version.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:

# Create one WordNet lemmatizer; preprocess_text reuses it for every token
lemmatizer = WordNetLemmatizer()

In [13]:

# Function to preprocess text
_STOP_WORDS = None  # filled on first use so the stop-word corpus is read only once


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on the first call only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one comment before TF-IDF vectorisation.

    Steps: lowercase, strip every character that is not a letter, digit or
    whitespace, tokenize, drop English stop words, lemmatize what remains.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV field) is treated as an
            empty comment instead of raising ``AttributeError``.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing values arrive as float NaN and have no .lower(); returning ''
    # lets the empty-comment filter applied after this step drop such rows.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords (set is built once and reused across calls)
    stop_words = _english_stop_words()
    # Lemmatize the tokens that survive the stop-word filter
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [14]:

# Run every training comment through preprocess_text and store the result back in the same column
df['cleaned_comment'] = df['cleaned_comment'].map(preprocess_text)

In [15]:
# Keep only the rows whose cleaned comment still has non-whitespace content
df = df.loc[df['cleaned_comment'].str.strip().ne('')]


In [16]:

# Show the first rows of df after cleaning and empty-row filtering
df.head()


Unnamed: 0,labels,cleaned_comment
0,0,regret slightest
1,1,former homeowner send back got back great cust...
2,0,lmao made
3,0,black
4,0,end dog fighting


# TF-IDF

TF-IDF stands for Term Frequency-Inverse Document Frequency. It is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus). The TF-IDF score increases proportionally to the number of times a word appears in a document and is offset by the frequency of the word in the corpus.

### Formula:

**Term Frequency (TF):**

$$ \text{TF}(t, d) = \frac{\text{Number of times term } t \text{ appears in document } d}{\text{Total number of terms in document } d} $$

**Inverse Document Frequency (IDF):**

$$ \text{IDF}(t, D) = \log \left( \frac{\text{Total number of documents in } D}{\text{Number of documents containing term } t} \right) $$

**TF-IDF:**

$$ \text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D) $$


In [17]:
# Column names used by the vectorization step: the input text and the class label
text_column, target_column = 'cleaned_comment', 'labels'

In [18]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split



# Class labels for the classifier
y = df[target_column]

# Turn the cleaned comments into TF-IDF features, keeping at most 5000 terms.
# NOTE(review): the vocabulary and IDF weights are fitted on every row of df,
# i.e. before any train/test split is made further down.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df[text_column])

In [19]:

# Apply SMOTE to the TF-IDF vectorized data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [20]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [21]:
import pickle
# Persist the fitted vectorizer together with the four split arrays in a single pickle
with open('tfidf_smote_preprocessed_data.pkl', 'wb') as file:
    artifacts = (tfidf_vectorizer, X_train, X_test, y_train, y_test)
    pickle.dump(artifacts, file)

In [22]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score

# Restore the vectorizer and split data from the pickle written earlier in this notebook
# (unpickling is acceptable here only because the notebook produced this file itself)
with open('tfidf_smote_preprocessed_data.pkl', 'rb') as file:
    restored = pickle.load(file)
tfidf_vectorizer, X_train, X_test, y_train, y_test = restored

# Expand the sparse TF-IDF matrices into dense arrays (if memory allows)
X_train, X_test = X_train.toarray(), X_test.toarray()



### Feedforward Neural Network (FNN) for Sarcasm Detection

This code snippet builds a Feedforward Neural Network (FNN) model using the Sequential API from TensorFlow's Keras library. The model is designed to perform binary classification for sarcasm detection in YouTube comments. The architecture consists of dense (fully connected) layers with ReLU activation functions and dropout layers to prevent overfitting. The final output layer uses a sigmoid activation function to produce a probability score for the binary classification task.

#### Model Architecture:
1. **Input Layer**:
    - Dense layer with 512 units and ReLU activation function.
    - Dropout layer with a 50% dropout rate.

2. **Hidden Layer**:
    - Dense layer with 256 units and ReLU activation function.
    - Dropout layer with a 50% dropout rate.

3. **Output Layer**:
    - Dense layer with 1 unit and sigmoid activation function for binary classification.

In [23]:
# Build the model: two ReLU layers (512 and 256 units), each followed by 50% dropout,
# then a single sigmoid unit that outputs the class probability
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])




In [24]:
# Configure training: Adam at learning rate 0.001, binary cross-entropy loss, accuracy as the tracked metric
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)



In [25]:
# Fit for 10 epochs in mini-batches of 32; the test split is also passed as the per-epoch validation data
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Score the trained model on the test split (loss first, then accuracy)
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Turn the predicted probabilities into 0/1 labels with a 0.5 cut-off
predicted_probabilities = model.predict(X_test)
y_test_pred = (predicted_probabilities > 0.5).astype("int32")

# F1 of the thresholded predictions against the true test labels
f1 = f1_score(y_test, y_test_pred)
print(f"Test F1 Score: {f1}")

Test Loss: 1.0513650178909302
Test Accuracy: 0.7836158275604248
Test F1 Score: 0.7883977900552487


In [28]:
pip install keras-tuner


Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


## Training and Evaluating the Model with Best Hyperparameters

In [31]:
# Call fit() again on the current `model` for another 10 epochs.
# NOTE(review): no tuner search or model rebuild is visible before this cell, so as
# written this continues training the network fitted earlier rather than one built
# from tuned hyperparameters — confirm.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)


# Score the model on the test split (loss first, then accuracy)
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Turn the predicted probabilities into 0/1 labels with a 0.5 cut-off
predicted_probabilities = model.predict(X_test)
y_test_pred = (predicted_probabilities > 0.5).astype("int32")

# F1 of the thresholded predictions against the true test labels
f1 = f1_score(y_test, y_test_pred)
print(f"Test F1 Score: {f1}")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 2.2209410667419434
Test Accuracy: 0.7935028076171875
Test F1 Score: 0.7884225759768451


## Hyperparameter Tuning for Feedforward Neural Network (FNN)
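This section rebuilds the same Feedforward Neural Network (FNN) with TensorFlow's Keras Sequential API and retrains it from freshly initialised weights for sarcasm detection in YouTube comments. The hyperparameters (layer sizes, dropout rate, learning rate, number of epochs, batch size) are set by hand in the cells below; no automated search is run here, so the result serves as the baseline that a hyperparameter search (for example with keras-tuner) would try to improve on: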

In [34]:
# Rebuild the network from scratch: two ReLU layers (512 and 256 units), each followed
# by 50% dropout, then a single sigmoid unit that outputs the class probability
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])


In [35]:
# Configure training: Adam at learning rate 0.001, binary cross-entropy loss, accuracy as the tracked metric
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)


In [36]:
# Fit for 10 epochs in mini-batches of 32; the test split is also passed as the per-epoch validation data
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
# Score the retrained model on the test split (loss first, then accuracy)
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Turn the predicted probabilities into 0/1 labels with a 0.5 cut-off
predicted_probabilities = model.predict(X_test)
y_test_pred = (predicted_probabilities > 0.5).astype("int32")

# F1 of the thresholded predictions against the true test labels
f1 = f1_score(y_test, y_test_pred)
print(f"Test F1 Score: {f1}")



Test Loss: 1.1736729145050049
Test Accuracy: 0.7790960669517517
Test F1 Score: 0.7879609544468547


### Model Performance Summary

The Feedforward Neural Network (FNN) model developed for sarcasm detection in YouTube comments achieved the following results on the test dataset:

- **Accuracy**: 77.91%
- **F1 Score**: 0.78796

#### Insights:

- **Accuracy**: The model assigns the correct label (sarcastic or non-sarcastic) to approximately 78% of the test comments. Accuracy counts both classes together, so on its own it does not show how many of the sarcastic comments are actually caught.

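- **F1 Score**: The F1 score of 0.78796 is the harmonic mean of precision and recall for the positive class (label 1), which makes it a more informative single number than accuracy for binary classification. Because it condenses both into one value, precision and recall should also be reported separately to judge how balanced they are.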

#### Conclusion:

This FNN model serves as a solid foundation for sarcasm detection, providing a reasonable balance between accuracy and generalization. Further optimization through hyperparameter tuning and model refinement could potentially enhance its performance, ensuring robust detection capabilities in real-world applications.

