In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load dataset
file_path = 'sarcasm_training (1).csv' 

In [2]:
# Display the first few rows of the dataset after filtering meaningless words
print("First few rows of the dataset after filtering meaningless words:")
print(df.head())

# 1. One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse_output=False)
X_onehot = onehot_encoder.fit_transform(df['cleaned_text'].values.reshape(-1, 1))

# 2. Label Encoding
label_encoder = LabelEncoder()
X_label = label_encoder.fit_transform(df['cleaned_text'])

# 3. TF-IDF Encoding
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text']).toarray()

# 4. Word2Vec
sentences = df['cleaned_text'].apply(word_tokenize).tolist()
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Create a Word2Vec representation for each comment by averaging the word vectors
def get_word2vec_embedding(text):
    words = text.split()
    word_vecs = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(100)

X_word2vec = np.array([get_word2vec_embedding(text) for text in df['cleaned_text']])

# 5. Term Frequency Encoding
count_vectorizer = CountVectorizer(max_features=5000)
X_term_freq = count_vectorizer.fit_transform(df['cleaned_text']).toarray()

# Splitting Data
from sklearn.model_selection import train_test_split

X_train_onehot, X_test_onehot, y_train, y_test = train_test_split(X_onehot, df['labels'].values, test_size=0.2, random_state=42)
X_train_label, X_test_label, _, _ = train_test_split(X_label.reshape(-1, 1), df['labels'].values, test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(X_tfidf, df['labels'].values, test_size=0.2, random_state=42)
X_train_word2vec, X_test_word2vec, _, _ = train_test_split(X_word2vec, df['labels'].values, test_size=0.2, random_state=42)
X_train_term_freq, X_test_term_freq, _, _ = train_test_split(X_term_freq, df['labels'].values, test_size=0.2, random_state=42)

print("Training and testing sets prepared.")

# Print shapes of the encoded datasets to verify
print("One-Hot Encoded shape:", X_train_onehot.shape)
print("Label Encoded shape:", X_train_label.shape)
print("TF-IDF Encoded shape:", X_train_tfidf.shape)
print("Word2Vec Encoded shape:", X_train_word2vec.shape)
print("Term Frequency Encoded shape:", X_train_term_freq.shape)

First few rows of the dataset after filtering meaningless words:
                                                text  labels  \
0  <user> thanks for showing up for our appointme...       1   
1                                      haha .  # lol       1   
2  i love waiting <num> min for a cab - such shor...       1   
3  22 super funny quotes # funnyquotes  # funnysa...       1   
4            goog morning  # sorrynotsorry # morning       1   

                                        cleaned_text  
0              user thanks showing appointment today  
1                                           haha lol  
2  love waiting num min cab shortage user please ...  
3  super funny quotes funnyquotes funnysayings hi...  
4                      morning sorrynotsorry morning  
Training and testing sets prepared.
One-Hot Encoded shape: (15568, 19214)
Label Encoded shape: (15568, 1)
TF-IDF Encoded shape: (15568, 5000)
Word2Vec Encoded shape: (15568, 100)
Term Frequency Encoded shape: (15568, 500

In [10]:
pip install numpy pandas scikit-learn gensim tensorflow

Note: you may need to restart the kernel to use updated packages.


In [None]:
df = pd.read_csv('sarcasm_training (1).csv')

## One-Hot Encoding

One-Hot Encoding transforms each word in the text into a binary vector of length equal to the size of the vocabulary. Each word is represented by a vector where only the index corresponding to that word is set to 1, and all other indices are set to 0. This method is useful for categorical text data but can result in high-dimensional sparse matrices, which may require substantial memory for large vocabularies.

In [15]:
import numpy as np
from scipy.sparse import csr_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define a function to train and evaluate a Random Forest classifier
def train_rf(X_train, X_test, y_train, y_test):
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy: {accuracy * 100:.2f}%')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))

# Tokenize the text data with a limited vocabulary size
max_words = 1000  # Further limit vocabulary to the top 1000 words
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['cleaned_comment'])
sequences = tokenizer.texts_to_sequences(df['cleaned_comment'])

# Create a sparse one-hot encoded matrix
onehot_results_sparse = csr_matrix((len(sequences), max_words), dtype=np.float32)

for i, seq in enumerate(sequences):
    for word_index in seq:
        if word_index < max_words:
            onehot_results_sparse[i, word_index] = 1

# Split data for One Hot Encoding
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(onehot_results_sparse, df['labels'], test_size=0.2, random_state=42)

# Convert sparse matrices to dense format before fitting the classifier (as RandomForestClassifier does not accept sparse matrices)
X_train_ohe = X_train_ohe.toarray()
X_test_ohe = X_test_ohe.toarray()

# Define and train the Random Forest classifier for One Hot Encoding
print("One Hot Encoding Results:")
train_rf(X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe)


  self._set_intXint(row, col, x.flat[0])


One Hot Encoding Results:
Test Accuracy: 71.37%
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.80      0.76      2210
           1       0.71      0.60      0.65      1754

    accuracy                           0.71      3964
   macro avg       0.71      0.70      0.70      3964
weighted avg       0.71      0.71      0.71      3964

Confusion Matrix:
[[1776  434]
 [ 701 1053]]


## TF-IDF Encoding

TF-IDF encoding combines Term Frequency (TF) with Inverse Document Frequency (IDF), which measures how unique or rare a term is across a collection of documents. This technique helps in highlighting words that are important to specific documents while down-weighting common terms that appear frequently across many documents. It balances the term's local importance with its global significance, making it a robust method for text representation in natural language processing.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Split data for TF-IDF encoding
X_train_text, X_test_text, y_train, y_test = train_test_split(df['cleaned_comment'], df['labels'], test_size=0.2, random_state=42)

# TF-IDF Encoding
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train_text)
X_test_tfidf = vectorizer_tfidf.transform(X_test_text)

# Define and train the Random Forest classifier for TF-IDF
def train_rf(X_train, X_test, y_train, y_test):
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy: {accuracy * 100:.2f}%')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))

# TF-IDF Encoding Results
print("TF-IDF Encoding Results:")
train_rf(X_train_tfidf, X_test_tfidf, y_train, y_test)


TF-IDF Encoding Results:
Test Accuracy: 72.75%
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.85      0.78      2210
           1       0.75      0.58      0.65      1754

    accuracy                           0.73      3964
   macro avg       0.73      0.71      0.71      3964
weighted avg       0.73      0.73      0.72      3964

Confusion Matrix:
[[1875  335]
 [ 745 1009]]


## Term Frequency (TF) Encoding
Term Frequency (TF) encoding represents the frequency of each word in a document. It captures how often a term appears in a document relative to the total number of words in that document. This method is simple and effective for understanding the distribution of terms within a text, but it does not account for the importance of terms across different documents.

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Split data for Term Frequency encoding
X_train_text, X_test_text, y_train, y_test = train_test_split(df['cleaned_comment'], df['labels'], test_size=0.2, random_state=42)

# Term Frequency Encoding
vectorizer_tf = CountVectorizer(max_features=5000)
X_train_tf = vectorizer_tf.fit_transform(X_train_text)
X_test_tf = vectorizer_tf.transform(X_test_text)

# Define and train the Random Forest classifier for Term Frequency
print("Term Frequency Encoding Results:")
train_rf(X_train_tf, X_test_tf, y_train, y_test)


Term Frequency Encoding Results:
Test Accuracy: 72.35%
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.85      0.77      2210
           1       0.75      0.56      0.64      1754

    accuracy                           0.72      3964
   macro avg       0.73      0.71      0.71      3964
weighted avg       0.73      0.72      0.72      3964

Confusion Matrix:
[[1881  329]
 [ 767  987]]


## Word2Vec Encoding
Word2Vec is a powerful word embedding technique that transforms words into continuous vector representations based on their context within a text corpus. This method enables the encoding of syntactic and semantic similarities, enhancing the performance of various natural language processing tasks.

In [11]:
from gensim.models import Word2Vec
import numpy as np

# Tokenize comments
df['tokenized_comment'] = df['cleaned_comment'].apply(lambda x: x.split())

# Train Word2Vec model
w2v_model = Word2Vec(sentences=df['tokenized_comment'], vector_size=100, window=5, min_count=1, workers=4)

# Helper function to get average Word2Vec embeddings
def get_avg_word2vec(tokens_list, model, vector_size):
    vec = np.zeros(vector_size).reshape((1, vector_size))
    count = 0
    for word in tokens_list:
        try:
            vec += model.wv[word].reshape((1, vector_size))
            count += 1
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

# Get Word2Vec embeddings for each comment
X_word2vec = np.concatenate([get_avg_word2vec(comment, w2v_model, 100) for comment in df['tokenized_comment']], axis=0)
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_word2vec, df['labels'], test_size=0.2, random_state=42)

# Define and train the Random Forest classifier for Word2Vec
print("Word2Vec Encoding Results:")
train_rf(X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v)


Word2Vec Encoding Results:
Test Accuracy: 65.11%
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.79      0.72      2210
           1       0.65      0.47      0.54      1754

    accuracy                           0.65      3964
   macro avg       0.65      0.63      0.63      3964
weighted avg       0.65      0.65      0.64      3964

Confusion Matrix:
[[1756  454]
 [ 929  825]]


### Optional(practice purposes only)
[Model Compilation and Training].

#### Tokenization: 
Converting text to sequences of integers.

#### Padding: 
Ensuring all sequences are of the same length.

#### Embedding Layer: 
Transforming words into dense vectors.

#### LSTM Layer: 
Capturing temporal patterns in the text.

#### Dropout Layer:
Preventing overfitting.

#### Model Compilation and Training:
Preparing and training the model for text classification.

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# Load dataset
df = pd.read_csv('sarcasm_training (1).csv')

# Verify column names
print(df.columns)

# Assuming 'cleaned_comment' exists, continue with tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad sequences
max_sequence_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Encode labels
y = to_categorical(df['labels'])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=128,
                    input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))  # 2 classes: sarcasm or not

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Example predictions (for demonstration purposes)
predictions = model.predict(X_test)


Index(['text', 'labels'], dtype='object')
Epoch 1/10




[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 84ms/step - accuracy: 0.6764 - loss: 0.5743 - val_accuracy: 0.8146 - val_loss: 0.4069
Epoch 2/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 81ms/step - accuracy: 0.9121 - loss: 0.2273 - val_accuracy: 0.8052 - val_loss: 0.4566
Epoch 3/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 81ms/step - accuracy: 0.9661 - loss: 0.0941 - val_accuracy: 0.7982 - val_loss: 0.5612
Epoch 4/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 84ms/step - accuracy: 0.9818 - loss: 0.0540 - val_accuracy: 0.7926 - val_loss: 0.7086
Epoch 5/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 84ms/step - accuracy: 0.9878 - loss: 0.0347 - val_accuracy: 0.7856 - val_loss: 0.9342
Epoch 6/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 95ms/step - accuracy: 0.9897 - loss: 0.0301 - val_accuracy: 0.7957 - val_loss: 0.9730
Epoch 7/10
[1m446/446[0m 