In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
from sklearn.pipeline import Pipeline

In [32]:
# Downloading NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\haree\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
# Loading dataset
data = pd.read_csv('cleaned_balanced_dataset_FINAL.csv')

# Displaying first few rows of the dataset
data.head()


Unnamed: 0,label,comment
0,1,need
1,0,might well milk last
2,1,ask locktrap
3,1,im glad community doesnt make console player f...
4,0,joke put stitch


In [34]:
print("Columns in the dataset:", data.columns)

Columns in the dataset: Index(['label', 'comment'], dtype='object')


In [35]:
# Handling missing values and converting all entries to strings
data['comment'] = data['comment'].fillna('').astype(str)

In [36]:
def tokenize_text(text, tokenizer):
    if tokenizer == 'word':
        return word_tokenize(text)
    elif tokenizer == 'sentence':
        return sent_tokenize(text)
    elif tokenizer == 'tweet':
        tknzr = TweetTokenizer()
        return tknzr.tokenize(text)
    else:
        raise ValueError("Unknown tokenizer. Choose 'word', 'sentence', or 'tweet'.")

In [37]:
# Tokenizing text using word tokenizer
data['word_tokenized'] = data['comment'].apply(lambda x: tokenize_text(x, 'word'))
data.head()

Unnamed: 0,label,comment,word_tokenized
0,1,need,[need]
1,0,might well milk last,"[might, well, milk, last]"
2,1,ask locktrap,"[ask, locktrap]"
3,1,im glad community doesnt make console player f...,"[im, glad, community, doesnt, make, console, p..."
4,0,joke put stitch,"[joke, put, stitch]"


In [39]:
# Tokenizing text using sentence tokenizer
data['sentence_tokenized'] = data['comment'].apply(lambda x: tokenize_text(x, 'sentence'))
data.head()

Unnamed: 0,label,comment,word_tokenized,sentence_tokenized
0,1,need,[need],[need]
1,0,might well milk last,"[might, well, milk, last]",[might well milk last]
2,1,ask locktrap,"[ask, locktrap]",[ask locktrap]
3,1,im glad community doesnt make console player f...,"[im, glad, community, doesnt, make, console, p...",[im glad community doesnt make console player ...
4,0,joke put stitch,"[joke, put, stitch]",[joke put stitch]


In [40]:
# Tokenizing text using tweet tokenizer
data['tweet_tokenized'] = data['comment'].apply(lambda x: tokenize_text(x, 'tweet'))
data.head()

Unnamed: 0,label,comment,word_tokenized,sentence_tokenized,tweet_tokenized
0,1,need,[need],[need],[need]
1,0,might well milk last,"[might, well, milk, last]",[might well milk last],"[might, well, milk, last]"
2,1,ask locktrap,"[ask, locktrap]",[ask locktrap],"[ask, locktrap]"
3,1,im glad community doesnt make console player f...,"[im, glad, community, doesnt, make, console, p...",[im glad community doesnt make console player ...,"[im, glad, community, doesnt, make, console, p..."
4,0,joke put stitch,"[joke, put, stitch]",[joke put stitch],"[joke, put, stitch]"


In [41]:
# Splitting data into training and test sets
if 'comment' in data.columns and 'label' in data.columns:
    X_train, X_test, y_train, y_test = train_test_split(data['comment'], data['label'], test_size=0.2, random_state=42)
else:
    raise KeyError("The 'comment' or 'label' column is not present in the dataset.")

In [42]:
# Function to create and evaluate a Random Forest model
def evaluate_model(X_train, y_train, X_test, y_test, tokenizer):
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=lambda x: tokenize_text(x, tokenizer))),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"Results for {tokenizer} tokenizer:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [43]:
# Evaluating model using word tokenizer
evaluate_model(X_train, y_train, X_test, y_test, 'word')



Results for word tokenizer:
Accuracy: 0.636314696608475
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.67      0.65     13045
           1       0.65      0.60      0.62     12961

    accuracy                           0.64     26006
   macro avg       0.64      0.64      0.64     26006
weighted avg       0.64      0.64      0.64     26006

Confusion Matrix:
 [[8786 4259]
 [5199 7762]]


In [44]:
# Evaluating model using sentence tokenizer
evaluate_model(X_train, y_train, X_test, y_test, 'sentence')



Results for sentence tokenizer:
Accuracy: 0.5228408828731831
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.98      0.67     13045
           1       0.77      0.06      0.11     12961

    accuracy                           0.52     26006
   macro avg       0.64      0.52      0.39     26006
weighted avg       0.64      0.52      0.39     26006

Confusion Matrix:
 [[12810   235]
 [12174   787]]


In [45]:
# Evaluating model using tweet tokenizer
evaluate_model(X_train, y_train, X_test, y_test, 'tweet')



Results for tweet tokenizer:
Accuracy: 0.6351611166653849
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.67      0.65     13045
           1       0.65      0.60      0.62     12961

    accuracy                           0.64     26006
   macro avg       0.64      0.64      0.63     26006
weighted avg       0.64      0.64      0.63     26006

Confusion Matrix:
 [[8805 4240]
 [5248 7713]]


In [46]:
# Conclusion
print("Based on the evaluation, the word tokenizer performs the best for the sarcasm detection task using a Random Forest model.")

Based on the evaluation, the word tokenizer performs the best for the sarcasm detection task using a Random Forest model.


## Summary 
Imported Libraries: Loaded necessary libraries for data processing, text tokenization, and machine learning.

Loaded Dataset: Read a CSV file containing sarcastic comments and labels, and handled any missing values by converting all entries to strings.

Defined Tokenization Function: Created a function to tokenize text using different methods: word tokenization, sentence tokenization, and tweet tokenization.

Tokenized Text: Applied the tokenization function to the comments in the dataset, creating new columns for each type of tokenization.

Split Data: Divided the dataset into training and test sets with an 80-20 split.

Evaluated Model: Built and evaluated a Random Forest model for sarcasm detection using each type of tokenization, printing accuracy, classification reports, and confusion matrices.

Conclusion: Determined that the word tokenizer provided the best performance for the sarcasm detection task using the Random Forest model.

In [51]:
"""import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load dataset
data = pd.read_csv('cleaned_balanced_dataset_FINAL.csv')

# Preprocessing
data['processed_comment'] = data['comment'].str.lower().str.replace('[^\w\s]', '')
data['processed_comment_str'] = data['processed_comment'].astype(str)

# One Hot Encoding
onehot_vectorizer = CountVectorizer(binary=True)
X_onehot = onehot_vectorizer.fit_transform(data['processed_comment_str'])

# Term Frequency
tf_vectorizer = CountVectorizer()
X_tf = tf_vectorizer.fit_transform(data['processed_comment_str'])

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = tfidf_vectorizer.fit_transform(data['processed_comment_str'])

# Word2Vec Embeddings with TensorFlow
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['processed_comment_str'])
word_index = tokenizer.word_index

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(data['processed_comment_str'])

# Pad sequences
maxlen = 100
X_padded = pad_sequences(sequences, maxlen=maxlen)

# Train Word2Vec-like embedding layer
embedding_dim = 100
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embedding_dim
)

# Get Word2Vec embeddings
input_sequences = tf.convert_to_tensor(X_padded)
embeddings = embedding_layer(input_sequences)

# Average embeddings to get sentence vectors
X_word2vec = tf.reduce_mean(embeddings, axis=1).numpy()

# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])

# Train-test split function
def train_test_split_data(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Model
def train_evaluate_model(X_train, X_test, y_train, y_test):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return classification_report(y_test, y_pred, output_dict=True)

# Prepare data for model training
X_train_onehot, X_test_onehot, y_train, y_test = train_test_split_data(X_onehot, y)
X_train_tf, X_test_tf, y_train, y_test = train_test_split_data(X_tf, y)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split_data(X_tfidf, y)
X_train_w2v, X_test_w2v, y_train, y_test = train_test_split_data(X_word2vec, y)

# Evaluate models
performance_onehot = train_evaluate_model(X_train_onehot, X_test_onehot, y_train, y_test)
performance_tf = train_evaluate_model(X_train_tf, X_test_tf, y_train, y_test)
performance_tfidf = train_evaluate_model(X_train_tfidf, X_test_tfidf, y_train, y_test)
performance_w2v = train_evaluate_model(X_train_w2v, X_test_w2v, y_train, y_test)

# Print the performance comparison for each encoding method
print("Performance Comparison:")
print("One Hot Encoding Model Performance:\n", classification_report(y_test, model.predict(X_test_onehot)))
print("Term Frequency Model Performance:\n", classification_report(y_test, model.predict(X_test_tf)))
print("TF-IDF Model Performance:\n", classification_report(y_test, model.predict(X_test_tfidf)))
print("Word2Vec Model Performance:\n", classification_report(y_test, model.predict(X_test_w2v)))

# Select the best encoding method based on F1-score
performance = {
    "One Hot Encoding": performance_onehot['weighted avg']['f1-score'],
    "Term Frequency": performance_tf['weighted avg']['f1-score'],
    "TF-IDF": performance_tfidf['weighted avg']['f1-score'],
    "Word2Vec": performance_w2v['weighted avg']['f1-score']
}

best_encoding = max(performance, key=performance.get)

print(f"The best encoding method based on F1-score is: {best_encoding}")

# Compare performance of each encoding method
import matplotlib.pyplot as plt

labels = ['One Hot', 'Term Frequency', 'TF-IDF', 'Word2Vec']
f1_scores = [performance_onehot['weighted avg']['f1-score'],
             performance_tf['weighted avg']['f1-score'],
             performance_tfidf['weighted avg']['f1-score'],
             performance_w2v['weighted avg']['f1-score']]

plt.figure(figsize=(10, 6))
plt.bar(labels, f1_scores, color=['blue', 'green', 'red', 'purple'])
plt.xlabel('Encoding Method')
plt.ylabel('F1-score')
plt.title('Comparison of Encoding Methods for Sarcasm Detection')
plt.show()"""


  """import pandas as pd


'import pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import classification_report\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\nimport numpy as np\nimport tensorflow as tf\nfrom tensorflow.keras.preprocessing.text import Tokenizer\nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\n\n# Load dataset\ndata = pd.read_csv(\'cleaned_balanced_dataset_FINAL.csv\')\n\n# Preprocessing\ndata[\'processed_comment\'] = data[\'comment\'].str.lower().str.replace(\'[^\\w\\s]\', \'\')\ndata[\'processed_comment_str\'] = data[\'processed_comment\'].astype(str)\n\n# One Hot Encoding\nonehot_vectorizer = CountVectorizer(binary=True)\nX_onehot = onehot_vectorizer.fit_transform(data[\'processed_comment_str\'])\n\n# Term Frequency\ntf_vectorizer = CountVectorizer()\nX_tf = tf_vectorizer.fit_transform(data[\'processed_c