In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import os

In [2]:
# Downloading the Punkt tokenizer data that word_tokenize relies on.
# NLTK releases from 3.8.2 onward load the 'punkt_tab' resource instead of the
# legacy pickled 'punkt' models, so both are requested here; otherwise the
# tokenization cell raises LookupError on a fresh install of a recent NLTK.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hetvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Loading Dataset
# The CSV path is relative, so the file must sit in the notebook's working directory.
dataset_path = 'train-balanced-sarcasm.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows to confirm the columns were parsed.
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [4]:
# Removing NaN Values
# Keep only the rows whose 'comment' cell is present; other columns may still hold NaN.
has_comment = df['comment'].notna()
df = df[has_comment]

In [5]:
# Ensure all comments are strings
# The tokenizer step calls .lower() on every value, so every cell must be a str.
comment_text = df['comment'].astype(str)
df['comment'] = comment_text

In [6]:
# Tokenize text
def preprocess_text(text):
    """Lower-case ``text`` and split it into tokens with NLTK's ``word_tokenize``.

    Parameters
    ----------
    text : str
        A single raw comment.

    Returns
    -------
    The token sequence produced by ``word_tokenize`` for the lower-cased text.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

# Tokenize every comment; each cell of 'tokens' holds the result of preprocess_text.
df['tokens'] = df['comment'].map(preprocess_text)

In [7]:
# Implementing Word2Vec model
# Note: the embeddings are trained on every tokenized comment in df, i.e. before
# the train/test split performed in a later cell.
word2vec_model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every token, including ones seen only once
    workers=4,        # number of training threads
)

In [8]:
# Converting each text sample to a fixed-size vector
def vectorize_text(tokens, model):
    """Represent a tokenized text as the mean of its word vectors.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one text sample.
    model : object
        Trained embedding model exposing ``model.wv`` (supports ``word in wv``
        and ``wv[word]``) and ``model.vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of the
        vectors of all tokens found in ``model.wv``, or an all-zero vector when
        no token is known (including an empty token list).
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if vectors:
        return np.mean(vectors, axis=0)
    # gensim's trained vectors are float32; returning float32 zeros here keeps
    # the fallback consistent so that stacking all rows does not silently
    # upcast the whole feature matrix to float64.
    return np.zeros(model.vector_size, dtype=np.float32)

In [9]:
# Applying vectorization
def _embed_tokens(tokens):
    """Return the averaged Word2Vec vector for one tokenized comment."""
    return vectorize_text(tokens, word2vec_model)

df['vector'] = df['tokens'].apply(_embed_tokens)

In [10]:
# Checking if vectorization was successful
if 'vector' not in df.columns:
    # Fail loudly: merely printing would leave X and y undefined (or stale from
    # an earlier kernel run), and the following cells would then break or
    # silently train on outdated data.
    raise RuntimeError("Vectorization failed.")

# Preparing data for model training
# Stack the per-comment vectors into one 2-D feature matrix (one row per comment).
X = np.vstack(df['vector'].values)
y = df['label']

In [11]:
# Splitting the data into training and testing sets
# 20% of the rows are held out for testing; random_state fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Training a Naive Bayes classifier
# fit() returns the estimator itself, so construction and training can be chained.
nb_model = GaussianNB().fit(X_train, y_train)
nb_model

In [13]:
# Evaluating the model
# Predict on the held-out split, then score the predictions against the true labels.
y_pred = nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# One print call, newline-separated: accuracy line, report header, report body.
print(f'Accuracy: {accuracy:.4f}', 'Classification Report:', report, sep='\n')

Accuracy: 0.5787
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.44      0.51    100894
           1       0.56      0.72      0.63    101261

    accuracy                           0.58    202155
   macro avg       0.58      0.58      0.57    202155
weighted avg       0.58      0.58      0.57    202155

