In [1]:
#Importing required libraries:
import os

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


In [2]:
# Loading Dataset
# The CSV location can be overridden through the SARCASM_DATA_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    'SARCASM_DATA_PATH',
    '/Users/jashanjeetsingh/Downloads/train-balanced-sarcasm.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()  # last expression of the cell: shown as the cell's output

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [3]:
# Removing NaN values: discard every row whose 'comment' entry is missing.
# Only the 'comment' column is inspected; other columns may still hold NaN.
df = df.dropna(axis='index', how='any', subset=['comment'])

In [4]:
# Note: the new 'tokens' column below is assigned through .loc to avoid pandas' SettingWithCopyWarning
def preprocess_text(text):
    """Lowercase ``text`` and split it into tokens.

    Args:
        text: The raw text; it must provide ``.lower()`` (i.e. be a string).

    Returns:
        The result of ``word_tokenize`` applied to the lowercased text.
    """
    # NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
    # installed locally - confirm it is available before a fresh run.
    lowered = text.lower()
    return word_tokenize(lowered)

# Tokenize every comment, then store the result in the 'tokens' column.
tokenized_comments = df['comment'].apply(preprocess_text)
df.loc[:, 'tokens'] = tokenized_comments


In [5]:
# Implementing Word2Vec model:
# The embeddings are trained on the 'tokens' column of the whole DataFrame.
# NOTE(review): this happens before the train/test split, so the embeddings
# also see comments that later end up in the test set - confirm this is intended.
word2vec_model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [6]:
# Converting each text sample to a fixed-size vector
def vectorize_text(tokens, model):
    """Average the word vectors of ``tokens`` into one fixed-size vector.

    Args:
        tokens: Iterable of words to look up.
        model: Object exposing ``wv`` (supporting ``word in model.wv`` and
            ``model.wv[word]``) and an integer ``vector_size``.

    Returns:
        A 1-D array: the element-wise mean of the vectors of all tokens found
        in ``model.wv``, or a float32 zero vector of length
        ``model.vector_size`` when no token is found (including empty input).
    """
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    # gensim stores word vectors as float32; returning float32 zeros keeps the
    # fallback consistent with the averaged vectors, so stacking all rows later
    # does not silently upcast the whole feature matrix to float64.
    return np.zeros(model.vector_size, dtype=np.float32)

# Turn each row's tokens into one averaged embedding and keep it in 'vector'.
# The trained model is forwarded to vectorize_text as its second positional argument.
df.loc[:, 'vector'] = df['tokens'].apply(vectorize_text, args=(word2vec_model,))


In [7]:
# Preparing data for model training
# Target: the 'label' column.
y = df['label']
# Features: stack the per-row vectors into one 2-D array (one row per sample).
X = np.vstack(df['vector'].to_numpy())


In [8]:
# Splitting the data: 20% of the samples are held out for testing, and the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Training a Random Forest classifier (10 trees, fixed seed, n_jobs=-1).
# NOTE(review): this cell shows execution count 13 while the evaluation cells
# show 10 and 12, so the saved metrics may come from an earlier fit - re-run
# the notebook top to bottom to confirm the reported numbers.
rf_model = RandomForestClassifier(
    n_estimators=10,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)


In [10]:
# Evaluating the model
# Predict on the held-out features, then score the predictions against y_test.
y_pred = rf_model.predict(X_test)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)


In [12]:
# Show the accuracy (four decimals) followed by the classification report,
# one item per line.
print(f'Accuracy: {accuracy:.4f}', 'Classification Report:', report, sep='\n')

Accuracy: 0.6859
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.71      0.69    101016
           1       0.70      0.66      0.68    101139

    accuracy                           0.69    202155
   macro avg       0.69      0.69      0.69    202155
weighted avg       0.69      0.69      0.69    202155



In [14]:
# Conclusion: the Random Forest classifier reaches an accuracy of 68.59% on the held-out test set (20% of the data).