### One-Hot Encoding;

In [1]:
import pandas as pd
df=pd.read_csv("t_dataset.csv")
df_encoded = pd.get_dummies(df, columns=['Sarcasm'])

df_encoded.to_csv("onehot_dataset.csv", index=False)

### Random Forest Model;

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
tweets = df['Tweet']
y = df['Sarcasm'].apply(lambda x: 1 if x == 'yes' else 0)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(tweets)

if 'Category' in df.columns:
    encoder = OneHotEncoder(sparse=False)
    category_encoded = encoder.fit_transform(df[['Category']])
    category_df = pd.DataFrame(category_encoded, columns=encoder.get_feature_names_out(['Category']))
    X = pd.concat([pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()), category_df], axis=1)
else:
    X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

X.to_csv("one_hot_encoded_tweets.csv", index=False)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7956448911222781
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.65      0.73       250
           1       0.78      0.90      0.84       347

    accuracy                           0.80       597
   macro avg       0.80      0.78      0.78       597
weighted avg       0.80      0.80      0.79       597



### Term Frequency encoding;

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.tokenize import word_tokenize
tweets = df['Tweet']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(tweets)
tf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
final_df = pd.concat([df.drop('Tweet', axis=1), tf_df], axis=1)
final_df.to_csv("tfencoded_tweets.csv", index=False)



### Random Forest Model

In [10]:
y = df['Sarcasm'].apply(lambda x: 1 if x == 'yes' else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)  

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.695142378559464
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.53      0.59       250
           1       0.71      0.81      0.76       347

    accuracy                           0.70       597
   macro avg       0.69      0.67      0.67       597
weighted avg       0.69      0.70      0.69       597



### TF-IDF;

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import word_tokenize
tweets = df['Tweet']
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(tweets)
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
final_df = pd.concat([df.drop('Tweet', axis=1), tfidf_df], axis=1)
final_df.to_csv("tfidf_encoded_tweets.csv", index=False)

In [5]:
y = df['Sarcasm'].apply(lambda x: 1 if x == 'yes' else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)  

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7906197654941374
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.57      0.69       250
           1       0.75      0.95      0.84       347

    accuracy                           0.79       597
   macro avg       0.82      0.76      0.77       597
weighted avg       0.81      0.79      0.78       597



### Label Encoding;

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

df['Sarcasm_encoded'] = label_encoder.fit_transform(df['Sarcasm'])
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label encoding mapping:", label_mapping)
df.to_csv("label_encoded_tweets.csv", index=False)

Label encoding mapping: {'no': 0, 'yes': 1}


In [12]:
from sklearn.preprocessing import LabelEncoder
tweets = df['Tweet']
y = df['Sarcasm'].apply(lambda x: 1 if x == 'yes' else 0)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(tweets)

if 'Category' in df.columns:
    label_encoder = LabelEncoder()
    df['Category_encoded'] = label_encoder.fit_transform(df['Category'])
    X = pd.concat([pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()), df[['Category_encoded']]], axis=1)
else:
    X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

X.to_csv("label_encoded_tweets.csv", index=False)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7956448911222781
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.65      0.73       250
           1       0.78      0.90      0.84       347

    accuracy                           0.80       597
   macro avg       0.80      0.78      0.78       597
weighted avg       0.80      0.80      0.79       597



### Word 2 Vec;

In [8]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
model = Word2Vec(sentences=df['word_tokens'], vector_size=100, window=5, min_count=1, workers=4)
import numpy as np
def tweet_vector(tweet, model):
    # Remove out-of-vocabulary words
    words = [word for word in tweet if word in model.wv.key_to_index]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    else:
        return np.zeros(model.vector_size)

df['tweet_vector'] = df['word_tokens'].apply(lambda x: tweet_vector(x, model))
vector_df = pd.DataFrame(df['tweet_vector'].tolist())
vector_df.to_csv("word2vec_vectors.csv", index=False)

### Random Forest Model;

In [9]:
df_vectors = pd.read_csv("word2vec_vectors.csv")

df_original = pd.read_csv("t_dataset.csv")

y = df_original['Sarcasm'].apply(lambda x: 1 if x == 'yes' else 0)
X = df_vectors.values  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.695142378559464
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.53      0.59       250
           1       0.71      0.81      0.76       347

    accuracy                           0.70       597
   macro avg       0.69      0.67      0.67       597
weighted avg       0.69      0.70      0.69       597

