# Random Forest model

In [16]:
import ast

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_excel('tokenized Dataset.xlsx')

# Define the feature and target columns
X = ['word_token', 'sent_token', 'subword_token', 'char_data']
Y = 'Sarcasm'
data

Unnamed: 0,cleaned_data,word_token,sent_token,subword_token,char_data,Sentiment,Sarcasm
0,One reviewer mention watch 1 Oz episode youll ...,"['One', 'reviewer', 'mention', 'watch', '1', '...",['One reviewer mention watch 1 Oz episode youl...,"['One', 'Ġreviewer', 'Ġmention', 'Ġwatch', 'Ġ1...","['O', 'n', 'e', ' ', 'r', 'e', 'v', 'i', 'e', ...",positive,not sarcastic
1,wonderful little production. filming technique...,"['wonderful', 'little', 'production', '.', 'fi...","['wonderful little production.', 'filming tech...","['w', 'onder', 'ful', 'Ġlittle', 'Ġproduction'...","['w', 'o', 'n', 'd', 'e', 'r', 'f', 'u', 'l', ...",positive,not sarcastic
2,movie groundbreaking experience! Ive never see...,"['movie', 'groundbreaking', 'experience', '!',...","['movie groundbreaking experience!', 'Ive neve...","['movie', 'Ġground', 'breaking', 'Ġexperience'...","['m', 'o', 'v', 'i', 'e', ' ', 'g', 'r', 'o', ...",positive,sarcastic
3,think wonderful way spend time hot summer week...,"['think', 'wonderful', 'way', 'spend', 'time',...",['think wonderful way spend time hot summer we...,"['think', 'Ġwonderful', 'Ġway', 'Ġspend', 'Ġti...","['t', 'h', 'i', 'n', 'k', ' ', 'w', 'o', 'n', ...",positive,not sarcastic
4,Basically there family little boy Jake think t...,"['Basically', 'there', 'family', 'little', 'bo...",['Basically there family little boy Jake think...,"['B', 'as', 'ically', 'Ġthere', 'Ġfamily', 'Ġl...","['B', 'a', 's', 'i', 'c', 'a', 'l', 'l', 'y', ...",negative,sarcastic
...,...,...,...,...,...,...,...
6492,movie idea character development muscle less b...,"['movie', 'idea', 'character', 'development', ...",['movie idea character development muscle less...,"['movie', 'Ġidea', 'Ġcharacter', 'Ġdevelopment...","['m', 'o', 'v', 'i', 'e', ' ', 'i', 'd', 'e', ...",negative,sarcastic
6493,guess run budget decent script.,"['guess', 'run', 'budget', 'decent', 'script',...",['guess run budget decent script.'],"['gu', 'ess', 'Ġrun', 'Ġbudget', 'Ġdecent', 'Ġ...","['g', 'u', 'e', 's', 's', ' ', 'r', 'u', 'n', ...",negative,sarcastic
6494,need plot explosion every five minutes?,"['need', 'plot', 'explosion', 'every', 'five',...",['need plot explosion every five minutes?'],"['need', 'Ġplot', 'Ġexplosion', 'Ġevery', 'Ġfi...","['n', 'e', 'e', 'd', ' ', 'p', 'l', 'o', 't', ...",negative,sarcastic
6495,award generic action movie ever made?,"['award', 'generic', 'action', 'movie', 'ever'...",['award generic action movie ever made?'],"['aw', 'ard', 'Ġgeneric', 'Ġaction', 'Ġmovie',...","['a', 'w', 'a', 'r', 'd', ' ', 'g', 'e', 'n', ...",negative,sarcastic


In [17]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data[X],
    data[Y],
    test_size=0.2,
    random_state=42
)

In [19]:
# Function to vectorize reviews
def vectorize_reviews(reviews, vectorizer=None):
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
        vectorized_reviews = vectorizer.fit_transform(reviews)
    else:
        vectorized_reviews = vectorizer.transform(reviews)
    return vectorized_reviews, vectorizer

# Evaluate each tokenization method
# Evaluate each tokenization method
def train_and_evaluate(tokenized_column):
    X_train_tokenized, vectorizer = vectorize_reviews(X_train[tokenized_column])
    X_test_tokenized = vectorize_reviews(X_test[tokenized_column], vectorizer)[0]

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_tokenized, y_train)
    y_pred = model.predict(X_test_tokenized)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    print(f"Evaluation for {tokenized_column}:\n")
    print(f'Accuracy: {accuracy:.2f}')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('Classification Report:')
    print(class_report)

# Evaluate each tokenization method
for token_column in X_train.columns:
    train_and_evaluate(token_column)

Evaluation for word_token:

Accuracy: 0.83
Confusion Matrix:
[[545  59]
 [160 536]]
Classification Report:
               precision    recall  f1-score   support

not sarcastic       0.77      0.90      0.83       604
    sarcastic       0.90      0.77      0.83       696

     accuracy                           0.83      1300
    macro avg       0.84      0.84      0.83      1300
 weighted avg       0.84      0.83      0.83      1300

Evaluation for sent_token:

Accuracy: 0.83
Confusion Matrix:
[[539  65]
 [157 539]]
Classification Report:
               precision    recall  f1-score   support

not sarcastic       0.77      0.89      0.83       604
    sarcastic       0.89      0.77      0.83       696

     accuracy                           0.83      1300
    macro avg       0.83      0.83      0.83      1300
 weighted avg       0.84      0.83      0.83      1300

Evaluation for subword_token:

Accuracy: 0.83
Confusion Matrix:
[[545  59]
 [162 534]]
Classification Report:
          