In [1]:
# Cell 1: Load and preprocess the dataset
import pandas as pd

# Load the dataset
file_path = 'balanced_dataset_20000.csv'
data = pd.read_csv(file_path)

# Shuffle the rows. The shuffle is seeded so that restarting the kernel and
# re-running the notebook yields the same row order; without a seed the
# (seeded) train/test split further down would still pick different rows on
# every run and the reported metrics could not be reproduced.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Print dataset info: row count, (rows, columns) shape and per-column null counts
print(f'Number of samples: {data.shape[0]}')
print(f'Shape of the dataset: {data.shape}')
print(f'Null values:\n{data.isnull().sum()}')


Number of samples: 20000
Shape of the dataset: (20000, 2)
Null values:
label      0
comment    0
dtype: int64


In [2]:
# Cell 2: Preprocessing and Vectorization
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Features are the comment texts; targets are the label column
X, y = data['comment'], data['label']

# Hold out 20% of the rows for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Cast both partitions to str so that any non-string entries can be vectorized
X_train, X_test = X_train.astype(str), X_test.astype(str)

# TF-IDF over unigrams and bigrams with English stop words removed and at most
# 20000 features. The vocabulary is fitted on the training partition only and
# then re-used unchanged for the test partition.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=20000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [3]:
# Cell 3: Model Training and Evaluation
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Fit a linear-kernel SVC on the TF-IDF training matrix
svc_model = SVC(C=1, kernel='linear', random_state=42)
svc_model.fit(X_train_tfidf, y_train)

# Label the held-out comments
y_pred = svc_model.predict(X_test_tfidf)

# Score the predictions against the true test labels
precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report the scalar metrics first, then the matrix and the per-class report
print(
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    f'Confusion Matrix:\n{conf_matrix}',
    f'Classification Report:\n{class_report}',
    sep='\n',
)


Precision: 0.6471838469713072
Recall: 0.5938566552901023
F1 Score: 0.6193745232646835
Confusion Matrix:
[[1285  664]
 [ 833 1218]]
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.66      0.63      1949
           1       0.65      0.59      0.62      2051

    accuracy                           0.63      4000
   macro avg       0.63      0.63      0.63      4000
weighted avg       0.63      0.63      0.63      4000



In [7]:
# Hand-written example comments; the trailing comment on each line records the
# label the author expects for it.
input_texts = [
    "I just love waiting in long lines. It's my favorite pastime.",  # Sarcastic
    "The sky is clear and blue today.",  # Non-sarcastic
    "Wow, another rainy day. Just what I needed.",  # Sarcastic
    "The project meeting went well, and we made a lot of progress.",  # Non-sarcastic
    "I'm so excited for Monday morning meetings.",  # Sarcastic
    "Had a great time at the family reunion.",  # Non-sarcastic
    "The food here is amazing, said no one ever.",  # Sarcastic
    "It's nice to see everyone working together.",  # Non-sarcastic
    "I can't wait to do my taxes. It's so much fun.",  # Sarcastic
    "Thanks for helping me with the presentation.",  # Non-sarcastic
]

# Project the examples into the TF-IDF space that was fitted on the training data
input_texts_tfidf = tfidf_vectorizer.transform(input_texts)

# Classify them with the trained SVC
predictions = svc_model.predict(input_texts_tfidf)

# Show every comment next to its predicted class (1 is rendered as "Sarcastic")
for i, (text, prediction) in enumerate(zip(input_texts, predictions)):
    if prediction == 1:
        label = "Sarcastic"
    else:
        label = "Non-sarcastic"
    print(f"Comment {i + 1}: {text} - {label}")


Comment 1: I just love waiting in long lines. It's my favorite pastime. - Non-sarcastic
Comment 2: The sky is clear and blue today. - Sarcastic
Comment 3: Wow, another rainy day. Just what I needed. - Sarcastic
Comment 4: The project meeting went well, and we made a lot of progress. - Non-sarcastic
Comment 5: I'm so excited for Monday morning meetings. - Non-sarcastic
Comment 6: Had a great time at the family reunion. - Non-sarcastic
Comment 7: The food here is amazing, said no one ever. - Sarcastic
Comment 8: It's nice to see everyone working together. - Sarcastic
Comment 9: I can't wait to do my taxes. It's so much fun. - Sarcastic
Comment 10: Thanks for helping me with the presentation. - Sarcastic


# Improved Preprocessing and Model Training

In [1]:
# Improved Preprocessing and Model Training

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load the dataset
file_path = 'balanced_dataset_20000.csv'
data = pd.read_csv(file_path)

# Shuffle the rows with a fixed seed. An unseeded shuffle changes the row
# order on every run, so the seeded split below would select different rows
# each time and the grid-search results would not be reproducible.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Define features and labels; comments are cast to str so that any non-string
# entries can be vectorized.
X = data['comment'].astype(str)
y = data['label']

# Split the dataset: 80% for training / cross-validation, 20% held out for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [2]:
# Build the two pipeline stages separately, then chain them. Keeping the
# vectorizer inside the pipeline means it is re-fitted on the training portion
# of every cross-validation fold.
tfidf_step = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=20000,
)
svc_step = SVC(random_state=42)
pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('svc', svc_step)])

In [3]:
# Define parameter grid for GridSearchCV.
# The search space is split into one sub-grid per kernel: `gamma` has no effect
# on the linear kernel, so crossing it with kernel='linear' would only repeat
# identical (and expensive) fits. GridSearchCV accepts a list of dicts and
# explores each sub-grid independently.
param_grid = [
    {
        'svc__kernel': ['linear'],
        'svc__C': [0.1, 1, 10],
    },
    {
        'svc__kernel': ['rbf'],
        'svc__C': [0.1, 1, 10],
        'svc__gamma': ['scale', 'auto'],
    },
]

In [4]:
# Cross-validated (5 folds) exhaustive search over param_grid, ranking the
# candidates by the 'f1' scorer and using every available core.
# NOTE(review): the recorded run selected a model with 0.91 recall but only
# 0.56 accuracy; on a balanced dataset a scorer such as 'f1_macro' or
# 'accuracy' may be a better selection criterion - confirm intent.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [5]:
# Predict on test set with the best model.
# `best_estimator_` is the whole fitted pipeline (TF-IDF + SVC), so raw text
# can be passed straight to predict() without vectorizing it first.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


In [6]:
# Score the tuned pipeline's test-set predictions against the true labels
precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Scalar metrics first, then the confusion matrix and the per-class report
print(
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    f'Confusion Matrix:\n{conf_matrix}',
    f'Classification Report:\n{class_report}',
    sep='\n',
)

Precision: 0.5322867298578199
Recall: 0.9075757575757576
F1 Score: 0.6710231516056758
Confusion Matrix:
[[ 441 1579]
 [ 183 1797]]
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.22      0.33      2020
           1       0.53      0.91      0.67      1980

    accuracy                           0.56      4000
   macro avg       0.62      0.56      0.50      4000
weighted avg       0.62      0.56      0.50      4000



In [7]:
# Example comments (mix of sarcastic and non-sarcastic); the trailing comment on
# each line records the label the author expects for it.
input_texts = [
    "I just love waiting in long lines. It's my favorite pastime.",  # Sarcastic
    "The sky is clear and blue today.",  # Non-sarcastic
    "Wow, another rainy day. Just what I needed.",  # Sarcastic
    "The project meeting went well, and we made a lot of progress.",  # Non-sarcastic
    "I'm so excited for Monday morning meetings.",  # Sarcastic
    "Had a great time at the family reunion.",  # Non-sarcastic
    "The food here is amazing, said no one ever.",  # Sarcastic
    "It's nice to see everyone working together.",  # Non-sarcastic
    "I can't wait to do my taxes. It's so much fun.",  # Sarcastic
    "Thanks for helping me with the presentation.",  # Non-sarcastic
]

# Pull the two fitted stages out of the tuned pipeline and run them by hand:
# first the TF-IDF transform, then the SVC prediction.
fitted_steps = best_model.named_steps
input_texts_tfidf = fitted_steps['tfidf'].transform(input_texts)
predictions = fitted_steps['svc'].predict(input_texts_tfidf)

# Show every comment next to its predicted class (1 is rendered as "Sarcastic")
for i, (text, prediction) in enumerate(zip(input_texts, predictions)):
    if prediction == 1:
        label = "Sarcastic"
    else:
        label = "Non-sarcastic"
    print(f"Comment {i + 1}: {text} - {label}")


Comment 1: I just love waiting in long lines. It's my favorite pastime. - Non-sarcastic
Comment 2: The sky is clear and blue today. - Sarcastic
Comment 3: Wow, another rainy day. Just what I needed. - Sarcastic
Comment 4: The project meeting went well, and we made a lot of progress. - Sarcastic
Comment 5: I'm so excited for Monday morning meetings. - Sarcastic
Comment 6: Had a great time at the family reunion. - Sarcastic
Comment 7: The food here is amazing, said no one ever. - Sarcastic
Comment 8: It's nice to see everyone working together. - Non-sarcastic
Comment 9: I can't wait to do my taxes. It's so much fun. - Sarcastic
Comment 10: Thanks for helping me with the presentation. - Non-sarcastic


# Using SVM with TF-IDF and (mock) sentiment features

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [2]:
# Function to preprocess text (example: lowercase conversion)
def preprocess_text(text):
    """Return `text` converted to lower case.

    Args:
        text: The comment to normalise. Normally a str; any other value
            (for example the float NaN pandas uses for an empty CSV cell)
            is converted with ``str()`` first instead of raising
            ``AttributeError``, mirroring the ``.astype(str)`` handling used
            by the other experiments in this notebook.

    Returns:
        str: The lower-cased text.
    """
    processed_text = str(text).lower()
    return processed_text

# Function to compute mock sentiment scores (replace with actual sentiment analysis if available)
def compute_sentiment_scores(text):
    """Placeholder scorer that returns a fixed (positive, negative) pair.

    `text` is accepted but never inspected, so every comment receives the same
    two values. Features built from this function are therefore constant
    across samples until a real sentiment analyser is substituted.

    Returns:
        tuple: ``(0.5, 0.2)`` as (positive_score, negative_score).
    """
    positive_score = 0.5
    negative_score = 0.2
    return positive_score, negative_score

# Function to add sentiment features to TF-IDF vectors
def add_sentiment_features(X_tfidf, comments):
    """Append two sentiment columns to a TF-IDF feature matrix.

    Args:
        X_tfidf: Feature matrix with one row per comment. May be a sparse
            matrix (anything exposing ``.toarray()``) or an array-like that is
            already dense.
        comments: Iterable of the comment texts, in the same order as the rows
            of `X_tfidf`. Each one is passed to ``compute_sentiment_scores``.

    Returns:
        numpy.ndarray: Dense array whose columns are the TF-IDF columns
        followed by the positive and negative sentiment scores.

    Raises:
        ValueError: If the number of comments differs from the number of rows
            in `X_tfidf`.

    Note:
        The result is dense, so memory use grows with rows x columns of the
        TF-IDF matrix.
    """
    # One (positive, negative) row per comment. reshape(-1, 2) keeps the array
    # two-dimensional even when `comments` is empty, so the concatenation
    # below still works for zero rows.
    sentiment_features_array = np.array(
        [compute_sentiment_scores(comment) for comment in comments],
        dtype=float,
    ).reshape(-1, 2)

    # Sparse matrices are densified; inputs that are already dense are used as-is.
    if hasattr(X_tfidf, "toarray"):
        tfidf_dense = X_tfidf.toarray()
    else:
        tfidf_dense = np.asarray(X_tfidf)

    if tfidf_dense.shape[0] != sentiment_features_array.shape[0]:
        raise ValueError(
            f"X_tfidf has {tfidf_dense.shape[0]} rows but "
            f"{sentiment_features_array.shape[0]} comments were given"
        )

    # Concatenate sentiment features with TF-IDF vectors (column-wise)
    X_combined = np.concatenate((tfidf_dense, sentiment_features_array), axis=1)

    return X_combined


In [3]:
# Read the CSV that holds the comments and their labels
dataset = pd.read_csv('balanced_dataset_20000.csv')

# Normalise every comment with preprocess_text (overwrites the column in place)
dataset['comment'] = dataset['comment'].map(preprocess_text)

# Seeded 80/20 split into training and test partitions
from sklearn.model_selection import train_test_split

comments, labels = dataset['comment'], dataset['label']
X_train, X_test, y_train, y_test = train_test_split(
    comments, labels, test_size=0.2, random_state=42
)


In [4]:
# Initialize TF-IDF vectorizer: unigrams and bigrams, at most 10000 features.
# Unlike the earlier experiments, no stop-word list is passed here.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# Fit vectorizer on training data only, so nothing from the test set
# influences the learned vocabulary or IDF weights
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform test data using fitted vectorizer (no re-fitting)
X_test_tfidf = vectorizer.transform(X_test)


In [5]:
# Add sentiment features to TF-IDF vectors for training set.
# add_sentiment_features calls .toarray() on the TF-IDF matrix, so the result
# is a dense array with two extra columns.
# NOTE(review): assuming the 20000 rows the file name suggests, the training
# array is about 16000 x 10002 values - on the order of a gigabyte if stored
# as float64; confirm this fits in memory.
X_train_combined = add_sentiment_features(X_train_tfidf, X_train)

# Add sentiment features to TF-IDF vectors for test set (same column layout)
X_test_combined = add_sentiment_features(X_test_tfidf, X_test)


In [None]:
# Initialize SVM classifier (RBF kernel, C=1.0, gamma='scale', fixed seed)
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# Train the SVM model on combined feature vectors.
# NOTE(review): this cell has no execution count or output in the saved
# notebook, so it is unclear whether training ever finished; fitting an RBF
# SVC on the dense combined array may be slow - confirm.
svm_model.fit(X_train_combined, y_train)


In [None]:
# Example comments to classify with the sentiment-augmented SVM
input_texts = [
    "I just love waiting in long lines. It's my favorite pastime.",
    "The sky is clear and blue today.",
    "Wow, another rainy day. Just what I needed.",
    "The project meeting went well, and we made a lot of progress.",
    "I'm so excited for Monday morning meetings.",
    "Had a great time at the family reunion.",
    "The food here is amazing, said no one ever.",
    "It's nice to see everyone working together.",
    "I can't wait to do my taxes. It's so much fun.",
    "Thanks for helping me with the presentation."
]

# Apply the same normalisation that was used on the training comments
input_texts_processed = list(map(preprocess_text, input_texts))

# Re-use the fitted vectorizer, then append the sentiment columns so the
# feature layout matches what the SVM was trained on
X_input_tfidf = vectorizer.transform(input_texts_processed)
X_input_combined = add_sentiment_features(X_input_tfidf, input_texts_processed)

# Classify the examples
y_pred_input = svm_model.predict(X_input_combined)

# Translate the numeric classes into readable names
label_map = {1: "Sarcastic", 0: "Non-sarcastic"}
predicted_labels = [label_map[predicted_class] for predicted_class in y_pred_input]

# Show the original (un-normalised) text next to its predicted class
for text, label in zip(input_texts, predicted_labels):
    print(f"Comment: {text} - Predicted: {label}")


# DistilBERT (fine-tuning a transformer classifier)

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import TextClassificationPipeline
from sklearn.metrics import classification_report

# Read the comment/label CSV into a DataFrame
file_path = 'balanced_dataset_20000.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preprocess the text data
def preprocess_text(text):
    """Return `text` converted to lower case.

    Non-string values (for example the float NaN pandas uses for an empty CSV
    cell) are converted with ``str()`` first, so applying this function to a
    whole column cannot fail with ``AttributeError`` on a single bad row.
    """
    return str(text).lower()

# Normalise every comment with preprocess_text (overwrites the column in place)
data['comment'] = data['comment'].map(preprocess_text)

# Seeded 80/20 split into training and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    data['comment'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [2]:
# Tokenizer that matches the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both partitions are tokenized with identical settings: truncate to at most
# 512 tokens and pad the sequences within each call.
tokenizer_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(X_train.tolist(), **tokenizer_options)
test_encodings = tokenizer(X_test.tolist(), **tokenizer_options)

In [3]:
# Convert to torch dataset
class SarcasmDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence, and
    `labels` is a sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of sample `idx` to a tensor ...
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        # ... and attach the target under the 'labels' key
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap the tokenized partitions; labels are converted to plain Python lists so
# that SarcasmDataset can index them by position.
train_dataset = SarcasmDataset(train_encodings, y_train.tolist())
test_dataset = SarcasmDataset(test_encodings, y_test.tolist())

In [4]:
# Load the DistilBERT model with a two-class classification head.
# As the warning printed by this cell states, the classifier weights are not
# part of the checkpoint and are newly initialized, so the model has to be
# trained before its predictions are meaningful.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Hyper-parameters and bookkeeping for the Trainer
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule: two epochs, 500 warm-up steps, evaluate after every epoch
    num_train_epochs=2,
    warmup_steps=500,
    eval_strategy="epoch",
    # Batch sizes per device and weight decay
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
)


In [6]:
# Wire the model, its training configuration and the two datasets together.
# The held-out test partition doubles as the evaluation set during training.
trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
}
trainer = Trainer(**trainer_options)

In [None]:
# Train the model (fine-tunes `model` in place using `training_args`).
# NOTE(review): the saved output below shows only the table header, so no
# completed epoch was recorded for this run - confirm training finished.
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
# Run the trained model over the test set and take the highest-scoring class
# along the last axis of the returned predictions
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=-1)

# Per-class precision / recall / F1 with readable class names
report = classification_report(
    y_test,
    preds,
    target_names=['Non-sarcastic', 'Sarcastic'],
)
print(report)

In [None]:
# Predict new input texts
input_texts = [
    "I just love waiting in long lines. It's my favorite pastime.",
    "The sky is clear and blue today.",
    "Wow, another rainy day. Just what I needed.",
    "The project meeting went well, and we made a lot of progress.",
    "I'm so excited for Monday morning meetings.",
    "Had a great time at the family reunion.",
    "The food here is amazing, said no one ever.",
    "It's nice to see everyone working together.",
    "I can't wait to do my taxes. It's so much fun.",
    "Thanks for helping me with the presentation.",
]

# Readable names for the model's class labels. The model was created with
# num_labels=2 and no custom label names, so the pipeline is expected to
# report 'LABEL_0' / 'LABEL_1'; any other label is printed unchanged.
label_names = {"LABEL_0": "Non-sarcastic", "LABEL_1": "Sarcastic"}

# Create a TextClassificationPipeline for easy prediction. The pipeline
# tokenizes the raw strings itself, so no separate tokenizer call is needed.
# top_k=None asks for the scores of every class (it replaces the deprecated
# return_all_scores=True flag).
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=None)

# Get predictions: one list of {'label', 'score'} dicts per input text
predictions = pipeline(input_texts)
for text, prediction in zip(input_texts, predictions):
    # Select the class with the highest score explicitly. Reading the first
    # entry is not reliable: with all scores returned, the first entry is not
    # guaranteed to be the top-scoring class.
    best = max(prediction, key=lambda entry: entry['score'])
    label = label_names.get(best['label'], best['label'])
    print(f"Comment: {text} - Predicted: {label}")