<a href="https://colab.research.google.com/github/lokeshshekapuram/Sarcasm_Detection/blob/main/Source_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Required Libraries


In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

Loading data


In [2]:
def load_data(file_path):
    """Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding (for example cp1252 on Windows).
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

Training Linear SVC Model


In [3]:
def train_models(df):
    """Fit a bag-of-words / TF-IDF / LinearSVC pipeline and print hold-out metrics.

    Args:
        df: DataFrame with the columns 'context', 'utterance' and 'sarcasm'.

    Returns:
        The fitted ``Pipeline``; it accepts raw strings in ``predict``.
    """
    def _as_text(value):
        # A context stored as a list of turns is collapsed to one string;
        # any other value is passed through untouched.
        return ' '.join(value) if isinstance(value, list) else value

    # One text feature per row: the context followed by the utterance
    X = df['context'].apply(_as_text) + ' ' + df['utterance']
    y = df['sarcasm'].astype(int)

    # 80/20 split with a fixed seed so the reported numbers are repeatable
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    steps = [
        ('vect', CountVectorizer()),    # text -> token counts
        ('tfidf', TfidfTransformer()),  # counts -> TF-IDF weights
        ('clf', LinearSVC()),           # linear support vector classifier
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out rows
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")

    return pipeline



In [4]:
# Predict sarcasm in a new sentence
def predict_sarcasm(sentence, model):
    """Return the model's prediction for a single sentence.

    The sentence is wrapped in a one-element list because ``model.predict``
    is called with a batch; the first (only) prediction is returned.
    """
    return model.predict([sentence])[0]

ACCURACY CALCULATION

In [5]:
# Specify the correct file path to your JSON data file
# NOTE(review): this is an absolute path on the original author's machine;
# other cells in this notebook open 'csvjson.json' by relative path instead —
# confirm which location should be used before re-running.
file_path = r'C:\Users\ASUS\Downloads\Sarcasm_Detection-main\Sarcasm_Detection-main\csvjson.json'

# Load JSON data and convert to DataFrame
data = load_data(file_path)
df = pd.DataFrame(data)  # No transpose needed

# Train models using the DataFrame; train_models prints accuracy and F1 on
# its hold-out split and returns the fitted pipeline used by the cells below
trained_model = train_models(df)

Accuracy: 0.88
F1 Score: 0.93


In [6]:
# Example: classify one hand-written sentence with the trained pipeline
input_sentence = "Since it's not bee season, you can have my epinephrine."
is_sarcastic = predict_sarcasm(input_sentence, trained_model)

# A truthy prediction is reported as sarcastic
print("sarcastic." if is_sarcastic else "not sarcastic.")

sarcastic.


In [7]:
# Example: classify a longer multi-clause sentence with the trained pipeline
input_sentence = "I think I'm gonna go. Thank you for the burrito and the pork rinds and the 20-minute lecture on why monster trucks are better than regular trucks."
is_sarcastic = predict_sarcasm(input_sentence, trained_model)

# A truthy prediction is reported as sarcastic
print("sarcastic." if is_sarcastic else "not sarcastic.")

sarcastic.


This approach relies only on text vectorization and a classifier trained on the dataset above, yet it can also classify sentences that do not appear in the dataset. For example:

In [8]:
# Example: classify a short sentence with the trained pipeline
input_sentence = "Oh, this is exactly what I need today."
is_sarcastic = predict_sarcasm(input_sentence, trained_model)

# A truthy prediction is reported as sarcastic
print("sarcastic." if is_sarcastic else "not sarcastic.")

sarcastic.


In [9]:
# Example: classify a plainly worded statement with the trained pipeline
input_sentence = "I am a good student.I really like to submit assignments on time"
is_sarcastic = predict_sarcasm(input_sentence, trained_model)

# A truthy prediction is reported as sarcastic
print("sarcastic." if is_sarcastic else "not sarcastic.")

sarcastic.


SVM

In [10]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [11]:

# Step 1: Read JSON file and extract relevant information
def read_json_file(data):
    """Read a JSON file and return its parsed content.

    Args:
        data: Path of the JSON file to read. (The parameter keeps its
            original name for existing callers even though it holds a path.)

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding; the path argument is no longer
    # rebound to the parsed content.
    with open(data, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the dataset; the relative path is resolved against the current working directory
data = read_json_file('csvjson.json')


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `data` is expected to be a list of dictionaries, one per example.
# Pull the text and the label of every example into parallel lists.
utterances = [item['utterance'] for item in data]
labels = [item['sarcasm'] for item in data]

# Turn the utterances into TF-IDF vectors; max_features caps the vocabulary size
vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X = vectorizer.fit_transform(utterances)


In [13]:
# Hold out 20% of the TF-IDF rows as a test set (fixed seed for repeatability).
# NOTE(review): the vectorizer was fitted on all utterances before this split,
# so its vocabulary and IDF statistics also reflect the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Fit a support vector classifier with a linear kernel on the training split
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Evaluate the model on the held-out split (per-class precision/recall/F1)
predictions = svm_classifier.predict(X_test)
print(classification_report(y_test, predictions))

from sklearn.metrics import accuracy_score

# Overall accuracy on the held-out split
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.88      1.00      0.93         7

    accuracy                           0.88         8
   macro avg       0.44      0.50      0.47         8
weighted avg       0.77      0.88      0.82         8

Accuracy: 0.875


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest and Logistic Regression

In [23]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score

Load and preprocess the data

In [15]:

# Step 1: Load and preprocess the data
def load_data(file_path):
    """Load the JSON dataset stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, exactly as returned by ``json.load``.
    """
    # JSON text is read as UTF-8 explicitly; relying on the platform default
    # encoding can mis-decode non-ASCII characters (e.g. on Windows).
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def preprocess_data(data):
    """Build model inputs and labels from the parsed dataset.

    Args:
        data: Either a mapping of ``key -> record`` or a plain sequence of
            records. Each record needs the keys 'context', 'utterance' and
            'sarcasm'.

    Returns:
        A tuple ``(X, y)``: ``X`` is a list with one string per record
        (context followed by the utterance) and ``y`` is a NumPy array of
        integer labels.
    """
    # Mappings contribute their values; anything else is iterated directly,
    # which matches the list-of-records layout used elsewhere in the notebook.
    if hasattr(data, 'items'):
        records = [item for _, item in data.items()]
    else:
        records = data

    X = []
    y = []
    for item in records:
        context = item['context']
        # A context that is already one string is used as is; joining it
        # would insert a space between every character.
        context_text = context if isinstance(context, str) else " ".join(context)
        X.append(context_text + " " + item['utterance'])
        y.append(int(item['sarcasm']))
    return X, np.array(y)

Extracting features using TF-IDF

In [41]:
def extract_features(X_train, X_val, max_features=1000):
    """
    Transform text data into TF-IDF features.

    The vectorizer is fitted on the training texts only and then applied to
    the validation texts, so validation data does not influence the
    vocabulary or the IDF weights.

    Args:
        X_train: Iterable of training texts.
        X_val: Iterable of validation texts.
        max_features: Upper bound on the vocabulary size passed to
            ``TfidfVectorizer``. Defaults to 1000, the value that was
            previously hard-coded.

    Returns:
        A tuple ``(X_train_tfidf, X_val_tfidf)`` of TF-IDF matrices.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    return X_train_tfidf, X_val_tfidf


In [45]:
def train_and_evaluate_classifier(X_train, y_train, X_val, y_val, classifier='random_forest'):
    """
    Fit the requested classifier and print its validation report.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        classifier: 'random_forest' or 'logistic_regression'.

    Raises:
        ValueError: If ``classifier`` is not one of the supported names.
    """
    # Reject unknown names up front, before any estimator is built
    if classifier not in ('random_forest', 'logistic_regression'):
        raise ValueError("Invalid classifier specified.")

    if classifier == 'random_forest':
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
    else:
        clf = LogisticRegression(max_iter=1000, random_state=42)

    # Fit on the training split, then predict the validation split
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # Report per-class precision / recall / F1
    print(f"Results for {classifier}:")
    print(classification_report(y_val, y_pred))


Loading and preprocessing data

In [43]:
def preprocess_data(data):
    """Turn a list of records into parallel lists of texts and integer labels.

    Each text is the record's context entries joined by spaces, followed by
    a space and the utterance; each label is ``int(record['sarcasm'])``.
    """
    # Build (text, label) pairs record by record, then split them into two lists
    pairs = [
        (" ".join(record['context']) + " " + record['utterance'], int(record['sarcasm']))
        for record in data
    ]
    X = [text for text, _ in pairs]
    y = [label for _, label in pairs]
    return X, y


Training and Evaluating Classifiers

Random Forest


In [47]:
# File path to your JSON dataset
# NOTE(review): absolute path on the original author's machine; other cells
# open 'csvjson.json' by relative path — confirm the intended location.
file_path = r'C:\Users\ASUS\Downloads\Sarcasm_Detection-main\Sarcasm_Detection-main\csvjson.json'

# Step 1: Load data
data = load_data(file_path)

# Step 2: Preprocess data into texts (context + utterance) and integer labels
X, y = preprocess_data(data)

# Step 3: Split data into train and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Extract features using TF-IDF (fitted on the training texts only)
X_train_tfidf, X_val_tfidf = extract_features(X_train, X_val)

# Step 5: Train and evaluate classifiers on the same split and features
print("Using Random Forest Classifier:")
train_and_evaluate_classifier(X_train_tfidf, y_train, X_val_tfidf, y_val, classifier='random_forest')

print("\nUsing Logistic Regression Classifier:")
train_and_evaluate_classifier(X_train_tfidf, y_train, X_val_tfidf, y_val, classifier='logistic_regression')


Using Random Forest Classifier:
Results for random_forest:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.88      1.00      0.93         7

    accuracy                           0.88         8
   macro avg       0.44      0.50      0.47         8
weighted avg       0.77      0.88      0.82         8


Using Logistic Regression Classifier:
Results for logistic_regression:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.88      1.00      0.93         7

    accuracy                           0.88         8
   macro avg       0.44      0.50      0.47         8
weighted avg       0.77      0.88      0.82         8



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


BERT

In [2]:
import json
import numpy as np
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, classification_report

Loading and Preprocessing data


In [8]:

# Define the device once for the BERT cells: CUDA when torch reports it as
# available, otherwise the CPU. The training and evaluation functions below
# read this module-level name.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load data
def load_data(file_path):
    """Parse the JSON file at ``file_path`` and return the result.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, exactly as returned by ``json.load``.
    """
    # Read as UTF-8 explicitly instead of the platform's default encoding,
    # which may not be UTF-8 (e.g. on Windows).
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Preprocess data
def preprocess_data(data):
    """Collect utterances and integer sarcasm labels from a list of records.

    Only the 'utterance' text is used as model input here; the label is
    ``int(record['sarcasm'])``.
    """
    # First pass: the texts
    X = []
    for record in data:
        X.append(record['utterance'])

    # Second pass: the labels, converted to int
    y = []
    for record in data:
        y.append(int(record['sarcasm']))

    return X, y



BERT Implementation

In [9]:

# Tokenize data using BERT tokenizer
def tokenize_data(texts, labels, tokenizer, max_length):
    """Encode every text with the tokenizer and stack the results into tensors.

    Args:
        texts: Iterable of strings to encode.
        labels: Labels to convert with ``torch.tensor``.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Length every sequence is padded / truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)`` of tensors; the
        first two are the per-text encodings concatenated along dim 0.
    """
    def _encode(text):
        # One text -> dict holding 'input_ids' and 'attention_mask' tensors
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    encodings = [_encode(text) for text in texts]
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return input_ids, attention_masks, torch.tensor(labels)

# Define BERT model and optimizer
def initialize_model():
    """Create a fresh BERT sequence classifier and its optimizer.

    Returns:
        A tuple ``(model, optimizer)``: a two-label
        ``BertForSequenceClassification`` loaded from 'bert-base-uncased'
        and an AdamW optimizer over its parameters (learning rate 2e-5).
    """
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False
    )
    # The AdamW class shipped with transformers is deprecated in favour of
    # torch.optim.AdamW. eps and weight_decay are spelled out to match the
    # defaults of the transformers implementation (1e-6 and 0.0), because
    # torch's own defaults differ.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
    return model, optimizer

# Fine-tune BERT model
def fine_tune_BERT_model(model, optimizer, train_dataloader, val_dataloader, epochs=3):
    """Fine-tune the model on the training batches and print the mean loss per epoch.

    Args:
        model: Model called with input ids, attention mask and labels; its
            output must expose ``.loss``.
        optimizer: Optimizer stepping the model's parameters.
        train_dataloader: Iterable of batches ``(input_ids, mask, labels)``.
        val_dataloader: Accepted for interface compatibility; not used here.
        epochs: Number of passes over ``train_dataloader``.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        for batch in train_dataloader:
            # Move the three tensors of the batch to the module-level device
            b_input_ids, b_input_mask, b_labels = (batch[i].to(device) for i in range(3))

            model.zero_grad()
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            batch_loss = outputs.loss
            running_loss += batch_loss.item()

            batch_loss.backward()
            # Clip the gradient norm to 1.0 before the parameter update
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        avg_train_loss = running_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1}/{epochs}, Average training loss: {avg_train_loss}')


Function for Model Evaluation

In [10]:

# Evaluate model
def evaluate_model(model, val_dataloader):
    """Run the model over the validation batches and print accuracy plus a class report.

    Args:
        model: Model called with input ids and attention mask; its output
            must expose ``.logits``.
        val_dataloader: Iterable of batches ``(input_ids, mask, labels)``.
    """
    model.eval()
    val_preds, val_labels = [], []
    for batch in val_dataloader:
        # Move the three tensors of the batch to the module-level device
        input_ids, input_mask, gold = (batch[i].to(device) for i in range(3))

        # Forward pass only; no gradients are tracked
        with torch.no_grad():
            outputs = model(input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask)

        # The predicted class is the index of the largest logit
        batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy().tolist()
        batch_labels = gold.cpu().numpy().tolist()
        val_preds.extend(batch_preds)
        val_labels.extend(batch_labels)

    accuracy = accuracy_score(val_labels, val_preds)
    print(f'Validation accuracy: {accuracy }')
    print(classification_report(val_labels, val_preds))


In [16]:
# Main function
def main():
    """Run stratified 5-fold fine-tuning and evaluation of BERT on the dataset.

    For every fold a fresh model and optimizer are created, fine-tuned on the
    training part and evaluated on the validation part.
    """
    # Settings for the whole run
    max_length = 128
    num_folds = 5
    batch_size = 32

    # Load the records and reduce them to texts and labels
    data = load_data('csvjson.json')
    X, y = preprocess_data(data)

    # Encode all texts once; each fold then slices these tensors
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    input_ids, attention_masks, labels = tokenize_data(X, y, tokenizer, max_length)

    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(skf.split(input_ids, labels), start=1):
        print(f'Fold {fold}/{num_folds}')

        # Datasets for this fold
        train_data = TensorDataset(
            input_ids[train_index], attention_masks[train_index], labels[train_index]
        )
        val_data = TensorDataset(
            input_ids[val_index], attention_masks[val_index], labels[val_index]
        )

        # Training batches are drawn in random order, validation batches in order
        train_dataloader = DataLoader(
            train_data, sampler=RandomSampler(train_data), batch_size=batch_size
        )
        val_dataloader = DataLoader(
            val_data, sampler=SequentialSampler(val_data), batch_size=batch_size
        )

        # Start every fold from a freshly loaded model
        model, optimizer = initialize_model()
        model.to(device)

        fine_tune_BERT_model(model, optimizer, train_dataloader, val_dataloader)
        evaluate_model(model, val_dataloader)

if __name__ == "__main__":
    main()


ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.

BERT AND GLOVE

NameError: name 'python' is not defined

Importing Required Libraries


In [34]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score


ModuleNotFoundError: No module named 'transformers'

Load and Preprocess the Dataset

In [None]:
class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        data: DataFrame with the columns 'context', 'utterance' and 'sarcasm'.
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors.
        max_length: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded text and label of row ``idx`` as a dict of tensors."""
        # Look the row up once instead of once per field
        row = self.data.iloc[idx]

        context = row['context']
        if isinstance(context, (list, tuple)):
            # A multi-turn context is joined with spaces; str() on the list
            # would feed brackets, quotes and commas to the tokenizer.
            context = ' '.join(str(turn) for turn in context)
        else:
            context = str(context)

        text = context + ' ' + str(row['utterance'])
        label = row['sarcasm']
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        # The tokenizer returns tensors with a leading batch dimension of 1;
        # flatten so that a DataLoader can stack items into (batch, max_length).
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }
        return item


In [None]:

def load_data(file_path):
    """Return the parsed content of the JSON file at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # Explicit UTF-8 keeps decoding independent of the platform's default
    # text encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

class SarcasmClassifier(torch.nn.Module):
    """BERT encoder followed by dropout and a linear head that emits one logit."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = torch.nn.Dropout(0.1)
        # One output unit: a single raw logit per example
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from BERT's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.linear(self.dropout(encoded.pooler_output))


In [None]:

# NOTE(review): absolute path on the original author's machine; other cells
# open 'csvjson.json' by relative path — confirm the intended location.
file_path = r'C:\Users\ASUS\Downloads\Sarcasm_Detection-main\Sarcasm_Detection-main\csvjson.json'
data = load_data(file_path)
# A list of records already yields one row per example with the fields as
# columns, which is what SarcasmDataset indexes. Transposing is only needed
# when the JSON is a mapping of id -> record; transposing a list would move
# 'context' / 'utterance' / 'sarcasm' onto the index and break row lookups.
df = pd.DataFrame(data) if isinstance(data, list) else pd.DataFrame(data).T
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = SarcasmDataset(df, tokenizer)
# 80/20 split with a fixed seed
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)


NameError: name 'BertTokenizer' is not defined

Training

In [None]:

def train_model(model, train_loader, optimizer, criterion, device):
    """Train the model for one pass over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one logit per example.
        train_loader: Iterable of batches; each batch is a dict with the keys
            'input_ids', 'attention_mask' and 'label'.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking ``(logits, float_labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses as a float.
    """
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].float().to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # Flatten to one logit per example. A bare squeeze() would also drop
        # the batch dimension of a one-example batch, leaving a 0-d tensor
        # whose size no longer matches the labels.
        loss = criterion(outputs.view(-1), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


Evaluation

In [None]:

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SarcasmClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# The classifier emits one raw logit per example, so the sigmoid is applied
# inside the loss rather than in the model
criterion = torch.nn.BCEWithLogitsLoss()
# train_data is the training part produced by the train/test split cell
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# Five passes over the training loader; train_model returns the mean batch loss
for epoch in range(5):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}')

def evaluate_model(model, test_loader, device):
    """Score the single-logit classifier on ``test_loader`` and print accuracy and F1.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'label'.
        device: Device the input tensors are moved to.
    """
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            ids_on_device = batch['input_ids'].to(device)
            mask_on_device = batch['attention_mask'].to(device)
            gold = batch['label'].float()

            logits = model(ids_on_device, mask_on_device)
            # Sigmoid probability above 0.5 counts as the positive class
            hard_labels = (torch.sigmoid(logits) > 0.5).int()

            predictions.extend(hard_labels.cpu().numpy())
            true_labels.extend(gold.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    print(f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}')

# Evaluate on the held-out part of the split; order is kept fixed (no shuffling)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)
evaluate_model(model, test_loader, device)