In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score

# Load the dataset
data = pd.read_csv('balanced_dataset_50000.csv')

# Drop rows with NaN values
data = data.dropna()

# Preprocess the dataset
X = data['comment']
y = data['label']

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to strings to handle potential float values
X_train = X_train.astype(str)
X_test = X_test.astype(str)

# Tokenize and transform sequences using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# Random Forest Model

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Initialize Random Forest model with desired hyperparameters
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)

# Train Random Forest model
rf_model.fit(X_train_tfidf, y_train)

# Evaluate Random Forest model
train_accuracy_rf = accuracy_score(y_train, rf_model.predict(X_train_tfidf))
test_accuracy_rf = accuracy_score(y_test, rf_model.predict(X_test_tfidf))
y_pred_rf = rf_model.predict(X_test_tfidf)
test_precision_rf = precision_score(y_test, y_pred_rf)

rf_results = {
    'train_accuracy': train_accuracy_rf,
    'test_accuracy': test_accuracy_rf,
    'test_precision': test_precision_rf
}

# Print accuracies
print(f"Training Accuracy: {train_accuracy_rf:.4f}")
print(f"Test Accuracy: {test_accuracy_rf:.4f}")
print(f"Test Precision: {test_precision_rf:.4f}")

Training Accuracy: 0.6675
Test Accuracy: 0.6282
Test Precision: 0.6749


#### Parallelization: Ensure that your environment is configured to utilize multiple CPU cores (n_jobs=-1 in RandomForestClassifier) for parallel processing. This can speed up training time significantly, especially on multicore machines.

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Initialize Random Forest model with reduced parameters
rf_model = RandomForestClassifier(n_estimators=50, max_depth=7, random_state=42, n_jobs=-1)

# Subset of data for quicker testing
# Example: X_train_tfidf_sub, y_train_sub, X_test_tfidf_sub, y_test_sub = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model.fit(X_train_tfidf, y_train)

# Evaluate Random Forest model
train_accuracy_rf = accuracy_score(y_train, rf_model.predict(X_train_tfidf))
test_accuracy_rf = accuracy_score(y_test, rf_model.predict(X_test_tfidf))
y_pred_rf = rf_model.predict(X_test_tfidf)
test_precision_rf = precision_score(y_test, y_pred_rf)

# Print accuracies
print(f"Training Accuracy: {train_accuracy_rf:.4f}")
print(f"Test Accuracy: {test_accuracy_rf:.4f}")
print(f"Test Precision: {test_precision_rf:.4f}")


Training Accuracy: 0.6413
Test Accuracy: 0.6140
Test Precision: 0.6666


# Decision tree model

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score

# Initialize Decision Tree model with reduced parameters
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)

# Train Decision Tree model
dt_model.fit(X_train_tfidf, y_train)

# Evaluate Decision Tree model
train_accuracy_dt = accuracy_score(y_train, dt_model.predict(X_train_tfidf))
test_accuracy_dt = accuracy_score(y_test, dt_model.predict(X_test_tfidf))
y_pred_dt = dt_model.predict(X_test_tfidf)
test_precision_dt = precision_score(y_test, y_pred_dt)

dt_results = {
    'train_accuracy': train_accuracy_dt,
    'test_accuracy': test_accuracy_dt,
    'test_precision': test_precision_dt
}

# Print accuracies
print(f"Training Accuracy: {train_accuracy_dt:.4f}")
print(f"Test Accuracy: {test_accuracy_dt:.4f}")
print(f"Test Precision: {test_precision_dt:.4f}")


Training Accuracy: 0.5575
Test Accuracy: 0.5506
Test Precision: 0.8551


# Logistic Regression Model

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score

# Initialize Logistic Regression model
lr_model = LogisticRegression(max_iter=10)

# Train Logistic Regression model
lr_model.fit(X_train_tfidf, y_train)

# Evaluate Logistic Regression model
train_accuracy_lr = accuracy_score(y_train, lr_model.predict(X_train_tfidf))
test_accuracy_lr = accuracy_score(y_test, lr_model.predict(X_test_tfidf))
y_pred_lr = lr_model.predict(X_test_tfidf)
test_precision_lr = precision_score(y_test, y_pred_lr)

# Print accuracies
print(f"Training Accuracy: {train_accuracy_lr:.4f}")
print(f"Test Accuracy: {test_accuracy_lr:.4f}")
print(f"Test Precision: {test_precision_lr:.4f}")

# Store results in dictionary
lr_results = {
    'train_accuracy': train_accuracy_lr,
    'test_accuracy': test_accuracy_lr,
    'test_precision': test_precision_lr
}


Training Accuracy: 0.7300
Test Accuracy: 0.6458
Test Precision: 0.6659


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# XGBoost Model

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import precision_score
from sklearn.feature_selection import SelectFromModel

# Reduce feature dimensionality
selector = SelectFromModel(estimator=XGBClassifier(), max_features=1000)
X_train_tfidf_selected = selector.fit_transform(X_train_tfidf, y_train)
X_test_tfidf_selected = selector.transform(X_test_tfidf)

# Initialize XGBoost classifier
xgb_model = XGBClassifier(tree_method='approx', n_jobs=-1)

# Set number of epochs
epochs = 10

# Train and evaluate XGBoost model
for epoch in range(epochs):
    xgb_model.fit(X_train_tfidf_selected, y_train, verbose=True if epoch == epochs - 1 else False)

# Evaluate model performance
train_accuracy_xgb = xgb_model.score(X_train_tfidf_selected, y_train)
test_accuracy_xgb = xgb_model.score(X_test_tfidf_selected, y_test)
y_pred_xgb = xgb_model.predict(X_test_tfidf_selected)
test_precision_xgb = precision_score(y_test, y_pred_xgb)

# Gather results
xgb_results = {
    'train_accuracy': train_accuracy_xgb,
    'test_accuracy': test_accuracy_xgb,
    'test_precision': test_precision_xgb
}

print("XGBoost Model Results:")
print(xgb_results)

XGBoost Model Results:
{'train_accuracy': 0.690792464265872, 'test_accuracy': 0.6318181818181818, 'test_precision': 0.7072058376406203}


# SVC Model

When considering whether to use `SVC` (Support Vector Classifier) from `sklearn.svm` for sarcasm detection, here are some key reasons to prefer `SVC` over other approaches within the context of sarcasm detection:

### Reasons to Use `SVC`:

1. **Effectiveness in High-Dimensional Spaces:**
   `SVC` is known for its effectiveness in high-dimensional spaces, which is particularly relevant when dealing with text data that has been transformed into TF-IDF features. The high dimensionality of the feature space can be efficiently handled by the SVC algorithm.

2. **Robustness to Overfitting:**
   SVMs, including `SVC`, have regularization parameters (`C` in `SVC`) that help prevent overfitting. This is crucial for sarcasm detection, where the model needs to generalize well to new, unseen examples of sarcasm without being too specific to the training data.

3. **Kernel Trick:**
   `SVC` supports the use of kernel functions, which can map the original features into higher-dimensional spaces where a linear separation is possible. This is beneficial for sarcasm detection, as the relationship between features and labels may not be linearly separable in the original feature space. Using kernels like the RBF (Radial Basis Function) can capture complex patterns in the data.

4. **Performance:**
   Empirical evidence often shows that `SVC` performs well in text classification tasks, including sarcasm detection. The `SVC` algorithm can achieve high accuracy and precision, making it a strong candidate for this type of task.

5. **Scalability:**
   While SVMs can be computationally intensive, `SVC` implementations in `scikit-learn` are optimized for performance. They can handle reasonably large datasets efficiently, especially when combined with techniques like grid search for hyperparameter tuning.

### Comparison with Other Approaches:

- **Logistic Regression:** While logistic regression is simpler and faster, it might not capture the complex patterns in sarcasm detection as effectively as `SVC` with non-linear kernels.
- **Decision Trees/Random Forests:** These models can capture non-linear patterns but may require extensive tuning and can be prone to overfitting. They also may not perform as well in high-dimensional spaces compared to `SVC`.
- **XGBoost:** While powerful and often performing well in various tasks, XGBoost might be more complex to tune and more computationally intensive compared to `SVC`.

### Conclusion:

**SVC** is a strong candidate for sarcasm detection due to its ability to handle high-dimensional feature spaces, robustness against overfitting, support for non-linear classification through kernel functions, and demonstrated empirical performance in text classification tasks.

In [7]:
from sklearn.svm import SVC
from sklearn.metrics import precision_score

# Initialize SVC model
svc_model = SVC()

# Fit the model once on the entire training set
svc_model.fit(X_train_tfidf, y_train)

# Evaluate model performance
train_accuracy_svc = svc_model.score(X_train_tfidf, y_train)
test_accuracy_svc = svc_model.score(X_test_tfidf, y_test)
y_pred_svc = svc_model.predict(X_test_tfidf)
test_precision_svc = precision_score(y_test, y_pred_svc)

svc_results = {
    'train_accuracy': train_accuracy_svc,
    'test_accuracy': test_accuracy_svc,
    'test_precision': test_precision_svc
}

print("Train Accuracy:", train_accuracy_svc)
print("Test Accuracy:", test_accuracy_svc)
print("Test Precision:", test_precision_svc)

Train Accuracy: 0.9129501489974241
Test Accuracy: 0.6523232323232323
Test Precision: 0.6750111756817165


# Comparison and Best Fit Model

In [12]:
# Compare models
models = {
    'Random Forest Model': rf_results,
    'Decision Tree Model': dt_results,
    'Logistic Regression Model': lr_results,
    'XGBoost Model': xgb_results,
    'SVC': svc_results
}

best_model = max(models, key=lambda x: models[x]['test_accuracy'])

print(f'\nBest Model: {best_model}')
print(f"Training Accuracy: {models[best_model]['train_accuracy'] * 100:.2f}%")
print(f"Test Accuracy: {models[best_model]['test_accuracy'] * 100:.2f}%")
print(f"Test Precision: {models[best_model]['test_precision'] * 100:.2f}%")


Best Model: SVC
Training Accuracy: 91.30%
Test Accuracy: 65.23%
Test Precision: 67.50%


# Summary

### Steps taken
1. **Data Loading and Preprocessing:**
   The notebook starts by importing essential libraries such as `pandas` and `sklearn`. The dataset (`balanced_dataset_50000.csv`) is loaded and rows with missing values are dropped. The features (`comment`) and labels (`label`) are extracted, and the labels are encoded using `LabelEncoder`. The data is then split into training and testing sets using `train_test_split`.

2. **Text Vectorization:**
   To prepare the text data for model training, the `TfidfVectorizer` is used to convert the text data into TF-IDF features. This step transforms the text data into a numerical format suitable for machine learning models.

3. **Model Training and Evaluation:**
   The notebook proceeds to train and evaluate several machine learning models:
   - **Random Forest Model:** A Random Forest model is initialized with specific hyperparameters and trained on the TF-IDF features. The model's performance is evaluated based on training accuracy, test accuracy, and precision.
   - **Decision Tree Model:** Similarly, a Decision Tree model is initialized, trained, and evaluated.
   - **Logistic Regression Model:** A Logistic Regression model undergoes the same process of initialization, training, and evaluation.
   - **XGBoost Model:** An XGBoost model is trained and its performance is evaluated in terms of training accuracy, test accuracy, and precision.
   - **SVC (Support Vector Classifier) Model:** An SVC model is also trained and evaluated using the same metrics.

4. **Model Comparison:**
   After training and evaluating all the models, the notebook compares their performance based on test accuracy. The model with the highest test accuracy is identified as the best-performing model.

### Conclusion:
Among all the models evaluated, the **SVC Model** demonstrated the best performance. The performance metrics for the SVC Model include:
- **Training Accuracy:** 91.30%
- **Test Accuracy:** 65.23%
- **Test Precision:** 67.50%

These metrics indicate the model's effectiveness in generalizing to unseen data, making it the most suitable model among those tested in the notebook.