In [1]:
# Import necessary libraries
import ast
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

In [2]:
# Location of the dataset CSV. Set the SARCASM_CSV_PATH environment variable to
# run the notebook on another machine; when it is unset, the original author's
# local path is used so existing behaviour is unchanged.
# Later cells read the 'word2vec_embeddings' and 'label' columns of this file.
DATA_PATH = os.environ.get(
    "SARCASM_CSV_PATH",
    "C:\\Users\\guded\\OneDrive\\Desktop\\INFOSYS\\train-balanced-sarcasm.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Inspect the dimensions (rows, columns) of the loaded DataFrame
df.shape

(203091, 8)

In [4]:
# Helper function to clean and convert embeddings from strings to numpy arrays
def clean_and_convert_embedding(embedding_str):
    """Parse the text form of an embedding into a numpy array.

    The input is expected to look like a printed list/array of numbers, e.g.
    ``'[ 0.1  -0.2\\n  0.3]'`` or ``'[0.1, -0.2, 0.3]'``. Values may be
    separated by any mix of spaces, newlines and commas, with any amount of
    padding next to the brackets.

    Parameters
    ----------
    embedding_str : str
        Textual representation of the embedding.

    Returns
    -------
    numpy.ndarray
        Array built from the parsed literal.

    Raises
    ------
    Exception
        Whatever parsing raised (for example ``ValueError``/``SyntaxError``
        from ``ast.literal_eval``, or ``AttributeError`` for a non-string
        input). The offending value is printed before the error is re-raised.
    """
    try:
        # Treat commas as whitespace and collapse every run of whitespace
        # (spaces, tabs, newlines) to a single space, so arbitrary padding
        # cannot leave stray empty tokens behind.
        collapsed = ' '.join(embedding_str.replace(',', ' ').split())
        # After collapsing, at most one space can sit next to a bracket;
        # remove it, then turn the remaining separators into commas.
        literal = collapsed.replace('[ ', '[').replace(' ]', ']').replace(' ', ',')
        return np.array(ast.literal_eval(literal))
    except Exception:
        print(f"Error parsing embedding: {embedding_str}")
        # Bare raise keeps the original traceback intact.
        raise

In [5]:
# Apply the helper function to the DataFrame.
# The column is overwritten in place, so values that are already numpy arrays
# are passed through unchanged. This makes the cell safe to re-run: a second
# pass over converted values would otherwise fail inside the helper, because an
# ndarray has no ``.replace``. Every other value (including missing entries)
# still goes through the helper and raises exactly as before.
df['word2vec_embeddings'] = df['word2vec_embeddings'].apply(
    lambda value: value if isinstance(value, np.ndarray) else clean_and_convert_embedding(value)
)

In [5]:
# Verify the conversion by printing the first rows of the converted column
print(df['word2vec_embeddings'].head())

0    [-0.249028446, -0.112276957, -0.00595155568, 0...
1    [-0.06435088, -0.14805339, 0.32883312, 0.62132...
2    [0.1968625, 0.05365723, 0.03638186, 0.10795132...
3    [-0.14819673, 0.18820012, 0.08938915, 0.426148...
4    [0.21537142, 0.49409585, 1.07777257, 0.8793004...
Name: word2vec_embeddings, dtype: object


In [6]:
# Draw a reproducible random 10% sample of the rows (fixed seed);
# change `frac` to work with more or less of the data
subset_df = df.sample(
    random_state=42,
    frac=0.1,
)

In [7]:
# Build the model inputs from the sampled rows:
#   X - the per-row embedding arrays gathered into a single numpy array
#       (presumably 2-D, one row per sample - assumes equal-length embeddings)
#   y - the values of the 'label' column
y = subset_df['label'].to_numpy()
X = np.asarray(subset_df['word2vec_embeddings'].to_list())

In [8]:
# Hold out 20% of the sampled rows for testing; the fixed seed keeps the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Base estimator handed to the grid search: a support-vector classifier
# using the RBF kernel, with a fixed random_state
svm_model = SVC(
    random_state=42,
    kernel='rbf',
)

In [10]:
# Candidate values searched for the SVC's C and gamma settings
# (3 x 3 = 9 combinations)
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=[1, 0.1, 0.01],
)

In [11]:
# Set up the exhaustive search over `param_grid` with 3-fold cross-validation,
# using all available cores, verbose progress output, and a final refit of
# the best configuration
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    refit=True,
    verbose=3,
)

In [12]:
# Record the wall-clock start time for the grid search.
# NOTE(review): the fit and the matching end timestamp live in separate cells,
# so the measured interval also includes any delay between executing them.
start_time = time.time()

In [13]:
# Run the grid search on the training split: every parameter combination is
# cross-validated, then (refit=True) the best one is refit on the training data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [14]:
# Record the wall-clock end time, taken after the fitting cell has been run
end_time = time.time()

In [16]:
# Elapsed wall-clock time in seconds between the start and end timestamps
execution_time = end_time - start_time

In [30]:
# Report the elapsed time computed from the start/end timestamps
print('execution time (in seconds) is:',execution_time)

execution time (in seconds) is: 267.1816930770874


In [24]:
# Pull the winning configuration out of the finished search: the refit
# estimator and the hyperparameter values that selected it
best_svm_model = grid_search.best_estimator_
best_params = grid_search.best_params_

In [25]:
# Show the hyperparameter combination selected by the grid search
print(f"Best Parameters: {best_params}")

Best Parameters: {'C': 10, 'gamma': 0.01}


In [26]:
# Predict labels for the held-out test split using the best estimator found
# by the grid search
y_pred = best_svm_model.predict(X_test)

In [27]:
# Evaluate the model on the test split: overall accuracy first, then the
# per-class precision / recall / F1 report
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.6619891678975874
              precision    recall  f1-score   support

           0       0.67      0.83      0.74      2361
           1       0.65      0.43      0.52      1701

    accuracy                           0.66      4062
   macro avg       0.66      0.63      0.63      4062
weighted avg       0.66      0.66      0.65      4062



In [28]:
# Confusion matrix of the true test labels against the predictions
# (rows are the true classes, columns the predicted classes)
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

Confusion Matrix:
[[1960  401]
 [ 972  729]]


In [29]:
# Display the best model's full parameter set: the tuned C / gamma values
# together with every other setting of the estimator
print("Best Model's Parameters:", best_svm_model.get_params())

Best Model's Parameters: {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}
