In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure plots are displayed inline
%matplotlib inline


In [2]:
# Load the dataset
file_path = 'cleaned_balanced_dataset_FINAL.csv'  # Update this with your actual file path
data = pd.read_csv(file_path)

# Preview the first rows, the column summary, and the class balance
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()
print(data['label'].value_counts())


   label                                            comment
0      1                                               need
1      0                               might well milk last
2      1                                       ask locktrap
3      1  im glad community doesnt make console player f...
4      0                                    joke put stitch
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130030 entries, 0 to 130029
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   label    130030 non-null  int64 
 1   comment  128697 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.0+ MB
None
label
1    65015
0    65015
Name: count, dtype: int64


In [3]:
# Discard every row whose 'comment' value is missing, keeping the rest
# (with their original index labels) under the same name.
has_comment = data['comment'].notna()
data = data.loc[has_comment]


In [4]:
# Preprocess the text data
y = data['label']

# Learn the TF-IDF vocabulary and IDF weights from the training comments only.
# Fitting on every row would let test-set documents influence the features
# (data leakage) and make the reported test accuracy optimistic.
# train_test_split chooses rows from the sample count, `stratify`, and
# `random_state` alone, so calling it here with the same test_size=0.2,
# random_state=42 and stratify=y as the split cell that follows selects
# exactly the rows that become X_train there. Keep the two calls in sync.
train_comments, _ = train_test_split(
    data['comment'], test_size=0.2, random_state=42, stratify=y
)
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf.fit(train_comments)

# Transform every comment with the train-fitted vectorizer so that X still
# has one row per row of `data`, aligned with y, ready to be split.
X = tfidf.transform(data['comment'])


In [5]:
# Hold out 20% of the rows as the test set. Stratifying on y keeps the label
# proportions equal in both partitions; the fixed random_state makes the
# partition reproducible from run to run.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Initialize models
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples; on a dataset of this size the
# 'SVM' entry may dominate the runtime of this cell.
models = {
    'SVM': SVC(kernel='linear', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Carve a validation set out of the training data for the model comparison.
# Picking the winner by its test-set accuracy would leak the test set into
# model selection, so the final test score reported after tuning would no
# longer be an unbiased estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Train each candidate on the fitting portion and score it on the validation set
results = {}
for model_name, model in models.items():
    model.fit(X_fit, y_fit)
    val_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, val_pred)
    results[model_name] = accuracy
    print(f"{model_name} Validation Accuracy: {accuracy:.4f}")

# Identify the best model (highest validation accuracy; on a tie, max() keeps
# the entry that appears first in `models`)
best_model_name = max(results, key=results.get)
best_model_accuracy = results[best_model_name]
print(f"\nBest Model: {best_model_name} with Validation Accuracy: {best_model_accuracy:.4f}")


In [None]:
# If desired, perform hyperparameter tuning for the best model.
# Each branch pairs a fresh, untuned estimator with the grid to search.
if best_model_name == 'SVM':
    # gamma is only used by the 'rbf' and 'poly' kernels here, so the linear
    # kernel gets its own sub-grid instead of being refit once per gamma value.
    param_grid = [
        {
            'kernel': ['linear'],
            'C': [0.1, 1, 10, 100]
        },
        {
            'kernel': ['rbf', 'poly'],
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto']
        }
    ]
    best_model = SVC(random_state=42)
elif best_model_name == 'Logistic Regression':
    param_grid = {
        'C': [0.1, 1, 10, 100]
    }
    best_model = LogisticRegression(max_iter=1000, random_state=42)
elif best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [10, 50, 100],
        # 'auto' is omitted: it was removed from RandomForestClassifier in
        # scikit-learn 1.3 and, for classifiers, was only an alias of 'sqrt'.
        'max_features': ['sqrt', 'log2']
    }
    best_model = RandomForestClassifier(random_state=42)
elif best_model_name == 'Gradient Boosting':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2]
    }
    best_model = GradientBoostingClassifier(random_state=42)
else:
    # Fail with a clear message instead of a NameError on param_grid below.
    raise ValueError(f"No hyperparameter grid defined for model: {best_model_name!r}")

# 5-fold cross-validated search on the training data only; refit=True retrains
# the best configuration on all of X_train once the search finishes.
grid_search = GridSearchCV(best_model, param_grid, refit=True, verbose=3, cv=5)
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters for the best model:", grid_search.best_params_)

# Predict on the test set with the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model ({best_model_name}) with Hyperparameter Tuning Accuracy: {accuracy:.4f}")

# Generate the classification report and confusion matrix
# (conf_matrix is kept at module level so a later cell can plot it)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display the classification report
print("Classification Report:\n", class_report)


In [None]:
# Function to plot the confusion matrix
def plot_confusion_matrix(conf_matrix, class_names):
    """Render a confusion matrix as an annotated heatmap and display it.

    Parameters
    ----------
    conf_matrix : array-like
        Matrix of counts to draw; cells are annotated using the integer
        format 'd'.
    class_names : sequence of str
        Tick labels applied to both the x axis ('Predicted') and the
        y axis ('Actual').

    Returns
    -------
    None
        The figure is shown via ``plt.show()``; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

# Tick labels for the heatmap, listed in the order the matrix rows/columns
# should be read.
# NOTE(review): assumes label 0 means "not sarcastic" and label 1 means
# "sarcastic" — confirm against the dataset's labelling.
class_names = ['Not Sarcastic', 'Sarcastic']

# Draw the heatmap for the confusion matrix computed in the evaluation cell
plot_confusion_matrix(conf_matrix=conf_matrix, class_names=class_names)
