In [20]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import string
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from collections import Counter

In [2]:
# Read the training and evaluation splits from their CSV files.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
# Combine the two text features into one column.
# Missing values are replaced with empty strings first: with plain `+`, a NaN
# in either column would turn the whole combined cell into NaN, discarding the
# other field's text as well.
train["Combined"] = train["Description"].fillna("") + " " + train["Title"].fillna("")
test["Combined"] = test["Description"].fillna("") + " " + test["Title"].fillna("")

In [4]:
# Inputs are the combined text column; targets are the class indices.
X_train, y_train = train["Combined"], train["Class Index"]
X_test, y_test = test["Combined"], test["Class Index"]

In [5]:
# Patterns and the translation table are built once instead of on every call.
# HTML tags plus named / decimal / hexadecimal entities (matched case-insensitively).
_HTML_PATTERN = re.compile(r'<.*?>|&(?:[a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', re.IGNORECASE)
# URLs of the form scheme://... and @username handles.
_URL_OR_HANDLE_PATTERN = re.compile(r'\w+://\S+|@[A-Za-z0-9]+')
# Anything that is not an ASCII letter, digit, space or tab.
_NON_ALNUM_PATTERN = re.compile(r'[^0-9A-Za-z \t]')
# Deletes ASCII digits and punctuation marks.
_DIGITS_AND_PUNCT = str.maketrans('', '', string.digits + string.punctuation)


def clean_text(text):
    """Normalise a raw document into lowercase, space-separated ASCII words.

    Steps, in order:
      1. HTML tags and entities are deleted.
      2. URLs and @usernames are replaced by a space. This has to happen
         before punctuation is stripped, otherwise ':', '/' and '@' are
         already gone and the URL / handle text survives as a glued token.
      3. ASCII digits and punctuation are deleted.
      4. Any remaining character that is not an ASCII letter, digit, space or
         tab is replaced by a space.
      5. Whitespace runs are collapsed and the text is lowercased.

    Parameters
    ----------
    text : str or None or float NaN
        Raw document. ``None`` and NaN are treated as an empty document.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Treat missing values as empty documents rather than failing in re.sub
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove HTML tags and entities from the text
    text = _HTML_PATTERN.sub('', text)

    # Remove URLs and usernames while their punctuation is still present
    text = _URL_OR_HANDLE_PATTERN.sub(' ', text)

    # Remove digits and punctuation marks from the text
    text = text.translate(_DIGITS_AND_PUNCT)

    # Replace any remaining special (non-alphanumeric) characters with spaces
    text = _NON_ALNUM_PATTERN.sub(' ', text)

    # Collapse whitespace and convert to lowercase
    return ' '.join(text.split()).lower()

In [None]:
# Run clean_text over every document of both splits.
X_train, X_test = (docs.apply(clean_text) for docs in (X_train, X_test))

In [None]:
# Average number of words per cleaned document, printed for class 1 through 4.
word_counts = X_train.apply(lambda doc: len(doc.split()))
for label in range(1, 5):
    counts_for_label = list(word_counts[y_train == label])
    print(np.mean(counts_for_label))

In [7]:
# TF-IDF features: the vectorizer is fitted on the training text only and the
# fitted vectorizer is then reused to transform the test text.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [12]:
# Define the parameter grid to search over.
# `degree` only affects the polynomial kernel, so it is searched only there;
# crossing it with 'linear' and 'rbf' would refit identical models three times
# (27 candidates instead of the 15 distinct ones below).
param_grid = [
    {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]},
    {'kernel': ['poly'], 'C': [0.1, 1, 10], 'degree': [2, 3, 4]},
]

# Create the SVM classifier
svc = SVC()

In [14]:
# Tune hyperparameters independently on consecutive slices of the training
# data and record the best configuration found on each slice.
best_params_list = []

batch_size = 1000
n_samples = X_train_tfidf.shape[0]

# Always search at least one batch, so the list is not left empty when the
# training set has fewer than `batch_size` rows.
num_batches = max(1, n_samples // batch_size)

# Loop through the batches
for i in range(num_batches):
    start_idx = i * batch_size
    # The final batch absorbs any remainder rows, so no training row is
    # dropped and no undersized trailing batch is created.
    end_idx = n_samples if i == num_batches - 1 else start_idx + batch_size

    # Get the batch data; .iloc keeps the label slice positional, matching
    # the row slice of the feature matrix
    X_batch = X_train_tfidf[start_idx:end_idx]
    y_batch = y_train.iloc[start_idx:end_idx]

    # Perform grid search using 3-fold cross-validation on the batch data
    grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=3)
    grid_search.fit(X_batch, y_batch)

    # Store the winner as a sorted tuple of (name, value) pairs so that
    # identical configurations are hashable and compare equal
    best_params_list.append(tuple(sorted(grid_search.best_params_.items())))

In [None]:
# Majority vote across batches: count how often each hyperparameter
# configuration won, and keep the most frequent one.
param_votes = Counter(best_params_list)
winning_items, _votes = param_votes.most_common(1)[0]
common_params = dict(winning_items)

# Build an SVM with the winning configuration
svc_best = SVC(**common_params)

# Train it on the complete training set
svc_best.fit(X_train_tfidf, y_train)

In [None]:
# Calculate the score of svc_best on the test set
score = svc_best.score(X_test_tfidf, y_test)

# Show the value as the cell output; a bare assignment displays nothing
score

In [18]:
# Class predictions of the final classifier for every test document
y_pred = svc_best.predict(X=X_test_tfidf)

In [None]:
# Macro-averaged F1 on the test set: F1 is computed per class and the
# per-class values are averaged with equal weight.
f1_score(y_true=y_test, y_pred=y_pred, average='macro')

In [1]:
# Derive the class labels from the data instead of hard-coding 0..3: the
# classes in this notebook are numbered from 1, so range(4) mislabelled every
# row and column of the plot.
class_labels = np.union1d(y_test, y_pred)

# Calculate the confusion matrix for the predicted labels compared to the true
# labels, with rows/columns in the same order as the displayed labels
matrx = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a ConfusionMatrixDisplay object labelled with the actual class values
disp = ConfusionMatrixDisplay(matrx, display_labels=class_labels)

# Plot the confusion matrix using the ConfusionMatrixDisplay object
disp.plot()