# SVM - Climate Sentiment Multiclass Classification
## CS522 Project

**Dataset:**  
https://www.kaggle.com/code/luiskalckstein/climate-sentiment-multiclass-classification

**Imports**

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import LinearSVC
from Common.DataCenter import data_center
%matplotlib inline


## Text preprocessing

In [20]:
def text_preprocessing(X_train, X_test):
    # Convert texts to vectors
    vectorizer  = TfidfVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec  = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec


## One-hot encoding, convert the labels to vectors (4 x 1) each

In [21]:
def one_hot_encoding(y_train, y_test):
    mlb = MultiLabelBinarizer()
    y_train_vec  = mlb.fit_transform(map(str, y_train))
    y_test_vec   = mlb.transform(map(str, y_test))
    return y_train_vec, y_test_vec


## Run SVM and evaluate the results

In [22]:
def evaluate_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec):
    # Run SVM - fit and predict
    SVM = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
    SVM.fit(X_train_vec, y_train_vec)
    prediction = SVM.predict(X_test_vec)

    # Evaluate the results
    macro_f1 = f1_score(y_test_vec, prediction, average='macro')
    weighted_f1 = f1_score(y_test_vec, prediction, average='weighted')
    macro_precision = precision_score(y_test_vec, prediction, average='macro')
    macro_recall = recall_score(y_test_vec, prediction, average='macro')

    return macro_f1, weighted_f1, macro_precision, macro_recall


## Do an experiment

In [23]:
def do_experiment(X_train, y_train, X_test, y_test):
    # Convert texts to vectors
    X_train_vec, X_test_vec = text_preprocessing(X_train, X_test)
    y_train_vec, y_test_vec = one_hot_encoding(y_train, y_test)

    # Run SVM and evaluate the results
    macro_f1, weighted_f1, macro_precision, macro_recall = \
        evaluate_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec)

    # Show the indicators
    print(" macro_f1: %.4f , weighted_f1: %.4f, macro_precision: %.4f, macro_recall: %.4f" %
          (macro_f1, weighted_f1, macro_precision, macro_recall))


## Main entry

In [26]:
    dc = data_center("twitter_sentiment_data.csv", test_size=8000, noisy_size=8000) # sizes represented in absolute values

    print("####################################################")
    print("Total data size: ",       dc.get_len())
    print("Total train data size: ", dc.get_train_len())
    print("Total test data size: ",  dc.get_test_len())

####################################################
Total data size:  41033
Total train data size:  25033
Total test data size:  8000


### Load the database and split it into training set, test set, noisy set, validation set

### Get the test set for evaluation

In [24]:
    X_test, y_test = dc.get_test()


### Run experiments with different training set, and use the same test set.

In [25]:
    print("-----------------------------------------------")
    for size in [2000, 2500, 4000, 5000, 7500, 10000]:
        # Get training set without noisy data
        X_train, y_train = dc.get_train(size)
        print("Training set size: %d samples (%.1f%%): " % (len(X_train), len(y_train)/dc.get_train_len()*100))

        # Do experiment
        do_experiment(X_train, y_train, X_test, y_test)

    print("-----------------------------------------------")
    for size in [(2000, 500), (4000, 1000), (7500, 2500)]:
        # Get noisy training set
        X_train, y_train = dc.get_train_with_noisy(size[0], size[1])
        print("Noisy training set size: %d samples (%d original, %d noisy)" % (len(y_train), size[0], size[1]))

        # Do experiment
        do_experiment(X_train, y_train, X_test, y_test)

-----------------------------------------------
Training set size: 2000 samples (8.0%): 
 macro_f1: 0.4609 , weighted_f1: 0.5670, macro_precision: 0.6692, macro_recall: 0.4133
Training set size: 2500 samples (10.0%): 
 macro_f1: 0.4781 , weighted_f1: 0.5783, macro_precision: 0.6693, macro_recall: 0.4296
Training set size: 4000 samples (16.0%): 
 macro_f1: 0.5207 , weighted_f1: 0.6055, macro_precision: 0.6916, macro_recall: 0.4692
Training set size: 5000 samples (20.0%): 
 macro_f1: 0.5338 , weighted_f1: 0.6169, macro_precision: 0.6958, macro_recall: 0.4826
Training set size: 7500 samples (30.0%): 
 macro_f1: 0.5679 , weighted_f1: 0.6428, macro_precision: 0.7127, macro_recall: 0.5151
Training set size: 10000 samples (39.9%): 
 macro_f1: 0.5836 , weighted_f1: 0.6518, macro_precision: 0.7146, macro_recall: 0.5312
-----------------------------------------------
Noisy training set size: 2500 samples (2000 original, 500 noisy)
 macro_f1: 0.4155 , weighted_f1: 0.5121, macro_precision: 0.6028,