# SVM - Climate Sentiment Multiclass Classification
## CS522 Project
SVM with Count Vectorizer

### Dataset: 
https://www.kaggle.com/code/luiskalckstein/climate-sentiment-multiclass-classification

### Imports

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import LinearSVC,SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from Common.DataCenter import data_center
from Common.LSI import SKLearnLSA
from Common.UtilFuncs import DataSize

TrainSizeBaseLine = DataSize.GetTrainSizeBaseline()
TrainSizeWithNoisyData = DataSize.GetTrainSizeWithNoisyData()
TestDataSize = DataSize.GetTestDataSize()
NoiseDataSize = DataSize.GetNoiseDataSize()
ValidationDataSize = DataSize.GetValidationDataSize()

%matplotlib inline
try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime

time: 0 ns (started: 2022-04-08 18:28:58 +08:00)


### Text preprocessing

In [2]:
# parameter: original X of training set and test set
# return:  vectorised X of training set and test set
def text_preprocessing(X_train, X_test):
    # Convert texts to vectors
    vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    X_train_vec = X_train_vec.astype(float)
    X_test_vec = X_test_vec.astype(float)


    return X_train_vec, X_test_vec


time: 16 ms (started: 2022-04-08 18:28:58 +08:00)


### One-hot encoding, convert the labels to vectors (4 x 1) each

In [3]:
# parameter: original y of training set, original y of test set
# return:  encoded y of training set and test set
def one_hot_encoding(y_train, y_test):
    mlb          = MultiLabelBinarizer()
    y_train_vec  = mlb.fit_transform(map(str, y_train))
    y_test_vec   = mlb.transform(map(str, y_test))
    return y_train_vec, y_test_vec


time: 31 ms (started: 2022-04-08 18:28:58 +08:00)


### Run SVM and evaluate the results

In [4]:
# parameter:  vectorised X and encoded y of training set and test set
def evaluate_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec):
    # Run SVM - fit and predict
    SVM             = OneVsRestClassifier(LinearSVC(dual=False, class_weight='balanced'), n_jobs=-1)
    #SVM = OneVsRestClassifier(SVC(gamma='auto'), n_jobs=-1)
    SVM.fit(X_train_vec, y_train_vec)
    prediction      = SVM.predict(X_test_vec)

    # Evaluate the results
    macro_f1        = f1_score(y_test_vec, prediction, average='macro')
    weighted_f1     = f1_score(y_test_vec, prediction, average='weighted')
    macro_precision = precision_score(y_test_vec, prediction, average='macro')
    macro_recall    = recall_score(y_test_vec, prediction, average='macro')

    return macro_f1, weighted_f1, macro_precision, macro_recall


time: 16 ms (started: 2022-04-08 18:28:58 +08:00)


### Do an experiment

In [5]:
# Parameter: original X,y of training set and test set
def do_experiment(X_train, y_train, X_test, y_test):
    # Convert texts to vectors
    X_train_vec, X_test_vec = text_preprocessing(X_train, X_test)
    y_train_vec, y_test_vec = one_hot_encoding(y_train, y_test)

    # Run SVM and evaluate the results
    macro_f1, weighted_f1, macro_precision, macro_recall = \
        evaluate_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec)

    # Show the indicators
    print(" macro_f1: %.4f , weighted_f1: %.4f, macro_precision: %.4f, macro_recall: %.4f" %
          (macro_f1, weighted_f1, macro_precision, macro_recall))
    return X_train_vec


time: 15 ms (started: 2022-04-08 18:28:58 +08:00)


### Main entry

**Load the database and split it into training set, test set, noisy set, validation set**

In [6]:
dc = data_center("twitter_sentiment_data_clean.csv", test_size=TestDataSize, noisy_size=NoiseDataSize, validation_size=ValidationDataSize)

print("####################################################")
print("Total data size: ",       dc.get_len())
print("Total train data size: ", dc.get_train_len())
print("Total test data size: ",  dc.get_test_len())

####################################################
Total data size:  40908
Total train data size:  32908
Total test data size:  4000
time: 172 ms (started: 2022-04-08 18:28:58 +08:00)


**Get the test set for evaluation**

In [7]:
X_test, y_test = dc.get_test()

time: 15 ms (started: 2022-04-08 18:28:58 +08:00)


**Run experiments with different training sets, and use the same test set.**

In [8]:
print("-----------------------------------------------")
for size in TrainSizeBaseLine:
    # Get a training set without noisy data
    X_train, y_train = dc.get_train(size)
    print("Training set size: %d samples (%.1f%%): " % (len(X_train), len(y_train)/dc.get_train_len()*100))

    # Do an experiment
    do_experiment(X_train, y_train, X_test, y_test)

print("-----------------------------------------------")
xtrainvec = None
for size in TrainSizeWithNoisyData:
    # Get a noisy training set
    X_train, y_train = dc.get_train_with_noisy(size[0], size[1])
    print("Noisy training set size: %d samples (%d original, %d noisy)" % (len(y_train), size[0], size[1]))

    # Do an experiment
    xtrainvec = do_experiment(X_train, y_train, X_test, y_test)

-----------------------------------------------
Training set size: 3000 samples (9.1%): 
 macro_f1: 0.4973 , weighted_f1: 0.5754, macro_precision: 0.5986, macro_recall: 0.4447
Training set size: 6000 samples (18.2%): 
 macro_f1: 0.5495 , weighted_f1: 0.6169, macro_precision: 0.6230, macro_recall: 0.5048
Training set size: 7500 samples (22.8%): 
 macro_f1: 0.5635 , weighted_f1: 0.6251, macro_precision: 0.6288, macro_recall: 0.5220
Training set size: 12000 samples (36.5%): 
 macro_f1: 0.5850 , weighted_f1: 0.6492, macro_precision: 0.6303, macro_recall: 0.5534
Training set size: 15000 samples (45.6%): 
 macro_f1: 0.6013 , weighted_f1: 0.6579, macro_precision: 0.6445, macro_recall: 0.5691
Training set size: 22499 samples (68.4%): 
 macro_f1: 0.6176 , weighted_f1: 0.6742, macro_precision: 0.6409, macro_recall: 0.5995
Training set size: 30000 samples (91.2%): 
 macro_f1: 0.6305 , weighted_f1: 0.6828, macro_precision: 0.6431, macro_recall: 0.6201
----------------------------------------------

In [9]:
X_train, y_train = dc.get_train(2000)
# Convert texts to vectors
X_train_vec, X_test_vec = text_preprocessing(X_train, X_test)
y_train_vec, y_test_vec = one_hot_encoding(y_train, y_test)

# Run SVM and evaluate the results
macro_f1, weighted_f1, macro_precision, macro_recall = \
    evaluate_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec)

# Show the indicators
print(" macro_f1: %.4f , weighted_f1: %.4f, macro_precision: %.4f, macro_recall: %.4f" %
        (macro_f1, weighted_f1, macro_precision, macro_recall))

 macro_f1: 0.4634 , weighted_f1: 0.5491, macro_precision: 0.5760, macro_recall: 0.4096
time: 140 ms (started: 2022-04-08 18:29:16 +08:00)


In [10]:
import pandas as pd
df = pd.DataFrame([1,2,3])

time: 0 ns (started: 2022-04-08 18:29:16 +08:00)
