# SVM - Climate Sentiment Multiclass Classification
## CS522 Project

### Dataset: 
https://www.kaggle.com/code/luiskalckstein/climate-sentiment-multiclass-classification

### Imports

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from Common.DataCenter import data_center
from Common.preprocessor import normalize_preprocessing
from Common.UtilFuncs import print_evaluation, print_distribution

%matplotlib inline

### Text preprocessing

In [2]:
def text_preprocessing(X_train, X_test):
    """Normalise the raw texts of both splits and convert them to TF-IDF vectors.

    The vectorizer is fitted on the training texts only; the test texts are
    transformed with that already-fitted vectorizer, so no information from the
    test split influences the learned vocabulary or weights.

    Parameters:
        X_train: original texts of the training set
        X_test:  original texts of the test set

    Returns:
        (X_train_vec, X_test_vec): vectorised texts of the training and test sets
    """
    # Traditional NLP normalisation (project helper), applied to each split
    train_docs = normalize_preprocessing(X_train)
    test_docs = normalize_preprocessing(X_test)

    # TF-IDF over unigrams and bigrams
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    train_matrix = tfidf.fit_transform(train_docs)
    test_matrix = tfidf.transform(test_docs)

    return train_matrix, test_matrix

### One-hot encoding: convert each label to a 4 x 1 vector

In [3]:
def one_hot_encoding(y_train, y_test):
    """One-hot encode the class labels of the training and test sets.

    The binarizer is fitted on the training labels and then reused for the
    test labels, so both outputs share the same column order.

    Parameters:
        y_train: original labels of the training set (one label per sample)
        y_test:  original labels of the test set (one label per sample)

    Returns:
        (y_train_vec, y_test_vec): indicator matrices with one column per
        distinct training label and exactly one 1 per row for known labels.
    """
    mlb = MultiLabelBinarizer()
    # MultiLabelBinarizer expects an iterable of label *collections*. Feeding it
    # bare strings makes it iterate each string character by character, so a
    # label such as -1 or 10 would silently be split into two separate classes.
    # Wrapping each label in a one-element list keeps it a single class and
    # gives the same result as before for single-character labels.
    y_train_vec = mlb.fit_transform([[str(label)] for label in y_train])
    y_test_vec = mlb.transform([[str(label)] for label in y_test])
    return y_train_vec, y_test_vec


### Run SVM

In [4]:
def run_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec):
    """Fit a one-vs-rest linear SVM on the training data and predict the test set.

    Parameters:
        X_train_vec: vectorised texts of the training set
        y_train_vec: encoded labels of the training set
        X_test_vec:  vectorised texts of the test set
        y_test_vec:  encoded labels of the test set (accepted but not used here)

    Returns:
        The classifier's predictions for X_test_vec.
    """
    # Base learner: primal-form linear SVM with class-balanced weights
    base_estimator = LinearSVC(dual=False, class_weight='balanced')

    # One binary SVM per label column; n_jobs=-1 lets sklearn parallelise the fits
    classifier = OneVsRestClassifier(base_estimator, n_jobs=-1)
    classifier.fit(X_train_vec, y_train_vec)

    return classifier.predict(X_test_vec)


### Do an experiment

In [5]:
def do_experiment(X_train, y_train, X_test, y_test):
    """Run one complete experiment: vectorise, train the SVM, and print the metrics.

    Parameters:
        X_train, y_train: original texts and labels of the training set
        X_test,  y_test:  original texts and labels of the test set

    Returns:
        None; the evaluation is printed by print_evaluation.
    """
    # Texts -> feature vectors, labels -> one-hot vectors
    features_train, features_test = text_preprocessing(X_train, X_test)
    targets_train, targets_test = one_hot_encoding(y_train, y_test)

    # Train on the training split and predict the test split
    predictions = run_SVM(features_train, targets_train, features_test, targets_test)

    # Report the scores for the four classes
    print_evaluation(targets_test, predictions, labels=[0,1,2,3])


### Main entry
**Load the dataset and split it into a training set, a test set, a noisy set, and a validation set**

####################################################
Total data size:  40908
Total train data size:  33908
Total test data size:  4000


#### The size of the noise sources

In [None]:

# Number of samples requested from each noise source (upper bound noted per entry)
noisy_set_sizes = dict(
    mislabeled=5000,   # max size: 15000
    irrelevant=5000,   # max size: 34259
    translated=5000,   # max size: 5000
)

# Load the database and split it into training set, test set, noisy set, validation set.
# The noisy split is sized by the 'mislabeled' entry above.
dc = data_center(
    "twitter_sentiment_data_clean.csv",
    test_size=4000,
    validation_size=1000,
    noisy_size=noisy_set_sizes['mislabeled'],
)

# Report the sizes of the resulting splits
print("####################################################")
print("Total data size: ", dc.get_len())
print("Total train data size: ", dc.get_train_len())
print("Total test data size: ", dc.get_test_len())

**Get the test set for evaluation**

In [8]:
# Fetch the test split once; the experiment loops pass this same X_test / y_test to every run.
X_test, y_test = dc.get_test()


**Set the class distribution of the training set.**

In [9]:
# Class distribution requested when drawing training sets from the data center.
# NOTE(review): None presumably means "keep the distribution of the original set" — confirm in data_center.
train_distribution = None


**Prepare the noisy set.**

In [10]:
# Bookkeeping list of (noise source name, sample count) pairs.
# The noisy split created at load time is recorded under the name "mislabeled".
lstNoisyInfo = [("mislabeled",dc.get_noisy_len())]
print("Noisy set size is %d"                % dc.get_noisy_len())

# Add the external noisy data (irrelevant texts).
# Label distribution assigned to the irrelevant noise: uniform over the four classes.
irrelevant_noisy_distribution = [0.25, 0.25, 0.25, 0.25]    # use None to keep the distribution of the original set
# The value returned by add_noisy is reported below as the number of samples added.
added_size = dc.add_noisy(noisy_source="irrelevant", distribution = irrelevant_noisy_distribution,
                          size = noisy_set_sizes['irrelevant'])
print("%d noisy samples added" % added_size)
lstNoisyInfo.append(("irrelevant",added_size))

# Add the external noisy data (translated texts); "reserve_labels" keeps each noisy sample's own label.
added_size = dc.add_noisy(noisy_source="translated", distribution = "reserve_labels", 
                          size = noisy_set_sizes['translated'])
print("%d noisy samples added" % added_size)
lstNoisyInfo.append(("translated",added_size))

# Size of the noisy set after both external sources were added
print("Noisy set new size is %d"                % dc.get_noisy_len())



Noisy set size is 3000
6000 noisy samples added
6000 noisy samples added
Noisy set new size is 15000


**Run experiments with different training sets, and use the same test set.**

In [11]:
# Part 1: baseline runs on clean training sets of increasing size.
print("-------------- No noisy training sets ----------")
for size in [2000, 4000, 5000, 8000, 10000, 15000, 20000]:
    # Get a training set without noisy data
    X_train, y_train = dc.get_train(size, train_distribution)
    print("* Training set size: %d samples: " % (len(X_train)))
    print_distribution("  Sentiment distribution", y_train)

    # Do an experiment (always evaluated on the same X_test / y_test)
    do_experiment(X_train, y_train, X_test, y_test)

# Part 2: runs on training sets that mix original and noisy samples.
print("-------------- Noisy training sets -------------")
# Share of each noise source in the noisy set, in percent, rounded to one decimal.
print("The proportions of the noise sources %s: " % [x[0] for x in lstNoisyInfo],
      [round(x[1]*100/dc.get_noisy_len(),1) for x in lstNoisyInfo])
# Each tuple is (number of original samples, number of noisy samples).
for size in [(4000, 1000), (8000, 3000), (15000, 5000)]:
    # Get a noisy training set
    X_train, y_train = dc.get_train_with_noisy(size[0], size[1], train_distribution)
    print("* Noisy training set size: %d samples (%d original, %d noisy)" % (len(y_train), size[0], size[1]))
    print_distribution("  Sentiment distribution", y_train)

    # Do an experiment (always evaluated on the same X_test / y_test)
    do_experiment(X_train, y_train, X_test, y_test)

-------------- No noisy training sets ----------
* Training set size: 2000 samples: 
  Sentiment distribution: 9.4%, 18.3%, 50.2%, 22.1%
  f1 of classes: [0.317, 0.325, 0.737, 0.679]
  micro_f1: 0.640 , macro_f1: 0.514 , weighted_f1: 0.609, macro_precision: 0.609, macro_recall: 0.504
* Training set size: 4000 samples: 
  Sentiment distribution: 9.4%, 50.2%, 18.3%, 22.1%
  f1 of classes: [0.367, 0.387, 0.75, 0.688]
  micro_f1: 0.659 , macro_f1: 0.548 , weighted_f1: 0.634, macro_precision: 0.629, macro_recall: 0.538
* Training set size: 5000 samples: 
  Sentiment distribution: 9.4%, 18.3%, 50.2%, 22.1%
  f1 of classes: [0.375, 0.394, 0.752, 0.702]
  micro_f1: 0.664 , macro_f1: 0.556 , weighted_f1: 0.640, macro_precision: 0.617, macro_recall: 0.551
* Training set size: 8000 samples: 
  Sentiment distribution: 9.4%, 18.3%, 50.2%, 22.1%
  f1 of classes: [0.447, 0.445, 0.773, 0.724]
  micro_f1: 0.690 , macro_f1: 0.597 , weighted_f1: 0.671, macro_precision: 0.642, macro_recall: 0.595
* Traini