# SVM - Climate Sentiment Multiclass Classification
## CS522 Project

### Dataset: 
https://www.kaggle.com/code/luiskalckstein/climate-sentiment-multiclass-classification

### Imports

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from Common.DataCenter import data_center
from Common.preprocessor import normalize_preprocessing
from Common.UtilFuncs import print_evaluation, print_distribution

%matplotlib inline

### Text preprocessing

In [12]:
# parameter: original X of training set and test set
# return:  vectorised X of training set and test set
def text_preprocessing(X_train, X_test):
    
    # preprocessing with traditional NLP methodology
    X_train_normalized = normalize_preprocessing(X_train)
    X_test_normalized  = normalize_preprocessing(X_test)
    
    # vectorization
    # Convert texts to vectors by TFIDF
    vectorizer = TfidfVectorizer(ngram_range=(1,2))
    X_train_vec  = vectorizer.fit_transform(X_train_normalized)
    X_test_vec   = vectorizer.transform(X_test_normalized)
      
    return X_train_vec, X_test_vec

### One-hot encoding, convert the labels to vectors (4 x 1) each

In [13]:
# parameter: original y of training set, original y of test set
# return:  encoded y of training set and test set
def one_hot_encoding(y_train, y_test):
    mlb          = MultiLabelBinarizer()
    y_train_vec  = mlb.fit_transform(map(str, y_train))
    y_test_vec   = mlb.transform(map(str, y_test))
    return y_train_vec, y_test_vec


### Run SVM

In [14]:
# parameter:  vectorised X and encoded y of training set and test set
def run_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec):
    # Run SVM - fit and predict
    SVM             = OneVsRestClassifier(LinearSVC(dual=False, class_weight='balanced'), n_jobs=-1)
    SVM.fit(X_train_vec, y_train_vec)
    y_pred          = SVM.predict(X_test_vec)
    return  y_pred


### Do an experiment

In [15]:
# Parameter: original X,y of training set and test set
def do_experiment(X_train, y_train, X_test, y_test):
    # Convert texts to vectors
    X_train_vec, X_test_vec = text_preprocessing(X_train, X_test)
    y_train_vec, y_test_vec = one_hot_encoding(y_train, y_test)

    # Run SVM and evaluate the results
    y_pred = run_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec)

    # Print the evaluation
    print_evaluation(y_test_vec, y_pred, labels=[0,1,2,3])


### Main entry
**Load the database and split it into training set, test set, noisy set, validation set**

# The size of the noise sources

In [16]:

noisy_set_sizes = {
    'mislabeled' : 5000,   # max size: 15000
    'irrelevant' : 5000,   # max size: 34259
    'translated' : 5000,   # max size: 5000
}

# Load the database and split it into training set, test set, noisy set, validation set
dc = data_center("twitter_sentiment_data_clean.csv", train_size = 20000, test_size = 4000, validation_size = 1000,
                 noisy_size = noisy_set_sizes['mislabeled'] if 'mislabeled' in noisy_set_sizes.keys() else 0)

print("####################################################")
print("Total data size: ",       dc.get_len())
print("Total train data size: ", dc.get_train_len())
print("Total test data size: ",  dc.get_test_len())

####################################################
Total data size:  40908
Total train data size:  20000
Total test data size:  4000


**Get the test set for evaluation**

In [17]:
X_test, y_test = dc.get_test()


**Set distributions of training set.**

In [18]:
# distribution of training set
train_distribution = None


**Prepare the noisy set.**

In [19]:
lstNoisyInfo    = []
if 'mislabeled' in noisy_set_sizes.keys() and noisy_set_sizes['mislabeled'] > 0:
    lstNoisyInfo.append(("mislabeled",dc.get_noisy_len()))
    print("%d noisy samples of '%s' added" % (dc.get_noisy_len(), 'mislabeled'))

# add the external noisy data (irrelevant texts)
if 'irrelevant' in noisy_set_sizes.keys() and noisy_set_sizes['irrelevant'] > 0:
    # distribution of irrelevant noisy
    irrelevant_noisy_distribution = [0.25, 0.25, 0.25, 0.25]    # None, if use the distribution of original set
    added_size = dc.add_noisy(noisy_source="irrelevant", distribution = irrelevant_noisy_distribution,
                              size = noisy_set_sizes['irrelevant'])
    print("%d noisy samples of '%s' added"  % (added_size, 'irrelevant'))
    lstNoisyInfo.append(("irrelevant",added_size))

# add the external noisy data (translated texts). use the labels of each noisy data
if 'translated' in noisy_set_sizes.keys() and noisy_set_sizes['translated'] > 0:
    added_size = dc.add_noisy(noisy_source="translated", distribution = "reserve_labels",
                              size = noisy_set_sizes['translated'])
    print("%d noisy samples of '%s' added"  % (added_size, 'translated'))
    lstNoisyInfo.append(("translated",added_size))

print("The total size of noisy data is %d"                % dc.get_noisy_len())


5000 noisy samples of 'mislabeled' added
5000 noisy samples of 'irrelevant' added
5000 noisy samples of 'translated' added
The total size of noisy data is 15000


**Run experiments with different training sets, and use the same test set.**

In [20]:
print("-------------- No noisy training sets ----------")
for size in [2000, 4000, 5000, 8000, 10000, 15000, 20000]:
    # Get a training set without noisy data
    X_train, y_train = dc.get_train(size, train_distribution)
    print("* Training set size: %d samples: " % (len(X_train)))
    print_distribution("  Sentiment distribution", y_train)

    # Do an experiment
    do_experiment(X_train, y_train, X_test, y_test)

print("-------------- Noisy training sets -------------")
print("The proportions of the noise sources %s: " % [x[0] for x in lstNoisyInfo],
      [round(x[1]*100/dc.get_noisy_len(),1) for x in lstNoisyInfo])
for size in [(4000, 1000), (8000, 2000), (15000, 5000)]:
    # Get a noisy training set
    X_train, y_train = dc.get_train_with_noisy(size[0], size[1], train_distribution)
    print("* Noisy training set size: %d samples (%d original, %d noisy)" % (len(y_train), size[0], size[1]))
    print_distribution("  Sentiment distribution", y_train)

    # Do an experiment
    do_experiment(X_train, y_train, X_test, y_test)

-------------- No noisy training sets ----------
* Training set size: 2000 samples: 
  Sentiment distribution: 9.4%, 18.3%, 50.2%, 22.1%
  f1 of classes: [0.349, 0.315, 0.711, 0.678]
  micro_f1: 0.626 , macro_f1: 0.513 , weighted_f1: 0.597, macro_precision: 0.592, macro_recall: 0.504
* Training set size: 4000 samples: 
  Sentiment distribution: 9.4%, 18.3%, 50.2%, 22.1%
  f1 of classes: [0.442, 0.371, 0.741, 0.695]
  micro_f1: 0.657 , macro_f1: 0.562 , weighted_f1: 0.635, macro_precision: 0.623, macro_recall: 0.554
* Training set size: 5000 samples: 
  Sentiment distribution: 9.4%, 18.3%, 50.2%, 22.1%
  f1 of classes: [0.466, 0.4, 0.746, 0.709]
  micro_f1: 0.668 , macro_f1: 0.580 , weighted_f1: 0.648, macro_precision: 0.636, macro_recall: 0.570
* Training set size: 8000 samples: 
  Sentiment distribution: 9.4%, 18.3%, 50.2%, 22.1%
  f1 of classes: [0.521, 0.42, 0.758, 0.719]
  micro_f1: 0.682 , macro_f1: 0.604 , weighted_f1: 0.665, macro_precision: 0.648, macro_recall: 0.598
* Training