# SVM - Climate Sentiment Multiclass Classification
## CS522 Project

### Dataset: 
https://www.kaggle.com/code/luiskalckstein/climate-sentiment-multiclass-classification

### Imports

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import LinearSVC
from Common.DataCenter import data_center
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# When use pre-trained model
from gensim.models.keyedvectors import KeyedVectors
# When use climate corpus trained model
from gensim.models import Word2Vec
import os
import numpy as np

%matplotlib inline

### Configurations

In [2]:
# Select the method of vectorization
# enumeration: word2vec, tfidf
vectorization_method = 'tfidf'

# Select the corpus of word2vec
# enumeration: google, glove, climate
corpus_type = 'climate'

# Set the length of vector
# NOTICE: the dimension of vector must corresponding to the pre-trained model
vector_size = 300

# Choose vectorization method and pre-trained model
# Climate Change Tweets, GoogleNews, Twitter
# NOTICE: the dimension of selected file must corresponding to the vector_size
if vectorization_method == 'word2vec':
    if corpus_type == 'climate':
        # Load climate pre-trained word2vec model with 300,100,50 dimension
        wv_model_name = 'climates.41k.300d.bin'
        wv_model = Word2Vec.load(os.path.join('.', 'models', wv_model_name))
    elif corpus_type == 'google':
        # Load GoogleNews pre-trained word2vec model with 300 dimension
        wv_model_name = 'GoogleNews-vectors-negative300.bin'
        wv_model = KeyedVectors.load_word2vec_format(os.path.join('.', 'models', wv_model_name), binary=True)
    elif corpus_type == 'glove':
        # Load twitter pre-trained glove model with 200,100,50,25 dimension
        wv_model_name = 'glove.twitter.27B.200d.w2v.txt'
        wv_model = KeyedVectors.load_word2vec_format(os.path.join('.', 'glove', wv_model_name), binary=False)

### Text preprocessing

In [3]:
# parameter: list of tweet messages
# return: normalization of pre-processing
def normalize_preprocessing(data, split=False):
    
    messages = []
    
    # Traversal the message list
    for i in range(len(data)):
        # Lower case
        message         = data[i].lower()
        
        # Remove punctuation
        for c in string.punctuation:
            message         = message.replace(c, ' ')
            
        # Tokenize
        message         = nltk.word_tokenize(message)
        
        # Comment this part for no working
        ## Remove stop words
        # message_filtered = [w for w in message if w not in stopwords.words('english')]
        
        # Comment this part for no working
        ## Only keep Noun and specified POS
        # message_refiltered = nltk.pos_tag(message_filtered)
        # message_filtered = [w for w, pos in message_refiltered if pos.startswith('NN')]
        
        # Stemming
        ps               = PorterStemmer()
        # message_filtered = [ps.stem(w) for w in message_filtered]
        message_filtered = [ps.stem(w) for w in message]
        
        # Re-Combinate
        if split == True:
            message      = message_filtered
        else:
            message      = " ".join(message_filtered)

        messages.append(message)
        
    return messages

In [4]:
# parameter: list of normalized tweets
# return: embedding sentence vector with mean of Word2Vec
def w2vVectorizer(messages, size=vector_size):
    
    # Convert to vectors with pre-trained word2vec
    length = len(messages)
    array = np.zeros((length, size))
    message_count = 0
    
    for message in messages:
        word_count = 0
        for word in message:
            try:
                if corpus_type == 'climate':
                    # When vector of word2vec
                    array[message_count,] += wv_model.wv[word]
                elif corpus_type == 'google':
                    # When keyedvector of word2vec
                    array[message_count,] += wv_model[word]
                elif corpus_type == 'glove':
                    # When keyedvector of word2vec
                    array[message_count,] += wv_model[word]
                    
                word_count += 1
            except KeyError:
                continue
                
        if word_count != 0:
            array[message_count,] /= word_count
            
        message_count +=1
    
    return array

In [5]:
# parameter: original X of training set and test set
# return:  vectorised X of training set and test set
def text_preprocessing(X_train, X_test):
    
    # preprocessing with traditional NLP methodology
    split   = False
    if vectorization_method == 'word2vec':
        split = True
    X_train_normalized = normalize_preprocessing(X_train, split)
    X_test_normalized  = normalize_preprocessing(X_test, split)
    
    # vectorization
    if vectorization_method == 'word2vec':
        # Convert texts to vectors by Word2Vec
        X_train_vec  = w2vVectorizer(X_train_normalized)
        X_test_vec   = w2vVectorizer(X_test_normalized)
    else:
        # Convert texts to vectors by TFIDF
        vectorizer   = TfidfVectorizer()
        X_train_vec  = vectorizer.fit_transform(X_train_normalized)
        X_test_vec   = vectorizer.transform(X_test_normalized)
      
    return X_train_vec, X_test_vec

### One-hot encoding, convert the labels to vectors (4 x 1) each

In [6]:
# parameter: original y of training set, original y of test set
# return:  encoded y of training set and test set
def one_hot_encoding(y_train, y_test):
    mlb          = MultiLabelBinarizer()
    y_train_vec  = mlb.fit_transform(map(str, y_train))
    y_test_vec   = mlb.transform(map(str, y_test))
    return y_train_vec, y_test_vec


### Run SVM and evaluate the results

In [7]:
# parameter:  vectorised X and encoded y of training set and test set
def evaluate_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec):
    # Run SVM - fit and predict
    SVM             = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
    SVM.fit(X_train_vec, y_train_vec)
    prediction      = SVM.predict(X_test_vec)

    # Evaluate the results
    macro_f1        = f1_score(y_test_vec, prediction, average='macro')
    weighted_f1     = f1_score(y_test_vec, prediction, average='weighted')
    macro_precision = precision_score(y_test_vec, prediction, average='macro')
    macro_recall    = recall_score(y_test_vec, prediction, average='macro')

    return macro_f1, weighted_f1, macro_precision, macro_recall


### Do an experiment

In [8]:
# Parameter: original X,y of training set and test set
def do_experiment(X_train, y_train, X_test, y_test):
    # Convert texts to vectors
    X_train_vec, X_test_vec = text_preprocessing(X_train, X_test)
    y_train_vec, y_test_vec = one_hot_encoding(y_train, y_test)
    
    # Run SVM and evaluate the results
    macro_f1, weighted_f1, macro_precision, macro_recall = \
        evaluate_SVM(X_train_vec, y_train_vec, X_test_vec, y_test_vec)

    # Show the indicators
    print(" macro_f1: %.4f , weighted_f1: %.4f, macro_precision: %.4f, macro_recall: %.4f" %
          (macro_f1, weighted_f1, macro_precision, macro_recall))


### Main entry

**Load the database and split it into training set, test set, noisy set, validation set**

In [9]:
dc = data_center("twitter_sentiment_data_clean.csv", test_size=8000, noisy_size=8000, validation_size=5000)

print("####################################################")
print("Total data size: ",       dc.get_len())
print("Total train data size: ", dc.get_train_len())
print("Total test data size: ",  dc.get_test_len())

####################################################
Total data size:  40908
Total train data size:  19908
Total test data size:  8000


**Get the test set for evaluation**

In [10]:
X_test, y_test = dc.get_test()


**Run experiments with different training sets, and use the same test set.**

In [11]:
print("-----------------------------------------------")
for size in [2000, 2500, 4000, 5000, 7500, 10000]:
    # Get a training set without noisy data
    X_train, y_train = dc.get_train(size)
    print("Training set size: %d samples (%.1f%%): " % (len(X_train), len(y_train)/dc.get_train_len()*100))

    # Do an experiment
    do_experiment(X_train, y_train, X_test, y_test)

print("-----------------------------------------------")
for size in [(2000, 500), (4000, 1000), (7500, 2500)]:
    # Get a noisy training set
    X_train, y_train = dc.get_train_with_noisy(size[0], size[1])
    print("Noisy training set size: %d samples (%d original, %d noisy)" % (len(y_train), size[0], size[1]))

    # Do an experiment
    do_experiment(X_train, y_train, X_test, y_test)

-----------------------------------------------
Training set size: 2000 samples (10.0%): 
 macro_f1: 0.4725 , weighted_f1: 0.5717, macro_precision: 0.6499, macro_recall: 0.4261
Training set size: 2500 samples (12.6%): 
 macro_f1: 0.4917 , weighted_f1: 0.5840, macro_precision: 0.6557, macro_recall: 0.4438
Training set size: 4000 samples (20.1%): 
 macro_f1: 0.5298 , weighted_f1: 0.6135, macro_precision: 0.6753, macro_recall: 0.4813
Training set size: 5000 samples (25.1%): 
 macro_f1: 0.5446 , weighted_f1: 0.6239, macro_precision: 0.6826, macro_recall: 0.4941
Training set size: 7500 samples (37.7%): 
 macro_f1: 0.5661 , weighted_f1: 0.6387, macro_precision: 0.6876, macro_recall: 0.5176
Training set size: 10000 samples (50.2%): 
 macro_f1: 0.5856 , weighted_f1: 0.6531, macro_precision: 0.7033, macro_recall: 0.5374
-----------------------------------------------
Noisy training set size: 2499 samples (2000 original, 500 noisy)
 macro_f1: 0.4068 , weighted_f1: 0.5031, macro_precision: 0.5855