# Results

## General

We're considering the performance of an SVM classifier (OVR scheme) for 4 classes, using three different kernel functions. To train and test the SVM classifier we'll have to provide it with the similarity matrices.

Training - requires an NxN Gram matrix (a square matrix composed of the values of the kernel function between pairs of training examples)

Testing - requires an MxN matrix (element i,j is the value of the kernel function between the i-th example of the testing set and the j-th example of the training set)

N = cardinality of the training set, M = cardinality of the testing set

In [20]:
import pandas as pd
import numpy as np
import os

In [21]:
def similarity_matrix(first_dataset, second_dataset, similarity_function, symmetrical=False):
    """
    Calculate the similarity matrix between elements of two datasets using a similarity function.

    Args:
    - first_dataset: List or array-like, the first dataset (one matrix row per element)
    - second_dataset: List or array-like, the second dataset (one matrix column per element)
    - similarity_function: Function, the similarity function that takes two elements as arguments
    - symmetrical: bool, pass True when both datasets hold the same elements and the
      similarity function is symmetric; only the upper triangle (diagonal included) is
      evaluated and each value is mirrored into the lower triangle

    Returns:
    - similarity_matrix: list of lists; len(first_dataset) x len(second_dataset) in the
      general case, len(first_dataset) x len(first_dataset) when symmetrical=True
    """
    if not symmetrical:
        # General case: every (row, column) pair is evaluated.
        return [[similarity_function(x, y) for y in second_dataset] for x in first_dataset]

    size = len(first_dataset)
    matrix = [[0.0] * size for _ in range(size)]

    for i in range(size):
        # Start at j = i so each unordered pair is evaluated exactly once.
        for j in range(i, size):
            value = similarity_function(first_dataset[i], second_dataset[j])
            matrix[i][j] = value
            matrix[j][i] = value

    return matrix

### Kernels

We're considering the SSK, NGK and WK kernel functions, each with its own set of hyperparameters.

SSK is parameterised by $k$ = the length of the substrings used for feature mapping/kernel computation and $\lambda$ = a real number from the interval [0,1] which indicates how much we penalise the non-contiguity of a substring's occurrence in the input document.

NGK is parameterised by $n$, corresponding to $k$ in SSK.

WK has no hyperparameters.

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support


def WK_SVM(X_train, y_train, X_test, y_test):
    """Evaluate a multi-class linear SVM on TF-IDF word features (the WK kernel).

    Args:
    - X_train, X_test: raw text documents handed to TfidfVectorizer
    - y_train, y_test: class labels of the corresponding documents

    Returns:
    - pandas DataFrame with one row per class and the columns
      Kernel, Class, F1-Score, Precision, Recall
    """
    tfidf = TfidfVectorizer(
        sublinear_tf=True,
        use_idf=True,
        smooth_idf=True,
        norm='l2',
        analyzer='word',
        stop_words='english',
    )

    # The vectorizer is fitted on the training documents only and then
    # re-used unchanged for the test documents.
    train_features = tfidf.fit_transform(X_train)
    test_features = tfidf.transform(X_test)

    classifier = SVC(kernel='linear').fit(train_features, y_train)
    predictions = classifier.predict(test_features)

    # average=None keeps one score per class instead of a single aggregate.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, predictions, average=None
    )

    # 'Class' is the positional index into the per-class score arrays.
    columns = {
        'Kernel': 'WK',
        'Class': range(len(precision)),
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall,
    }
    return pd.DataFrame(columns)


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

def NGK_SVM(X_train, y_train, X_test, y_test, k):
    """Evaluate a linear SVM on L2-normalised character n-gram counts (the NGK kernel).

    Args:
    - X_train, X_test: raw text documents handed to CountVectorizer
    - y_train, y_test: class labels of the corresponding documents
    - k: length of the character n-grams (ngram_range is (k, k))

    Returns:
    - pandas DataFrame with one row per class and the columns
      Kernel, k, Class, F1-Score, Precision, Recall
    """
    char_ngrams = CountVectorizer(analyzer='char', ngram_range=(k, k))

    # The vectorizer is fitted on the training documents only.
    train_counts = char_ngrams.fit_transform(X_train)
    test_counts = char_ngrams.transform(X_test)

    # Rows are scaled to unit L2 norm before the linear SVM sees them.
    train_features = normalize(train_counts, norm='l2')
    test_features = normalize(test_counts, norm='l2')

    classifier = SVC(kernel='linear').fit(train_features, y_train)
    predictions = classifier.predict(test_features)

    # average=None keeps one score per class instead of a single aggregate.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, predictions, average=None
    )

    # 'Class' is the positional index into the per-class score arrays.
    columns = {
        'Kernel': 'NGK',
        'k': k,
        'Class': range(len(precision)),
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall,
    }
    return pd.DataFrame(columns)

In [5]:
from main import sskpy
def ssk(a, b, k, lambd):
    """Return sskpy(a, b, k, lambd), or 0 when k or lambd is zero.

    The zero cases are answered here, so sskpy is never called with k == 0
    or lambd == 0.
    """
    degenerate = k == 0 or lambd == 0
    return 0 if degenerate else sskpy(a, b, k, lambd)

In [6]:
def ssk_partial(ka, Lambda):
    """Build a two-argument kernel that calls ssk with k and lambd fixed.

    Args:
    - ka: value forwarded to ssk as k
    - Lambda: value forwarded to ssk as lambd

    Returns:
    - function (a, b) -> ssk(a, b, k=ka, lambd=Lambda)
    """
    def kernel(a, b):
        return ssk(a, b, k=ka, lambd=Lambda)

    return kernel

In [7]:
def SSK_SVM(X_train, y_train, X_test, y_test, k, lambd):
    """Evaluate an SVM that uses SSK through precomputed kernel matrices.

    Args:
    - X_train, X_test: documents compared pairwise by the kernel
    - y_train, y_test: class labels of the corresponding documents
    - k: length of the substrings
    - lambd: weight decay factor

    Returns:
    - pandas DataFrame with one row per class and the columns
      Kernel, k, lambda, Class, F1-Score, Precision, Recall
    """
    kernel = ssk_partial(k, lambd)

    # Train-vs-train Gram matrix (only the upper triangle is computed) and
    # the test-vs-train matrix needed at prediction time.
    gram_train = similarity_matrix(X_train, X_train, kernel, symmetrical=True)
    gram_test = similarity_matrix(X_test, X_train, kernel)

    # kernel='precomputed': the model is fitted on the Gram matrix itself.
    classifier = SVC(kernel='precomputed').fit(gram_train, y_train)
    predictions = classifier.predict(gram_test)

    # average=None keeps one score per class instead of a single aggregate.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, predictions, average=None
    )

    # 'Class' is the positional index into the per-class score arrays.
    columns = {
        'Kernel': 'SSK',
        'k': k,
        'lambda': lambd,
        'Class': range(len(precision)),
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall,
    }
    return pd.DataFrame(columns)

## Experiments

### 0.Creating datasets
each run has to be repeated 10 times - we need 10 pairs of (train, test) sets that need to satisfy the properties given in the paper:

 earn 152 (40); acq 114 (25); crude 76 (15); grain 38 (10)
 Since there are not enough examples of corn, I'll use grain as the 4th topic.

In [16]:
# Load the preprocessed corpus, keeping only the topic label and the document body.
data=pd.read_csv('../data/preprocessed.csv')[['topics', 'body']].copy()
# Encode the four topics as integer class ids (Series.map turns topics that are
# not in the mapping into NaN, so they match none of the class filters below).
topic_mapping = {'earn': 0, 'acq': 1, 'crude': 2, 'grain': 3}
data['topics'] = data['topics'].map(topic_mapping)

# Number of classes to keep and maximum number of documents taken per class.
N_classes=4
n_per_class=100

# First n_per_class rows of each class, in file order.
class0df=data[data.topics==0].head(n_per_class)  #earn
class1df=data[data.topics==1].head(n_per_class)  #acq
class2df=data[data.topics==2].head(n_per_class)  #crude
class3df=data[data.topics==3].head(n_per_class)  #grain

# The [:N_classes] slice keeps only the first N_classes per-class frames.
final=pd.concat([class0df, class1df, class2df, class3df][:N_classes])
print(len(final))

400


In [17]:
from sklearn.model_selection import StratifiedKFold

# Documents and labels as NumPy arrays so they can be indexed with the fold index arrays.
X_all=np.array(final['body'])
y_all=np.array(final['topics'])

# One (train, test) pair per fold; random_state fixes the shuffle.
n_folds = 5
stratified_kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
datasets = []

for train_index, test_index in stratified_kfold.split(X_all,y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]
    
    datasets.append((X_train, X_test, y_train, y_test))

# datasets[i] = (X_train_i, X_test_i, y_train_i, y_test_i) for i = 0, ..., n_folds - 1 (the index of the run)

### Experiment 1 - Varying Subsequence Length in SSK and NGK

For this experiment we're keeping lambda for SSK at 0.5, and varying k/n for SSK/NGK over the values [3,4,5,6,7,8,10,12]; WK has no parameters.

In [10]:
# Experiment 1: vary the substring / n-gram length while the SSK decay stays fixed.
lambda_default = 0.5
kvalues = [3, 4, 5, 6, 7, 8, 10, 12]

experiment_results = []

for i, (Xtrain, Xtest, ytrain, ytest) in enumerate(datasets):
    # WK takes no length parameter, so it is evaluated once per run.
    wk_results = WK_SVM(Xtrain, ytrain, Xtest, ytest)
    experiment_results.append(wk_results)

    for k in kvalues:
        ngk_results = NGK_SVM(Xtrain, ytrain, Xtest, ytest, k)
        ssk_results = SSK_SVM(Xtrain, ytrain, Xtest, ytest, k, lambda_default)
        experiment_results += [ngk_results, ssk_results]

# One long table: the per-run, per-kernel frames stacked on top of each other.
results1 = pd.concat(experiment_results)

csv_filename = f"exp1_{n_folds}_iterations__{N_classes}_classes__{n_per_class}_in_each_class.csv"
results1.to_csv(f"../data/results/original_results/{csv_filename}", index=False)

### Experiment 2 - Varying Weight Decay Factor in SSK

For this experiment we're keeping k/n for SSK/NGK at 5 and varying lambda for SSK through the values [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.3, 0.5, 0.7]

In [11]:
# Experiment 2: vary the SSK decay factor while the length stays fixed.
k_default = 5
lambdavalues = [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.3, 0.5, 0.7]

exp2_results = []

for i_exp2, (Xtrainexp2, Xtestexp2, ytrainexp2, ytestexp2) in enumerate(datasets):
    # WK and NGK do not depend on lambda, so each is evaluated once per run.
    wke2_results = WK_SVM(Xtrainexp2, ytrainexp2, Xtestexp2, ytestexp2)
    exp2_results.append(wke2_results)

    ngke2_results = NGK_SVM(Xtrainexp2, ytrainexp2, Xtestexp2, ytestexp2, k_default)
    exp2_results.append(ngke2_results)

    for l in lambdavalues:
        sske2_results = SSK_SVM(
            Xtrainexp2, ytrainexp2, Xtestexp2, ytestexp2, k_default, l
        )
        exp2_results.append(sske2_results)

# One long table: the per-run, per-kernel frames stacked on top of each other.
results2 = pd.concat(exp2_results)

csv2_filename = f"exp2_{n_folds}_iterations__{N_classes}_classes__{n_per_class}_in_each_class.csv"
results2.to_csv(f"../data/results/original_results/{csv2_filename}", index=False)

In [15]:
# Inspect experiment 2: first the raw list of per-run frames, then the concatenated table.
print(exp2_results)
print(results2)

[  Kernel  Class  F1-Score  Precision  Recall
0     WK      0  0.974359   1.000000    0.95
1     WK      1  0.952381   0.909091    1.00
2     WK      2  0.950000   0.950000    0.95
3     WK      3  0.974359   1.000000    0.95,   Kernel  k  Class  F1-Score  Precision  Recall
0    NGK  5      0  0.974359   1.000000    0.95
1    NGK  5      1  0.952381   0.909091    1.00
2    NGK  5      2  0.950000   0.950000    0.95
3    NGK  5      3  0.974359   1.000000    0.95,   Kernel  k  lambda  Class  F1-Score  Precision  Recall
0    SSK  5    0.01      0  0.950000   0.950000    0.95
1    SSK  5    0.01      1  0.517241   0.394737    0.75
2    SSK  5    0.01      2  0.514286   0.600000    0.45
3    SSK  5    0.01      3  0.296296   0.571429    0.20,   Kernel  k  lambda  Class  F1-Score  Precision  Recall
0    SSK  5    0.03      0  0.900000   0.900000    0.90
1    SSK  5    0.03      1  0.565217   0.500000    0.65
2    SSK  5    0.03      2  0.666667   0.750000    0.60
3    SSK  5    0.03      3 

### Experiment 3 - Combining two SSK of Different Substring Lengths

In this experiment we're examining the performance of the SVM classifier using a kernel that is computed as the sum of two SSKs with different lengths k1 and k2.

We're keeping lambda constant for both of those at 0.5 and looking at combinations of lengths in [(3,0),(4,0),(5,0),(6,0),(3,4),(3,5),(3,6)]

In [18]:
def SSK_k_comb_SVM(X_train, y_train, X_test, y_test, k1, k2, lambd):
    """Evaluate an SVM whose kernel is the sum of two SSKs with different lengths.

    Args:
    - X_train, X_test: documents compared pairwise by the kernel
    - y_train, y_test: class labels of the corresponding documents
    - k1, k2: substring lengths of the two summed SSKs
    - lambd: weight decay factor shared by both SSKs

    Returns:
    - pandas DataFrame with one row per class and the columns
      Kernel, k, lambda, Class, F1-Score, Precision, Recall
      (k holds the pair formatted as "(k1, k2)")
    """
    first_kernel = ssk_partial(k1, lambd)
    second_kernel = ssk_partial(k2, lambd)

    def summed_kernel(x, y):
        return first_kernel(x, y) + second_kernel(x, y)

    # Train-vs-train Gram matrix (only the upper triangle is computed) and
    # the test-vs-train matrix needed at prediction time.
    gram_train = similarity_matrix(X_train, X_train, summed_kernel, symmetrical=True)
    gram_test = similarity_matrix(X_test, X_train, summed_kernel)

    # kernel='precomputed': the model is fitted on the Gram matrix itself.
    classifier = SVC(kernel='precomputed').fit(gram_train, y_train)
    predictions = classifier.predict(gram_test)

    # average=None keeps one score per class instead of a single aggregate.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, predictions, average=None
    )

    # 'Class' is the positional index into the per-class score arrays.
    columns = {
        'Kernel': 'SSK k comb',
        'k': f"({k1}, {k2})",
        'lambda': lambd,
        'Class': range(len(precision)),
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall,
    }
    return pd.DataFrame(columns)

In [19]:
# Experiment 3: sum of two SSKs with different substring lengths, shared decay.
lambda_default = 0.5
k1k2s = [(3, 4), (3, 5), (3, 6), (3, 7), (4, 5), (4, 6), (4, 7)]
exp3_results = []

for i3, (Xtrain3, Xtest3, ytrain3, ytest3) in enumerate(datasets):
    for k1, k2 in k1k2s:
        kcomb_results = SSK_k_comb_SVM(
            Xtrain3, ytrain3, Xtest3, ytest3, k1, k2, lambda_default
        )
        exp3_results.append(kcomb_results)

# One long table: the per-run, per-combination frames stacked on top of each other.
results3 = pd.concat(exp3_results)

csv_filename3 = f"exp3_{n_folds}_iterations__{N_classes}_classes__{n_per_class}_in_each_class.csv"
results3.to_csv(f"../data/results/original_results/{csv_filename3}", index=False)

### Experiment 4 - Combining SSK and NGK 

We're examining the performance of the weighted sum of NGK and SSK with same length.

Length of both kernels = 5, lambda for SSK = 0.5, varying the contributions of NGK and SSK in [(1,0), (0,1), (0.5, 0.5), (0.6,0.4), (0.7, 0.3), (0.8, 0.2), (0.9, 0.1)]

In [22]:
from sklearn.preprocessing import normalize

def ngk(x, y, trained_vectoriser):
    """Return the dot product of the L2-normalised vectors of x and y.

    Args:
    - x, y: single documents; each is wrapped in a one-element list for transform
    - trained_vectoriser: already fitted object whose transform(...) result has .toarray()

    Returns:
    - result of np.dot on the two flattened, normalised vectors
    """
    def unit_profile(document):
        # One-document batch -> dense row -> unit L2 norm -> 1-D vector.
        counts = trained_vectoriser.transform([document]).toarray()
        return normalize(counts, norm='l2').flatten()

    return np.dot(unit_profile(x), unit_profile(y))


In [23]:
from sklearn.feature_extraction.text import CountVectorizer

def SSK_NGK_SVM(X_train, y_train, X_test, y_test, wngk, wssk, k, lambd):
    """Evaluate an SVM whose kernel is a weighted sum of NGK and SSK of the same length.

    Args:
    - X_train, X_test: documents compared pairwise by the kernel
    - y_train, y_test: class labels of the corresponding documents
    - wngk, wssk: weights multiplying the NGK and the SSK value respectively
    - k: substring / n-gram length used by both kernels
    - lambd: weight decay factor of the SSK

    Returns:
    - pandas DataFrame with one row per class and the columns
      Kernel, "wngk, wssk", k, lambda, Class, F1-Score, Precision, Recall
    """
    ssk_kernel = ssk_partial(k, lambd)

    if k == 0:
        # No vectorizer is built for length 0; the NGK term is constantly 0.
        def ngram_kernel(x, y):
            return 0
    else:
        # The n-gram vocabulary is fitted on the training documents only.
        char_ngrams = CountVectorizer(analyzer='char', ngram_range=(k, k))
        char_ngrams.fit(X_train)

        def ngram_kernel(x, y):
            return ngk(x, y, char_ngrams)

    def weighted_kernel(x, y):
        return wssk * ssk_kernel(x, y) + wngk * ngram_kernel(x, y)

    # Train-vs-train Gram matrix (only the upper triangle is computed) and
    # the test-vs-train matrix needed at prediction time.
    gram_train = similarity_matrix(X_train, X_train, weighted_kernel, symmetrical=True)
    gram_test = similarity_matrix(X_test, X_train, weighted_kernel)

    # kernel='precomputed': the model is fitted on the Gram matrix itself.
    classifier = SVC(kernel='precomputed').fit(gram_train, y_train)
    predictions = classifier.predict(gram_test)

    # average=None keeps one score per class instead of a single aggregate.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, predictions, average=None
    )

    # 'Class' is the positional index into the per-class score arrays.
    columns = {
        'Kernel': 'SSK NGK comb',
        'wngk, wssk': f"({wngk}, {wssk})",
        'k': k,
        'lambda': lambd,
        'Class': range(len(precision)),
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall,
    }
    return pd.DataFrame(columns)

In [24]:
# Experiment 4: weighted sum of NGK and SSK with the same length and a fixed decay.
k_default = 5
lambda_default = 0.5
wnwss = [(1, 0), (0, 1), (0.5, 0.5), (0.6, 0.4), (0.7, 0.3), (0.8, 0.2), (0.9, 0.1)]
exp4_results = []

for i4, (Xtrain4, Xtest4, ytrain4, ytest4) in enumerate(datasets):
    for wn, ws in wnwss:
        ssk_ngk_comb_results = SSK_NGK_SVM(
            Xtrain4, ytrain4, Xtest4, ytest4, wn, ws, k_default, lambda_default
        )
        exp4_results.append(ssk_ngk_comb_results)

# One long table: the per-run, per-weighting frames stacked on top of each other.
results4 = pd.concat(exp4_results)

csv_filename4 = f"exp4_{n_folds}_iterations__{N_classes}_classes__{n_per_class}_in_each_class.csv"
results4.to_csv(f"../data/{csv_filename4}", index=False)

KeyboardInterrupt: 

### Experiment 5 - Combining two SSK of Different Weight Decay Factors

Here we're using a kernel that is a sum of two SSKs with different lambdas and the same k=5.
The lambda values were varied through [(0.05,0), (0.5,0), (0.05, 0.5)]

In [None]:
def SSK_lambd_comb_SVM(X_train, y_train, X_test, y_test, k, lambd1, lambd2):
    """Evaluate an SVM whose kernel is the sum of two SSKs with different decay factors.

    Args:
    - X_train, X_test: documents compared pairwise by the kernel
    - y_train, y_test: class labels of the corresponding documents
    - k: substring length shared by both SSKs
    - lambd1, lambd2: weight decay factors of the two summed SSKs

    Returns:
    - pandas DataFrame with one row per class and the columns
      Kernel, k, lambdas, Class, F1-Score, Precision, Recall
      (lambdas holds the pair formatted as "lambd1,lambd2")
    """
    first_kernel = ssk_partial(k, lambd1)
    second_kernel = ssk_partial(k, lambd2)

    def summed_kernel(x, y):
        return first_kernel(x, y) + second_kernel(x, y)

    # Train-vs-train Gram matrix (only the upper triangle is computed) and
    # the test-vs-train matrix needed at prediction time.
    gram_train = similarity_matrix(X_train, X_train, summed_kernel, symmetrical=True)
    gram_test = similarity_matrix(X_test, X_train, summed_kernel)

    # kernel='precomputed': the model is fitted on the Gram matrix itself.
    classifier = SVC(kernel='precomputed').fit(gram_train, y_train)
    predictions = classifier.predict(gram_test)

    # average=None keeps one score per class instead of a single aggregate.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, predictions, average=None
    )

    # 'Class' is the positional index into the per-class score arrays.
    columns = {
        'Kernel': 'SSK lambda comb',
        'k': k,
        'lambdas': f'{lambd1},{lambd2}',
        'Class': range(len(precision)),
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall,
    }
    return pd.DataFrame(columns)

In [None]:
# Experiment 5: sum of two SSKs with different decay factors and a shared length.
k_default = 5
l1l2s = [(0.05, 0), (0.5, 0), (0.05, 0.5)]

exp5_results = []

for i, (Xtrain, Xtest, ytrain, ytest) in enumerate(datasets):
    for l1, l2 in l1l2s:
        lcomb_results = SSK_lambd_comb_SVM(
            Xtrain, ytrain, Xtest, ytest, k_default, l1, l2
        )
        exp5_results.append(lcomb_results)

# One long table: the per-run, per-combination frames stacked on top of each other.
results5 = pd.concat(exp5_results)

csv_filename5 = f"exp5_{n_folds}_iterations__{N_classes}_classes__{n_per_class}_in_each_class.csv"
results5.to_csv(f"../data/{csv_filename5}", index=False)

## Improvements

### Scaling the SSK score
- the SSK implementation used previously doesn't normalise the vectors, i.e. it doesn't take document length into account
- this results in SSK values being higher for longer documents, without the documents actually being more similar
- to tackle this we'll make a slight change to the kernel function, dividing the score by the length of the longer document

In [None]:
from main import sskpy
def ssk_scaled(a, b, k, lambd):    #accounts for document length
    """Return the SSK value of a and b divided by the length of the longer document.

    Args:
    - a, b: documents to compare (anything len() and sskpy accept)
    - k: length of the substrings
    - lambd: weight decay factor

    Returns:
    - sskpy(a, b, k, lambd) / max(len(a), len(b)); 0.0 when k or lambd is zero,
      or when both documents are empty
    """
    longest = max(len(a), len(b))
    # k == 0 / lambd == 0 are treated as "no similarity" without calling sskpy,
    # and two empty documents would otherwise divide by zero.
    if k == 0 or lambd == 0 or longest == 0:
        return 0.0
    return sskpy(a, b, k, lambd) / longest

def ssk_scaled_partial(ka, Lambda):
    """Build a two-argument kernel that calls ssk_scaled with k and lambd fixed.

    Args:
    - ka: value forwarded to ssk_scaled as k
    - Lambda: value forwarded to ssk_scaled as lambd

    Returns:
    - function (a, b) -> ssk_scaled(a, b, k=ka, lambd=Lambda)
    """
    def kernel(a, b):
        return ssk_scaled(a, b, k=ka, lambd=Lambda)

    return kernel

Ponovit ćemo eksperimente 1 i 