In [7]:
import numpy as np
import pandas as pd
from main import sskpy

# Smoke test / rough timing check: evaluate the kernel of a long string with itself.
prvi = (
    "This is a very long string, just to test how fast this implementation "
    "oook like ttime, unless you're"
    " running this in a potato pc"
)
drugi = "dfgbs sfhsbgs vsjh"  # second sample string; not used by the call below
a = 0.8  # value passed as the last (decay) argument of sskpy
rez = sskpy(prvi, prvi, 9, a)
print(rez)

175922.15625


# Auxiliary modules

In [6]:
import pandas as pd
import numpy as np

## Auxiliary computing functions

In [14]:
def similarity_matrix(first_dataset, second_dataset, similarity_function, symmetrical=False):
    """
    Calculate the similarity matrix between elements of two datasets using a similarity function.

    Args:
    - first_dataset: List or array-like, the first dataset (one matrix row per element)
    - second_dataset: List or array-like, the second dataset (one matrix column per element)
    - similarity_function: Function, the similarity function that takes two elements as arguments
    - symmetrical: bool, when True only the upper triangle (diagonal included) is evaluated
      and every value is mirrored into the lower triangle. This halves the number of
      similarity_function calls, but is only correct when both datasets hold the same
      elements in the same order and the similarity function is symmetric.

    Returns:
    - similarity_matrix: list of lists with len(first_dataset) rows and
      len(second_dataset) columns; entry [i][j] is
      similarity_function(first_dataset[i], second_dataset[j])

    Raises:
    - ValueError: if symmetrical=True and the two datasets differ in length
    """
    if not symmetrical:
        return [[similarity_function(x, y) for y in second_dataset] for x in first_dataset]

    size = len(first_dataset)
    if len(second_dataset) != size:
        # Without this check a longer second dataset is silently truncated and a
        # shorter one fails with an IndexError in the middle of the computation.
        raise ValueError(
            "symmetrical=True requires datasets of equal length, got "
            f"{size} and {len(second_dataset)}"
        )

    matrix = [[0.0] * size for _ in range(size)]
    for i in range(size):
        # Start at j = i: entries below the diagonal are filled by mirroring.
        for j in range(i, size):
            value = similarity_function(first_dataset[i], second_dataset[j])
            matrix[i][j] = value
            matrix[j][i] = value
    return matrix

In [15]:
from main import sskpy
def ssk(a, b, k, lambd):
    """Return the value of sskpy for inputs a and b.

    Args:
    - a, b: the two inputs forwarded to sskpy
    - k: third argument forwarded to sskpy (substring length)
    - lambd: fourth argument forwarded to sskpy (weight decay factor)

    Returns:
    - whatever sskpy(a, b, k, lambd) returns, unchanged
    """
    kernel_value = sskpy(a, b, k, lambd)
    return kernel_value

def ssk_scaled(a, b, k, lambd):    #accounts for document length
    """Return the sskpy kernel value divided by the length of the longer input.

    Args:
    - a, b: the two inputs to compare; both must support len()
    - k: third argument forwarded to sskpy (substring length)
    - lambd: fourth argument forwarded to sskpy (weight decay factor)

    Returns:
    - sskpy(a, b, k, lambd) / max(len(a), len(b)), or 0.0 when both inputs are empty
    """
    longest = max(len(a), len(b))
    if longest == 0:
        # Two empty documents would otherwise divide by zero; they are treated as
        # having no similarity and the kernel is not evaluated at all.
        return 0.0
    return sskpy(a, b, k, lambd) / longest

In [16]:
def ssk_partial(ka, Lambda):
    """Build a two-argument kernel function from ssk with k and lambd fixed.

    Args:
    - ka: value bound to the k parameter of ssk
    - Lambda: value bound to the lambd parameter of ssk

    Returns:
    - a function f(a, b) that evaluates ssk(a, b, k=ka, lambd=Lambda)
    """
    def kernel(a, b):
        return ssk(a, b, k=ka, lambd=Lambda)

    return kernel

def ssk_scaled_partial(ka, Lambda):
    """Build a two-argument kernel function from ssk_scaled with k and lambd fixed.

    Args:
    - ka: value bound to the k parameter of ssk_scaled
    - Lambda: value bound to the lambd parameter of ssk_scaled

    Returns:
    - a function f(a, b) that evaluates ssk_scaled(a, b, k=ka, lambd=Lambda)
    """
    def scaled_kernel(a, b):
        return ssk_scaled(a, b, k=ka, lambd=Lambda)

    return scaled_kernel

## Kernel runs definition

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support


def WK_SVM(X_train, y_train, X_test, y_test):  #a multi-class classifier
    '''Calculates F1, precision and recall for an SVM classifier using a linear kernel on TF-IDF word features.

    Args:
    - X_train, X_test: iterables of raw text documents
    - y_train, y_test: class labels aligned with X_train / X_test
    Returns:
    - pandas DataFrame with one row per class and columns
      Kernel (always 'WK'), Class, F1-Score, Precision, Recall
    '''
    # The vocabulary and idf weights are fitted on the training documents only.
    vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True, smooth_idf=True, norm='l2',
                                 analyzer='word', stop_words='english')

    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    svm_classifier = SVC(kernel='linear')
    svm_classifier.fit(X_train_tfidf, y_train)

    y_pred = svm_classifier.predict(X_test_tfidf)

    # Pass the label set explicitly so each metric row is tagged with the label it
    # belongs to. Numbering rows 0..n-1 is only right when the labels happen to be
    # exactly 0..n-1 and every one of them occurs in this fold.
    class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, labels=class_labels, average=None)

    results_df = pd.DataFrame({
        'Kernel': 'WK',
        'Class': class_labels,
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall,
    })
    return results_df


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

def NGK_SVM(X_train, y_train, X_test, y_test, k):
    '''Calculates F1, precision and recall for an SVM classifier using a linear kernel on character n-gram counts.

    Args:
    - X_train, X_test: iterables of raw text documents
    - y_train, y_test: class labels aligned with X_train / X_test
    - k: length of the character n-grams (only n-grams of exactly this length are counted)
    Returns:
    - pandas DataFrame with one row per class and columns
      Kernel (always 'NGK'), k, Class, F1-Score, Precision, Recall
    '''
    ngram_range = (k, k)
    vectorizer = CountVectorizer(analyzer='char', ngram_range=ngram_range)

    # Count vectors are scaled to unit l2 norm; the n-gram vocabulary comes from the training set only.
    x_train_ngrams = normalize(vectorizer.fit_transform(X_train), norm='l2')
    x_test_ngrams = normalize(vectorizer.transform(X_test), norm='l2')

    svm_classifier = SVC(kernel='linear')
    svm_classifier.fit(x_train_ngrams, y_train)

    y_pred = svm_classifier.predict(x_test_ngrams)

    # Pass the label set explicitly so each metric row is tagged with the label it
    # belongs to. Numbering rows 0..n-1 is only right when the labels happen to be
    # exactly 0..n-1 and every one of them occurs in this fold.
    class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, labels=class_labels, average=None)

    results_df = pd.DataFrame({
        'Kernel': 'NGK',
        'k': k,
        'Class': class_labels,
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall
    })
    return results_df

In [12]:
def SSK_SCALED_SVM(X_train, y_train, X_test, y_test, k, lambd ):
    '''Calculates F1, precision and recall for an SVM classifier using the length-scaled SSK as a precomputed kernel.

    Args:
    - X_train, X_test: indexable sequences of documents
    - y_train, y_test: class labels aligned with X_train / X_test
    - k: length of the substrings
    - lambd: weight decay factor
    Returns:
    - pandas DataFrame with one row per class and columns
      Kernel (always 'SSK'), k, lambda, Class, F1-Score, Precision, Recall
    '''
    # Gram matrix for training (train x train, filled symmetrically) and the
    # test x train matrix needed for prediction.
    kernel_function = ssk_scaled_partial(k, lambd)
    train_matrix = similarity_matrix(X_train, X_train, kernel_function, symmetrical=True)
    test_matrix = similarity_matrix(X_test, X_train, kernel_function)

    # model - precomputed, trained on the gram matrix
    svm_model = SVC(kernel='precomputed')
    svm_model.fit(train_matrix, y_train)

    # predicting
    y_pred = svm_model.predict(test_matrix)

    # Pass the label set explicitly so each metric row is tagged with the label it
    # belongs to. Numbering rows 0..n-1 is only right when the labels happen to be
    # exactly 0..n-1 and every one of them occurs in this fold.
    class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, labels=class_labels, average=None)

    results_df = pd.DataFrame({
        'Kernel': 'SSK',
        'k': k,
        'lambda': lambd,
        'Class': class_labels,
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall
    })
    return results_df

# Creating datasets
- using 4 classes - 100 examples from each class = total dataset of 400 labeled documents
- splitting it into 5 train/test pairs in 80/20 ratio using StratifiedKFold


In [4]:
# Load the preprocessed corpus, keeping only the label and text columns.
data = pd.read_csv('../data/preprocessed.csv')[['topics', 'body']].copy()

# Replace topic names with integer class ids; topics missing from the mapping become NaN.
topic_mapping = {'earn': 0, 'acq': 1, 'crude': 2, 'grain': 3}
data['topics'] = data['topics'].map(topic_mapping)

N_classes = 4
n_per_class = 100

# First n_per_class rows of each class, in file order.
class0df = data[data['topics'] == 0].head(n_per_class)  # earn
class1df = data[data['topics'] == 1].head(n_per_class)  # acq
class2df = data[data['topics'] == 2].head(n_per_class)  # crude
class3df = data[data['topics'] == 3].head(n_per_class)  # grain

# Only the first N_classes per-class frames go into the final dataset.
final = pd.concat([class0df, class1df, class2df, class3df][:N_classes])
print(len(final))

400


In [8]:
from sklearn.model_selection import StratifiedKFold

# Documents and labels as NumPy arrays so they can be indexed with the fold index arrays.
X_all = np.array(final['body'])
y_all = np.array(final['topics'])

n_folds = 5
stratified_kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# One (X_train, X_test, y_train, y_test) tuple per fold.
datasets = []
for train_index, test_index in stratified_kfold.split(X_all, y_all):
    X_train, y_train = X_all[train_index], y_all[train_index]
    X_test, y_test = X_all[test_index], y_all[test_index]
    datasets.append((X_train, X_test, y_train, y_test))

# datasets[i] = (X_train_i, X_test_i, y_train_i, y_test_i) for fold i = 0, ..., n_folds - 1

## Repeating experiments

## Experiment 1 
- varying k but using the scaled version of the kernel function
- keeping lambda at 0.5

In [17]:
import os

lambda_default = 0.5
kvalues = [3, 4, 5, 6, 7, 8, 10, 12]

# Create the output directory before the long-running loop: a missing directory
# would otherwise only surface in to_csv, after all kernels have been computed.
results_dir = "../data/results/scaled_ssk"
os.makedirs(results_dir, exist_ok=True)

experiment_results = []

for Xtrain, Xtest, ytrain, ytest in datasets:
    # Word-kernel baseline: does not depend on k, so it runs once per fold.
    wk_results = WK_SVM(Xtrain, ytrain, Xtest, ytest)
    experiment_results.append(wk_results)

    for k in kvalues:
        ngk_results = NGK_SVM(Xtrain, ytrain, Xtest, ytest, k)
        ssk_results = SSK_SCALED_SVM(Xtrain, ytrain, Xtest, ytest, k, lambda_default)
        experiment_results.append(ngk_results)
        experiment_results.append(ssk_results)

# Per-fold, per-kernel frames stacked into one table (one row per fold/kernel/class).
results1 = pd.concat(experiment_results)


csv_filename = f"exp1_scaledssk_{n_folds}_iterations__{N_classes}_classes__{n_per_class}_in_each_class.csv"
results1.to_csv(f"{results_dir}/{csv_filename}", index=False)

## Experiment 2 
- varying lambda while keeping k constant at 5

In [None]:
import os

k_default = 5
lambdavalues = [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.3, 0.5, 0.7]

# Create the output directory before the long-running loop: a missing directory
# would otherwise only surface in to_csv, after all kernels have been computed.
results_dir = "../data/results/scaled_ssk"
os.makedirs(results_dir, exist_ok=True)

exp2_results = []

for Xtrainexp2, Xtestexp2, ytrainexp2, ytestexp2 in datasets:
    # Baselines that do not depend on lambda run once per fold.
    wke2_results = WK_SVM(Xtrainexp2, ytrainexp2, Xtestexp2, ytestexp2)
    exp2_results.append(wke2_results)

    ngke2_results = NGK_SVM(Xtrainexp2, ytrainexp2, Xtestexp2, ytestexp2, k_default)
    exp2_results.append(ngke2_results)

    for lambda_value in lambdavalues:
        sske2_results = SSK_SCALED_SVM(Xtrainexp2, ytrainexp2, Xtestexp2, ytestexp2, k_default, lambda_value)
        exp2_results.append(sske2_results)


# Per-fold, per-kernel frames stacked into one table (one row per fold/kernel/class).
results2 = pd.concat(exp2_results)


csv2_filename = f"exp2_scaledssk_{n_folds}_iterations__{N_classes}_classes__{n_per_class}_in_each_class.csv"
results2.to_csv(f"{results_dir}/{csv2_filename}", index=False)