In [None]:
# import libraries

import os
import re
import pandas as pd
import numpy as np
import regex
import pickle
import datetime
from datetime import datetime
import itertools

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# setting directory
# raw string: '\S' is not a valid escape sequence and triggers a warning on recent
# Python versions; the resulting path is exactly the same
os.chdir(r'K:\Specialemappe_XD1')

# load labelled sample
load_sample = pd.read_excel('labelled_sample.xlsx')
load_sample.columns

# identify any NaN values
load_sample.isnull().values.any()

# deleting 187 instances not recognized by educational institutions or with mixed classes
# (rows flagged 'delete' are removed by index, without mutating the frame in place)
discard_mask = load_sample['instances_to_discard'] == 'delete'
load_sample = load_sample.drop(index=load_sample.index[discard_mask])

# drop helper columns that are not used afterwards
df = load_sample.drop(columns=['Unnamed: 0', 'index', 'instances_to_discard'])
df.columns

# count instances per class
df['set_label'].value_counts()

# integer codes for the classes of interest; every label that is not listed here
# (including missing labels) falls into class 0
label_codes = {'dansk': 1, 'idræt': 2, 'matematik': 3, 'temporary': 4, 'other': 0}
df['set_label'] = df['set_label'].map(label_codes).fillna(0).astype(int)

# class imbalance
df['set_label'].value_counts()

# cleaning data
# The pattern is written as ONE double-quoted raw string so that the apostrophe is
# really part of the character class. The former r'...''...' spelling was two adjacent
# string literals, which silently dropped the apostrophe from the set.
char_noise = r"[-_()\"#@;:`.'!?*´.:;,<>=+^Â/]"
digit_noise = r'[0123456789]'

# noise characters become a space, digits are removed; .str.replace leaves missing
# values untouched instead of raising like re.sub would
df['name_activity'] = df['name_activity'].str.replace(char_noise, ' ', regex=True)
df['name_activity'] = df['name_activity'].str.replace(digit_noise, '', regex=True)

# collapse every run of spaces created above into a single space
# (two passes of replace('  ', ' ') still left double spaces for runs of 5+ spaces)
df['name_activity'] = df['name_activity'].str.replace(r' {2,}', ' ', regex=True)


### preparing corpus for gensim implementation

In [None]:
### Create sentence list from labels

# hints from https://stackoverflow.com/q/60852962

list_activities = []

class my_corpus():
    """Re-iterable corpus whose "sentences" are the character 2-grams of each string.

    Attributes:
        list_pseudo_activities: one list per input string holding all of its
            overlapping 2-character slices (empty for strings shorter than 2).
        run: number of complete passes made over the corpus so far.
    """

    def __init__(self, list_activities):
        """Pre-compute the 2-gram list of every string in ``list_activities``."""
        self.run = 0
        self.list_pseudo_activities = [
            [activity[start:start + 2] for start in range(len(activity) - 1)]
            for activity in list_activities
        ]

    def __iter__(self):
        """Announce the pass, yield every 2-gram list, then count the finished pass."""
        started_at = datetime.now().strftime('%H:%M:%S')
        print(f'The data-iteration starts {self.run} at : {started_at}')
        print('------')

        yield from self.list_pseudo_activities
        # only reached once the consumer has exhausted the generator
        self.run += 1

In [None]:
#%% hints from https://www.oreilly.com/library/view/fasttext-quick-start/9781789130997/f74fb462-a846-4569-af56-af9395eb2acf.xhtml
# hints from https://stackoverflow.com/a/54891714

# gensim implements a callback parameter which takes a sequence of subclasses of CallbackAny2Vec

class callback(CallbackAny2Vec):
    """Training callback that prints the time and the reported loss after each epoch."""

    def __init__(self):
        # index of the epoch that is reported next
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the current time and ``model.get_latest_training_loss()``, then advance the counter.

        NOTE(review): the value returned by gensim looks like a running total rather
        than a per-epoch loss — confirm before comparing epochs.
        """
        loss = model.get_latest_training_loss()
        finished_at = datetime.now().strftime('%H:%M:%S')
        print(f'Epoch ends at : {finished_at}')
        print(f'Loss after epoch {self.epoch}: {loss}')
        self.epoch += 1

In [None]:
#%% Embedding with averaging 2-gram: the single instances are split in elements of 2 characters

def _mean_bigram_embedding(corpus_object, model):
    """Map every string to the mean vector of its overlapping character 2-grams.

    Args:
        corpus_object: iterable of strings to embed.
        model: object exposing ``vector_size`` and ``wv[two_characters]``
            (a trained gensim Word2Vec model in this notebook).

    Returns:
        dict mapping each string to a float32 vector of length ``model.vector_size``.
        2-grams for which ``model.wv`` raises ``KeyError`` are skipped. A string
        without any usable 2-gram (shorter than 2 characters, or made of unknown
        2-grams only) is mapped to an all-NaN vector, so that every input string
        keeps an entry and the NaNs can be imputed afterwards.
    """
    size = model.vector_size
    vectors = model.wv
    dict_pseudo_string = dict()

    for string in corpus_object:
        vec_sum = np.zeros(size, dtype='float32')
        count = 0

        for i in range(len(string) - 1):
            try:
                vec_sum += vectors[string[i:i+2]]  # splitting in 2 characters
            except KeyError:
                # 2-gram unknown to the model: leave it out of the average
                continue
            count += 1

        if count:
            dict_pseudo_string[string] = vec_sum / count  # averaging
        else:
            # explicit NaN instead of a 0/0 division (same values, no RuntimeWarning)
            dict_pseudo_string[string] = np.full(size, np.nan, dtype='float32')

    return dict_pseudo_string


def embedding_train(corpus_object, model=None):
    """Embed strings with ``model``; defaults to the global ``w2v_train``."""
    return _mean_bigram_embedding(corpus_object, w2v_train if model is None else model)


def embedding_val(corpus_object, model=None):
    """Embed strings with ``model``; defaults to the global ``w2v_val``."""
    return _mean_bigram_embedding(corpus_object, w2v_val if model is None else model)


def embedding_test(corpus_object, model=None):
    """Embed strings with ``model``; defaults to the global ``w2v_test``."""
    return _mean_bigram_embedding(corpus_object, w2v_test if model is None else model)


def embedding_10samples(corpus_object, model=None):
    """Embed strings with ``model``; defaults to the global ``w2v_10samples``."""
    return _mean_bigram_embedding(corpus_object, w2v_10samples if model is None else model)

### Preparing data and embedding with Word2Vec - Skip-gram

In [None]:
#%% splitting data in train, validation and test set

# function splitting data
def train_val_test(text,cutoffs=[0.8,0.9]):
    """Cut ``text`` into three consecutive slices without shuffling.

    Args:
        text: any object supporting ``len()`` and slicing (list, string, DataFrame rows).
        cutoffs: two fractions of the length; the first ends the train slice,
            the second ends the validation slice.

    Returns:
        Tuple ``(train, val, test)`` of slices, in the original order of ``text``.
    """
    total = len(text)
    train_end = int(total * cutoffs[0])
    val_end = int(total * cutoffs[1])
    return text[:train_end], text[train_end:val_end], text[val_end:]

# ordered 80/10/10 split of the cleaned sample (default cutoffs of train_val_test)
(train_content,
 val_content,
 test_content) = train_val_test(df)

# class counts per split; author's notes on the observed minimum per class are kept
train_content.loc[:, 'set_label'].value_counts()   # >654 per class
val_content.loc[:, 'set_label'].value_counts()     # >71 per class
test_content.loc[:, 'set_label'].value_counts()    # >85 per class

In [None]:
# A single Word2Vec model is fitted on the TRAINING 2-grams only and is then used to
# embed every split. Models trained separately per split each end up with their own,
# unrelated vector space, so a classifier fitted on train embeddings cannot make sense
# of validation/test embeddings that come from a different model.

W2V_PARAMS = dict(vector_size=5, window=5, min_count=1, workers=4, sg=1, compute_loss=True)


def _lowercase_and_dedup(content):
    """Cast every column to lowercase strings and keep the first row per activity name."""
    lowered = content.apply(lambda x: x.astype(str).str.lower())
    return lowered.drop_duplicates(subset=['name_activity'], keep='first')


def _embed_with_model(activities, model):
    """Average the model's vectors of all overlapping 2-grams of each activity name.

    2-grams the model does not know (``KeyError``) are skipped. A name without any
    known 2-gram gets an all-NaN vector, so every name keeps its row and the rows stay
    aligned with the labels; the NaNs are imputed when the feature sets are built.
    """
    embedded = dict()
    for name in activities:
        vec_sum = np.zeros(model.vector_size, dtype='float32')
        count = 0
        for i in range(len(name) - 1):
            try:
                vec_sum += model.wv[name[i:i+2]]
            except KeyError:
                continue
            count += 1
        if count:
            embedded[name] = vec_sum / count
        else:
            embedded[name] = np.full(model.vector_size, np.nan, dtype='float32')
    return embedded


# train data : remove duplicates, 2-gram, embedding, new dataframe train
duplicates_in_train = train_content[train_content.duplicated(['name_activity'])]
train_set = _lowercase_and_dedup(train_content)
train_list_activities = train_set['name_activity'].to_list()
train_strings_ngram = my_corpus(train_list_activities)
# word2vec fitted on the train data
w2v_train = Word2Vec(train_strings_ngram, callbacks=[callback()], **W2V_PARAMS)
# w2v_train.save('models/w2v_train_size20_w5_skipgram.model')

dict_pseudo_strings_w2v_train = _embed_with_model(train_list_activities, w2v_train)


# validation data : remove duplicates, 2-gram, embedding with the train model
duplicates_in_validation = val_content[val_content.duplicated(['name_activity'])]
val_set = _lowercase_and_dedup(val_content)
val_list_activities = val_set['name_activity'].to_list()
val_strings_ngram = my_corpus(val_list_activities)
# name kept so that code reading `w2v_val` keeps working; it is the train model
w2v_val = w2v_train

dict_pseudo_strings_w2v_val = _embed_with_model(val_list_activities, w2v_train)


# test data : remove duplicates, 2-gram, embedding with the train model
duplicates_in_test = test_content[test_content.duplicated(['name_activity'])]
test_set = _lowercase_and_dedup(test_content)
test_list_activities = test_set['name_activity'].to_list()
test_strings_ngram = my_corpus(test_list_activities)
# name kept so that code reading `w2v_test` keeps working; it is the train model
w2v_test = w2v_train

dict_pseudo_strings_w2v_test = _embed_with_model(test_list_activities, w2v_train)

# 10 samples data : remove duplicates, 2-gram, embedding with the train model
# NOTE(review): `df10` is not created by any of the cells above; it has to be defined
# before this cell is run
duplicates_in_10samples = df10[df10.duplicated(['name_activity'])]
df10_set = _lowercase_and_dedup(df10)
df10_set_list_activities = df10_set['name_activity'].to_list()
df10_strings_ngram = my_corpus(df10_set_list_activities)
# name kept so that code reading `w2v_10samples` keeps working; it is the train model
w2v_10samples = w2v_train

dict_pseudo_strings_w2v_10samples = _embed_with_model(df10_set_list_activities, w2v_train)

### Split

In [None]:
#%% preparing train set, validation set and test set

# dataframe of pseudo n-gram and embedding: one row per activity name, one column per vector component
train_pseudo_set = pd.DataFrame.from_dict(dict_pseudo_strings_w2v_train, orient="index")
val_pseudo_set = pd.DataFrame.from_dict(dict_pseudo_strings_w2v_val, orient="index")
test_pseudo_set = pd.DataFrame.from_dict(dict_pseudo_strings_w2v_test, orient="index")
samples_pseudo_set = pd.DataFrame.from_dict(dict_pseudo_strings_w2v_10samples, orient="index")

# NaN values present in all the sets
train_pseudo_set.isnull().sum().sum()
val_pseudo_set.isnull().sum().sum()
test_pseudo_set.isnull().sum().sum()
samples_pseudo_set.isnull().sum().sum()

# Missing embeddings are imputed with the TRAINING column means in every split:
# filling validation/test rows with their own means would let statistics of the
# evaluation data leak into the features the model is scored on.
train_feature_means = train_pseudo_set.mean()

# train set, validation set and test set
x_train = train_pseudo_set.fillna(train_feature_means)
y_train = train_set['set_label']

x_val = val_pseudo_set.fillna(train_feature_means)
y_val = val_set['set_label']

x_test = test_pseudo_set.fillna(train_feature_means)
y_test = test_set['set_label']

# features and labels are matched by position, so each pair must have the same length
assert len(x_train) == len(y_train), "train features and labels are misaligned"
assert len(x_val) == len(y_val), "validation features and labels are misaligned"
assert len(x_test) == len(y_test), "test features and labels are misaligned"


### CLASSIFIER : KNN

In [None]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# # create KNN Object
# `p` is only read by the Minkowski metric; with metric='euclidean' it has no effect,
# so the former `p = 1` is left out to avoid suggesting a Manhattan distance
model_knn = KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='euclidean',
                                 n_jobs=1, n_neighbors=3, weights='distance')


# training model
model_knn.fit(x_train, y_train)

# predict on train set
pred_knn_train = model_knn.predict(x_train)

# predict on validation set
pred_knn_val = model_knn.predict(x_val)

# predict on test set
pred_knn_test = model_knn.predict(x_test)


def _report_macro_f1(split_name, y_true, y_pred, with_report=True):
    """Print the macro-averaged F1 score of one split and return it.

    Args:
        split_name: label used in the printed line ('train', 'val', 'test').
        y_true: true labels of the split.
        y_pred: predicted labels of the split.
        with_report: also print the per-class classification report.

    Returns:
        The macro F1 score (unrounded).
    """
    score = f1_score(y_true, y_pred, average='macro')
    print(f"KNN F1-Score {split_name}_set:", round(score, 4), "\n")
    if with_report:
        print(classification_report(y_true, y_pred))
    return score


# the per-class report is only shown for the held-out splits
f1_knn_train = _report_macro_f1('train', y_train, pred_knn_train, with_report=False)
f1_knn_val = _report_macro_f1('val', y_val, pred_knn_val)
f1_knn_test = _report_macro_f1('test', y_test, pred_knn_test)

In [None]:
# size 10 , window 5
# KNN F1-Score train_set: 0.8595 
# KNN F1-Score val_set: 0.1932 
# KNN F1-Score test_set: 0.1932 

# size 20 , window 5
# KNN F1-Score train_set: 0.8804 
# KNN F1-Score val_set: 0.1825 
# KNN F1-Score test_set: 0.1892 

# size 50, window 5 
# KNN F1-Score train_set: 0.8864 
# KNN F1-Score val_set: 0.1824 
# KNN F1-Score test_set: 0.1913 

# size 10 , window 3
# KNN F1-Score train_set: 0.8577 
# KNN F1-Score val_set: 0.1865 
# KNN F1-Score test_set: 0.2018 

# size 20 , window 3
# KNN F1-Score train_set: 0.8815 
# KNN F1-Score val_set: 0.2234  ##
# KNN F1-Score test_set: 0.1845 

# size 50, window 3
# KNN F1-Score train_set: 0.8867 
# KNN F1-Score val_set: 0.1838 
# KNN F1-Score test_set: 0.2381 


In [None]:
# grid search knn 

# grid search for knn - size 20 , window 3 
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()
n_k = [1,3,5,7,9,12,15,20, 25, 30]
alg = ['ball_tree', 'kd_tree']
weights_modality = ['uniform' , 'distance']
leaf_n = [30,40]
p_n = [1,2,3]
# `p` is only read by the Minkowski metric (p=1 is the Manhattan distance, p=2 the
# Euclidean one). Searching metric='minkowski' over `p` therefore covers both named
# metrics and makes p=3 a real candidate, instead of refitting identical
# 'euclidean'/'manhattan' models once per value of `p`.
# 'mahalanobis' is left out: it needs a covariance matrix passed through
# `metric_params` and is not available for the kd_tree algorithm, so those
# candidates could not be fitted.
metric_type = ['minkowski']
j = [1]

# NOTE(review): `algorithm` and `leaf_size` select how neighbours are looked up, not
# which neighbours are found; they are kept in the grid because the best values are
# printed below, but they are not expected to move the score — confirm
param_grid = dict(n_neighbors = n_k, 
                  algorithm = alg,
                  weights = weights_modality,
                  leaf_size = leaf_n, 
                  p = p_n, 
                  metric = metric_type, 
                  n_jobs = j)
  
# 5-fold cross-validated search on the training set, scored with macro F1
grid = GridSearchCV(knn, param_grid, cv=5, scoring='f1_macro', return_train_score=False,verbose=2)
  
# fitting the model for grid search
grid_search=grid.fit(x_train, y_train)

print(grid_search.best_params_)

store_grid_par = grid_search.best_params_
store_grid_score = grid_search.best_score_

print('Best leaf_size:', grid_search.best_estimator_.get_params()['leaf_size'])
print('Best p:', grid_search.best_estimator_.get_params()['p'])
print('Best n_neighbors:', grid_search.best_estimator_.get_params()['n_neighbors'])