In [None]:
# Before starting the notebook, set the NLTK data directory in the command prompt
# and check that NLTK can find its resources, e.g.:
# set NLTK_DATA=H:\nltk_data

#import libraries
import os
import re
import pandas as pd
import numpy as np
import regex
import datetime
from datetime import datetime

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.metrics import ConfusionMatrix
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Set the working directory. A raw string keeps the backslash of the Windows
# path literal instead of relying on the unrecognised escape sequence `\S`.
os.chdir(r'K:\Specialemappe_XD1')

# Load the labelled sample.
load_sample = pd.read_excel('labelled_sample.xlsx')

# Keep every row that is not flagged 'delete' (instances not recognized by
# educational institutions or with mixed classes).
load_sample = load_sample.loc[load_sample['instances_to_discard'] != 'delete']

# Drop the helper columns that are not used further on.
df = load_sample.drop(columns=['Unnamed: 0', 'index', 'instances_to_discard'])

# Integer codes of the classes of interest. Any label that is not listed here
# (and any missing label) is collapsed into class 0, i.e. 'other'.
label_codes = {'dansk': 1, 'idræt': 2, 'matematik': 3, 'temporary': 4, 'other': 0}
df['set_label'] = df['set_label'].map(label_codes).fillna(0).astype(int)

# Class distribution. Printed explicitly: a bare expression is only displayed
# when it is the last statement of a notebook cell.
# Author's recorded counts: 0 = 43674, 1 = 888, 2 = 810, 3 = 950, 4 = 5491
print(df['set_label'].value_counts())

# Cleaning data: noise characters are replaced by a space.
# NOTE(review): the original literal contained `''` in the middle, which Python
# reads as two adjacent string literals, so the apostrophe was never part of
# the character class. The effective pattern is kept unchanged here; add the
# apostrophe explicitly if it should be stripped as well.
char_noise = r'[-_()\"#@;:`.!?*´.:;,<>=+^Â/]'  # - _ / Â are serious issues
noise_re = re.compile(char_noise)
multi_space_re = re.compile(r' {2,}')


def _clean_name(raw_name):
    """Replace noise characters by spaces, then collapse every run of spaces.

    Collapsing with a regex handles runs of any length; replacing '  ' by ' '
    twice still leaves double spaces behind for runs of five or more.
    """
    return multi_space_re.sub(' ', noise_re.sub(' ', raw_name))


df['name_activity'] = df['name_activity'].apply(_clean_name)


In [None]:
#%% generate word features

# Activity names from which the vocabulary is built.
content = df['name_activity']

dansk_stopwords = set(stopwords.words('danish'))

# Characters / digits replaced by whitespace to help the tokenizer.
# Same character sets as before, written as single literals.
token_noise_re = re.compile('[-_()"#@;:`.!?*´:;,<>=+^]')
digit_noise_re = re.compile('[4578]')

cleaned_content_text = []
for name in content:
    name = token_noise_re.sub(' ', name.lower())              # lowercase + strip noise
    name = digit_noise_re.sub(' ', name)
    tokens = word_tokenize(name, language='danish')           # tokenize activity name
    cleaned_content_text.extend(
        token for token in tokens
        if token not in ('', ' ') and token not in dansk_stopwords)

print('Sample of the output')
print(cleaned_content_text[100:150])

# The 400 most frequent tokens. `most_common` makes the frequency ordering
# explicit instead of depending on how the installed NLTK version iterates
# over a FreqDist.
most_fq_words_cleaned = nltk.FreqDist(cleaned_content_text)
word_features_cleaned = [word for word, _ in most_fq_words_cleaned.most_common(400)]

# Additional hand-picked features.
# Fix: commas were missing after 'fælleskommnunal' and after the second
# 'studietur', so Python silently glued them to the next literal
# ('fælleskommnunalprøve', 'studieturvikar') and the features 'prøve' and
# 'vikar' were lost.
# NOTE(review): features are matched against single tokens downstream, so
# multi-word entries such as 'skal ikke' or 'dansk som andetsprog' presumably
# never match — confirm the intent.
strings_rule = (
    'matematikkens', 'geogebra', 'ing', 'svø', 'vø', 'svøm', 'svømning',
    'atletik', 'basket', 'kids', 'kidsvolley', 'volley', 'motion', 'motionsdag',
    '0', 'bhk', 'børnhave',
    'eks', 'dækkes', 'skal ikke', 'kørsel', 'studietur', 'tur', 'kommunale', 'fælleskommnunal',
    'prøve', 'terminsprøve', 'fp9', 'pf9', 'test', 'eksam',
    '16', '16d', '16b', '§16',
    'skoleintro', 'praktik', 'sfo', 'planlægning',
    'ff', 'fagteam', 'kursus', 'klassemøde', 'møde', 'netværksmøde', 'læringssamtale', 'fase', 'fasa',
    'vej', 'vejleder', 'vejledning', 'læsevejledere', 'matematikvejleder',
    'konference', 'webinar', 'studietur',
    'vikar', 'vikartime', 'vikartimer',
    'da', 'da2', 'som', 'dansk som', 'andet', 'dansk som andetsprog', 'dansk som andet sprog', 'dsa')

additional_features = list(strings_rule)

word_features_cleaned.extend(additional_features)
# Report the final vocabulary size once instead of after every appended item.
print(len(word_features_cleaned))



In [None]:
#splitting data and datasets building

def train_val_test(text, cutoffs=(0.8, 0.9)):
    """Split a sliceable object sequentially into train / validation / test.

    No shuffling is done: the first part becomes the training split, the
    middle part the validation split and the tail the test split.

    Parameters
    ----------
    text : sliceable with ``len`` (list, str, DataFrame, ...)
        Data to split.
    cutoffs : sequence of two floats, default (0.8, 0.9)
        Fractions of ``len(text)`` at which the validation split and the
        test split start. An immutable tuple is used as default so the
        default object cannot be modified between calls.

    Returns
    -------
    tuple
        ``(train, val, test)`` slices of ``text``.
    """
    size = len(text)
    val_start = int(size * cutoffs[0])
    test_start = int(size * cutoffs[1])
    return text[:val_start], text[val_start:test_start], text[test_start:]

train_content, val_content, test_content = train_val_test(df)

# Class distribution of each split. Printed explicitly: a bare expression is
# only displayed when it is the last statement of a notebook cell, so the
# three counts were previously computed and thrown away.
print(train_content['set_label'].value_counts())   # author's note: >775 per class
print(val_content['set_label'].value_counts())     # author's note: >89 per class
print(test_content['set_label'].value_counts())    # author's note: >99 per class


def _pair_up(columns):
    """Turn a (names, labels) pair of Series into a list of [name, label] rows."""
    return [list(row) for row in zip(*columns)]


# Each set is a list of [name_activity, set_label] pairs, one per row of the
# corresponding split, built with one shared helper instead of three
# copy-pasted positional-index loops.
train_data_tuple = (train_content['name_activity'], train_content['set_label'])
set_training = _pair_up(train_data_tuple)

val_data_tuple = (val_content['name_activity'], val_content['set_label'])
set_validation = _pair_up(val_data_tuple)

test_data_tuple = (test_content['name_activity'], test_content['set_label'])
set_test = _pair_up(test_data_tuple)

print("Training tuple length:", len(set_training))
print("Validation tuple length:", len(set_validation))
print("Test tuple length:", len(set_test))

def document_features(document, word_features):
    """Build the 'contains (<word>)' feature dict of one tokenized document.

    Parameters
    ----------
    document : iterable of str
        Tokens of the document.
    word_features : iterable of str
        Vocabulary to test. This argument is now actually used; the function
        used to ignore it and read the global ``word_features_cleaned``.

    Returns
    -------
    dict
        Maps ``'contains (<word>)'`` to True when ``word`` is one of the
        document tokens and to False otherwise, for every word in
        ``word_features``.
    """
    # A set makes each membership test constant-time.
    document_words = set(document)
    return {
        'contains ({})'.format(word): (word in document_words)
        for word in word_features
    }

def _as_feature_set(labelled_names):
    """Featurize every (name, label) pair.

    Each name is lower-cased and tokenized with the Danish tokenizer, then
    passed to ``document_features`` together with ``word_features_cleaned``;
    the resulting feature dict is paired with the unchanged label.
    """
    feature_set = []
    for text, target in labelled_names:
        tokens = word_tokenize(text.lower(), language='danish')
        feature_set.append((document_features(tokens, word_features_cleaned), target))
    return feature_set


# Feature dict + class for every name of the three splits.
train_set = _as_feature_set(set_training)
val_set = _as_feature_set(set_validation)
test_set = _as_feature_set(set_test)

print("Training set length:", len(train_set))
print("Validation set length:", len(val_set))
print("Test set length:", len(test_set))

### Grid search class weights with Logistic classifier

In [None]:
# Candidate class weights: 100 values for class 0 evenly spaced in [0.1, 0.99];
# classes 1-4 all receive the complement 1 - x.
potential_weights = np.linspace(0.1,0.99,100)
grid_parameters = {'class_weight': [{0:x, 1:1.0-x, 2:1.0-x, 3:1.0-x, 4:1.0-x} for x in potential_weights]}

# tracking time
start_time_log = datetime.now()

# Feature dicts and gold labels do not depend on the candidate weights, so
# they are extracted once instead of on every iteration.
x_train = [features for features, _ in train_set]
x_val = [features for features, _ in val_set]
y_train = [label for _, label in train_set]
y_val = [label for _, label in val_set]

# list to store the results: [class weights, macro-F1 train, macro-F1 val]
results_from_weights_log = []
for counting, parameters in enumerate(grid_parameters['class_weight'], start=1):
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases — confirm against the installed version.
    classifier = SklearnClassifier(LogisticRegression(random_state = 10,
                                                   multi_class='ovr',
                                                   penalty='l2',
                                                   class_weight=parameters)).train(train_set)
    # classify_many predicts a whole list in one call instead of one
    # vectorize/predict round-trip per sample.
    pred_clf_train = classifier.classify_many(x_train)
    pred_clf_val = classifier.classify_many(x_val)
    scoring_train = f1_score(y_train, pred_clf_train, average='macro')
    scoring_val = f1_score(y_val, pred_clf_val, average='macro')
    print(counting)
    results_from_weights_log.append([parameters, scoring_train, scoring_val])

# dataframe for a clearer view of the results
results_dataframe_log = pd.DataFrame(results_from_weights_log,
                                     columns = ["LOGweights", "F1-Score train", "F1-Score val"])

# time needed to search class weights (end - start, so the delta is positive
# and no sign flip is needed)
end_time_log = datetime.now()
duration_log = end_time_log - start_time_log
duration_in_s_log = duration_log.total_seconds()
duration_in_h_log = duration_in_s_log //3600
duration_in_m_log = (duration_in_s_log %3600) //60
# minutes are zero-padded so e.g. 1 h 5 min reads 1:05 rather than 1:5
print('The search of the right weights for the five classes needed', '%d:%02d' \
      %(duration_in_h_log,duration_in_m_log), 'in terms of hours')

#### restricted weight search with Logistic classifier

In [None]:
# Restricted search: class-0 weights from 0.31 to 0.65 in steps of 0.01;
# classes 1-4 all receive the complement 1 - x.
restricted_range_weights_= np.array(
    [0.31, 0.32, 0.33, 0.34, 0.35,0.36,0.37,0.38,0.39,0.40,0.41,0.42,0.43,0.44,\
     0.45,0.46,0.47,0.48,0.49,0.50,0.51,0.52,0.53,0.54,0.55,0.56,0.57,0.58,0.59,\
         0.60,0.61,0.62,0.63,0.64,0.65])

grid_parameters2 = {'class_weight': [{0:x, 1:1.0-x, 2:1.0-x, 3:1.0-x, 4:1.0-x} for x in restricted_range_weights_]}

# tracking time
start_time_log2 = datetime.now()

# Feature dicts and gold labels do not depend on the candidate weights, so
# they are extracted once instead of on every iteration.
x_train = [features for features, _ in train_set]
x_val = [features for features, _ in val_set]
y_train = [label for _, label in train_set]
y_val = [label for _, label in val_set]

# list to store the results: [class weights, macro-F1 train, macro-F1 val]
results_from_weights_log2 = []
for counting, parameters in enumerate(grid_parameters2['class_weight'], start=1):
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases — confirm against the installed version.
    classifier = SklearnClassifier(LogisticRegression(random_state = 10,
                                                   multi_class='ovr',
                                                   penalty='l2',
                                                   class_weight=parameters)).train(train_set)
    # classify_many predicts a whole list in one call instead of one
    # vectorize/predict round-trip per sample.
    pred_clf_train = classifier.classify_many(x_train)
    pred_clf_val = classifier.classify_many(x_val)
    scoring_train = f1_score(y_train, pred_clf_train, average='macro')
    scoring_val = f1_score(y_val, pred_clf_val, average='macro')
    print(counting)
    results_from_weights_log2.append([parameters, scoring_train, scoring_val])

# dataframe for a clearer view of the results
results_dataframe_log2= pd.DataFrame(results_from_weights_log2,
                                     columns = ["LOGweights", "F1-Score train", "F1-Score val"])

# time needed to search class weights (end - start, so the delta is positive
# and no sign flip is needed)
end_time_log2 = datetime.now()
duration_log2 = end_time_log2 - start_time_log2
duration_in_s_log2 = duration_log2.total_seconds()
duration_in_h_log2 = duration_in_s_log2 //3600
duration_in_m_log2 = (duration_in_s_log2 %3600) //60
# minutes are zero-padded so e.g. 1 h 5 min reads 1:05 rather than 1:5
print('The search of the right weights for the five classes needed', '%d:%02d' \
      %(duration_in_h_log2,duration_in_m_log2), 'in terms of hours')


### Grid search class weights with SVM classifier

In [None]:
# Initial grid search for the SVM.

# 100 candidate weights for class 0, evenly spaced in [0.1, 0.99];
# classes 1-4 all receive the complement of the class-0 weight.
potential_weights = np.linspace(0.1, 0.99, 100)
grid_parameters = {
    'class_weight': [
        {0: w0, **dict.fromkeys((1, 2, 3, 4), 1.0 - w0)}
        for w0 in potential_weights
    ]
}


In [None]:
# Validation feature dicts and gold labels are the same for every fit:
# extract them once.
x_val = [features for features, _ in val_set]
y_val = [label for _, label in val_set]
rstate = 10
Cs = [0.01,0.1,0.2, 0.3, 0.4,0.5,0.6, 0.7, 0.8, 0.9, 1.0,1.1, 1.2, 1.3, 1.4 ,1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1,2.2,2.3,2.4,2.5]

# Second grid search: class-0 weights 0.10, 0.11, ..., 0.50.
# Generated instead of typed by hand: the hand-written list contained `0,31`
# (a comma instead of a dot, which put the values 0 and 31 into the grid)
# and listed 0.22 twice.
restricted_range_weights_svc = np.arange(10, 51) / 100.0

# Classes 1-4 all receive the complement of the class-0 weight.
grid_parameters_svc = {'class_weight': [{0:x, 1:1.0-x, 2:1.0-x, 3:1.0-x, 4:1.0-x} for x in restricted_range_weights_svc]}


# rows: [class weights, C, macro-F1 on the validation set]
results_search_hinge =[]

# tracking time
start_time_search2 = datetime.now()
counting = 0

for c in Cs:
    # Fix: the loop used to iterate over the undefined name `weights_svc` and
    # pass the undefined name `parameter` as class_weight.
    for weight in grid_parameters_svc['class_weight']:
        counting += 1
        svc_current = SklearnClassifier(LinearSVC(random_state = rstate,
                                                  max_iter= 200,
                                                  penalty = 'l2',
                                                  loss = 'hinge',
                                                  multi_class = 'ovr',
                                                  class_weight = weight,
                                                  C = c)).train(train_set)

        # one batched prediction instead of one call per validation sample
        pred_val = svc_current.classify_many(x_val)
        scoring_val = f1_score(y_val, pred_val, average='macro')
        print(counting)
        results_search_hinge.append([weight, c, scoring_val])

end_time_search2 = datetime.now()
# end - start gives a positive delta, so no sign flip is needed
duration_search2 = end_time_search2 - start_time_search2
duration_s_search2= duration_search2.total_seconds()
# total elapsed minutes; the previous `% 3600` dropped every full hour
duration_in_m_svc2 = duration_s_search2 //60
print('The search of the best parameter C needed,', duration_in_m_svc2, 'minutes')
# dataframe for a clearer view of the results
results_search_svc_hinge= pd.DataFrame(results_search_hinge,columns = ['Weights','C', 'F1-Score val'])


#### Notes about the best class weights according to F1-Macro score in validation set

In [None]:
# =======================NOTES=================================================

# first attempt LOG 
# val : 0.9167
# {0: 0.4236363636363637, 1: 0.5763636363636363, 2: 0.5763636363636363, 3: 0.5763636363636363, 4: 0.5763636363636363}
# {0: 0.4326262626262627, 1: 0.5673737373737373, 2: 0.5673737373737373, 3: 0.5673737373737373, 4: 0.5673737373737373}
# {0: 0.4416161616161617, 1: 0.5583838383838383, 2: 0.5583838383838383, 3: 0.5583838383838383, 4: 0.5583838383838383}

# second attempt LOG 
# val 0.9267
# {0: 0.44, 1: 0.56, 2: 0.56, 3: 0.56, 4: 0.56}

# first attempt SVC 
# val : 0.9367
# {0: 0.11797979797979799, 1: 0.882020202020202, 2: 0.882020202020202, 3: 0.882020202020202, 4: 0.882020202020202}
# {0: 0.12696969696969698, 1: 0.873030303030303, 2: 0.873030303030303, 3: 0.873030303030303, 4: 0.873030303030303}


# second attempt SVC
# val : 0.9467
# best C = 1.8
# {0: 0.11, 1: 0.89, 2: 0.89, 3: 0.89, 4: 0.89}
# {0: 0.11797979797979799, 1: 0.882020202020202, 2: 0.882020202020202, 3: 0.882020202020202, 4: 0.882020202020202}

# =============================================================================