In [None]:
# NLTK manual installation

# pip install nltk==3.6.5

# NLTK had to be downgraded from 3.6.6 to 3.6.5 because of an NLTK bug that occurs
# when '.', ',', '?' or '!' is placed at the beginning of a sentence / string

# in prompt set NLTK in the environment
# set NLTK_DATA=H:\nltk_data

# IMPORT LIBRARIES
import os
import re
import pandas as pd
import numpy as np
import regex
import pickle
import datetime
from datetime import datetime
import itertools

import nltk             
from nltk.corpus import stopwords
from nltk import word_tokenize

from nltk.metrics import ConfusionMatrix
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# setting directory
# Raw string: in a regular literal the sequence '\S' is an invalid escape, which
# Python reports with a warning. The resulting path is identical.
PROJECT_DIR = r'K:\Specialemappe_XD1'
os.chdir(PROJECT_DIR)

# check NLTK´s operativity without the bug encountered with NLTK 3.6.6:
# tokenize a Danish sentence that starts with '.?!' and print the resulting tokens
check_tokenizer = '.?!Den nuværende sætning er en eksempel til at teste tokenizer og at nå en potentielle bug'
check_nltk_operativity = word_tokenize(check_tokenizer, language='danish')
print(check_nltk_operativity)

In [None]:
# Read the hand-labelled sample (path is relative to the working directory).
load_sample = pd.read_excel('labelled_sample.xlsx')
load_sample.columns

# True when at least one cell anywhere in the sample is missing.
load_sample.isna().to_numpy().any()

### Removing instances rejected by the educational institutions themselves

In [None]:
# Remove the rows flagged 'delete' in `instances_to_discard` (187 instances not
# recognized by educational institutions or with mixed classes).
discard_mask = load_sample['instances_to_discard'] == 'delete'

# Keep the removed rows around for inspection, then drop them from the sample in place.
rows_to_delete = load_sample.loc[discard_mask]
load_sample.drop(index=rows_to_delete.index, inplace=True)


In [None]:
# Build the working frame without the bookkeeping columns.
unused_columns = ['Unnamed: 0', 'index', 'instances_to_discard']
df = load_sample.drop(unused_columns, axis=1)
df.columns


### DATA IMBALANCE

In [None]:

# instances per class before re-coding
df['set_label'].value_counts()

# classes of interest and the integer label each one receives
label_codes = {
    'dansk': 1,
    'idræt': 2,
    'matematik': 3,
    'temporary': 4,
    'other': 0,
}
for class_name, class_code in label_codes.items():
    df.loc[df['set_label'] == class_name, 'set_label'] = class_code

# every remaining class (anything that is not 1, 2, 3 or 4) collapses into class 0
is_target_class = df['set_label'].isin([1, 2, 3, 4])
df['set_label'] = np.where(is_target_class, df['set_label'], 0)

# class imbalance after re-coding
df['set_label'].value_counts()

# 0 = 43674, 84.29
# 1 = 888, 1.71
# 2 = 810, 1.56
# 3 = 950, 1.83
# 4 = 5491, 10.59

### PRE-PROCESSING

In [None]:
# '-', '_', '/' and 'Â' are the characters most likely to induce misclassification errors;
# some numbers and '§' are often used by schools.

# NOTE: plain assignment, not a copy. `df_to_clean`, `df_cleaned` and `df` all refer
# to the same DataFrame, so the cleaning below is also visible through `df`.
df_to_clean = df

# Punctuation / symbol noise; every match is replaced by a single space.
# NOTE(review): this pattern was previously written as two adjacent string literals
# (r'...`.' followed by '!?...]'), which Python concatenates. The apostrophe between
# them was therefore never part of the character class. The effective pattern is kept
# unchanged here; add "'" to the class if apostrophes are meant to be stripped as well.
char_noise = r'[-_()\"#@;:`.!?*´.:;,<>=+^Â/]'
# Digits are removed outright (no replacement character).
digit_noise = r'[0123456789]'
# Any run of two or more spaces, whatever its length.
space_runs = r' {2,}'


def _clean_activity_name(text):
    """Return `text` with symbol noise blanked, digits removed and space runs collapsed.

    The space collapse uses a regex because two passes of ``replace('  ', ' ')``
    still leave a double space behind when a run is five or more spaces long.
    """
    text = re.sub(char_noise, ' ', text)
    text = re.sub(digit_noise, '', text)
    return re.sub(space_runs, ' ', text)


df_to_clean['name_activity'] = df_to_clean['name_activity'].apply(_clean_activity_name)

# cleaned df (same object as df_to_clean)
df_cleaned = df_to_clean

In [None]:
## FEATURE EXTRACTION

# cleaned activity names, one string per instance
content_cleaned = df_cleaned['name_activity']

# Danish stopwords, held in a set for fast membership tests
dansk_stopwords = set(stopwords.words('danish'))

# Flat token list over all activity names: each name is lower-cased and tokenized
# with the Danish tokenizer; empty / single-space tokens and stopwords are skipped.
# '§' is not treated specially here and reaches the tokenizer as-is.
content_text_cleaned = [
    token
    for activity in content_cleaned
    for token in word_tokenize(activity.lower(), language='danish')
    if token not in ('', ' ') and token not in dansk_stopwords
]

print('Example of the output:')
print(content_text_cleaned[100:150])

# frequency distribution of the kept tokens
distr_cleaned_words = nltk.FreqDist(content_text_cleaned)

Example of the output:
['gård', 'mellem', 'bygn', 'lundegade', 'klyngetid', 'sammen', 'fie', 'forberede', 'prøvehandlinger', 'netværksmøder', 'naturfag', 'læ', 'vej', 'tovholder', 'bordtennis', 'badmintontræf', 'cb', 'b', 'dansk', 'lat', 'sfo', 'aftenfest', 'sfo', 'kl', 'projektopgaver', 'hjemmeundervisn', 'corona', 'x', 'x', 'f', 'd', 's', 'modul', 'b', 'ik', 'plf', 'workshop', 'årgang', 'klasselærerdag', 'gårdvagt', 'gymnastiksalen', 'overlevering', 'tysk', 'vikar', 'pauser', 'klasse', 'spis', 'gårdvagt', 'kids', 'volley']

In [None]:
# feature selector
def document_features(document, word_features):
    """Build the boolean bag-of-words feature dict for one document.

    document      -- iterable of tokens making up the document
    word_features -- iterable of feature words to look for

    Returns a dict with one 'contains (<word>)' key per feature word, whose
    value is True when that word is among the document's tokens.
    """
    present = set(document)
    return {'contains ({})'.format(term): term in present for term in word_features}

# features
# frequency distribution over every kept token of the cleaned activity names
mostfq_word_cleaned = nltk.FreqDist(content_text_cleaned)

# The 400 most frequent tokens.
# Iterating a FreqDist (which is what `list(fd)[:400]` does) yields its keys in
# first-seen order rather than by frequency, so `most_common` is needed to actually
# select the most frequent words.
N_WORD_FEATURES = 400
word_features_cleaned = [word for word, _ in mostfq_word_cleaned.most_common(N_WORD_FEATURES)]

# additional RULE BASED features
# NOTE(review): 'dansk andet sprog' and 'skal ikke' contain spaces, while the feature
# extractor is fed the output of word_tokenize; presumably these entries can never
# match a single token — confirm whether they should be split up.
rule_features_cleaned = [
    'matematikkens', 'geogebra', 'ing', 'svø', 'vø', 'svøm',
    'atletik', 'basket','kids', 'kidsvolley', 'volley', 'motion',
    'da', 'dsa', 'andet', 'andetsprog','dansk andet sprog',
    'børnhave', 'basis', 'basisdansk',
    'fp','pf', 'eksam',
    'planlægning', '§',
    'ffmat', 'ffdan', 'klassemøde', 'læringssamtale', 'fasa', 'konference',
    'vej', 'vejledning', 'læsevejledere', 'matematikvejleder', 'idrv', 'matv',
    'vikar', 'eks',
    'skal ikke', 'kørsel', 'tur', 'studietur', 'kommunale', 'fælleskommnunal','skoleintro', 'praktik']

# Frequency-based features first, then the rule-based ones. dict.fromkeys removes
# words that appear in both lists while keeping the order; a duplicated word would
# only map to the same 'contains (...)' key twice.
word_feat_cleaned = list(dict.fromkeys(itertools.chain(word_features_cleaned, rule_features_cleaned)))
len(word_feat_cleaned)
#save features cleaned
# with open('all_features', 'wb') as fp:
#     pickle.dump(word_feat_cleaned, fp)


### SPLIT DATA

In [None]:
def train_val_test(text, cutoffs=(0.8, 0.9)):
    """Split a sliceable sequence into consecutive train / validation / test parts.

    No shuffling is done: the parts are taken in the existing order of `text`.

    text    -- any object supporting len() and slicing (list, str, DataFrame, ...)
    cutoffs -- two fractions of len(text); the first marks the end of the training
               part, the second the end of the validation part. Any indexable pair
               (tuple or list) is accepted.

    Returns the tuple (train, val, test).
    """
    # Boundaries are truncated to whole positions, exactly like int() on a float.
    train_end = int(len(text) * cutoffs[0])
    val_end = int(len(text) * cutoffs[1])
    return text[:train_end], text[train_end:val_end], text[val_end:]

train_content, val_content, test_content = train_val_test(df_cleaned)
train_content['set_label'].value_counts()   #  >654 per class
val_content['set_label'].value_counts()     # >71 per class
test_content['set_label'].value_counts()    # >85 per class


def _to_feature_set(pairs):
    """Turn [activity name, label] pairs into (feature dict, label) tuples.

    Each name is lower-cased, tokenized with the Danish tokenizer and passed to
    document_features together with the global feature list word_feat_cleaned.
    """
    return [
        (document_features(word_tokenize(activity.lower(), language='danish'), word_feat_cleaned), label)
        for activity, label in pairs
    ]


# training set: (names, labels) column pair, then one [name, label] entry per row
train_data_tuple = (train_content['name_activity'], train_content['set_label'])
set_training = [[activity, label] for activity, label in zip(*train_data_tuple)]

# validation set, built the same way
val_data_tuple = (val_content['name_activity'], val_content['set_label'])
set_validation = [[activity, label] for activity, label in zip(*val_data_tuple)]

# test set, built the same way
test_data_tuple = (test_content['name_activity'], test_content['set_label'])
set_test = [[activity, label] for activity, label in zip(*test_data_tuple)]

print("Training tuple length:", len(set_training))
print("Validation tuple length:", len(set_validation))
print("Test tuple length:", len(set_test))

# feature dict paired with its class, for every instance of each split
train_set = _to_feature_set(set_training)
val_set = _to_feature_set(set_validation)
test_set = _to_feature_set(set_test)


#### Training tuple length: 41450  

#### Validation tuple length: 5181
#### Test tuple length: 5182

## CLASSIFIER : NAIVE BAYES

In [None]:
# Train NLTK's Naive Bayes classifier on the (feature dict, label) training pairs.
clfNB_cl = nltk.NaiveBayesClassifier.train(train_set)

# classes: integer labels and the display names used in the reports, in the same order
class_labels = [0, 1, 2, 3, 4]
target_names = ['0','1','2', '3', '4']

# performance on validation set
y_val = [label for _, label in val_set]
pred_NBcl_val = [clfNB_cl.classify(features) for features, _ in val_set]
f1_NBcl_val = f1_score(y_val, pred_NBcl_val, average='macro')
print("Naive Bayes Classifier´s F1-Score val_set:", round(f1_NBcl_val, 4),"\n")
# The matrix is printed explicitly: a bare expression in the middle of a cell is
# evaluated and thrown away, so it was never shown.
print(confusion_matrix(y_val, pred_NBcl_val, labels=class_labels))
# Print classification report. `labels` ties each entry of target_names to its class
# even if a class happens to be absent from this split.
print(classification_report(y_val, pred_NBcl_val, labels=class_labels, target_names=target_names))

#performance on test set
y_test = [label for _, label in test_set]
pred_NBcl_test = [clfNB_cl.classify(features) for features, _ in test_set]
f1_NBcl_test = f1_score(y_test, pred_NBcl_test, average='macro')
print("Naive Bayes Classifier´s F1-Score test_set:", round(f1_NBcl_test, 4),"\n")
print(confusion_matrix(y_test, pred_NBcl_test, labels=class_labels))
# Print classification report
print(classification_report(y_test, pred_NBcl_test, labels=class_labels, target_names=target_names))

# # save model
# with open('NB_cleaned.pickle', 'wb') as f:
#     pickle.dump(clfNB_cl, f)

In [None]:
# Print the 100 most informative Naive Bayes features.
clfNB_cl.show_most_informative_features(100)

# show_most_informative_features only prints and returns None, so the features are
# fetched separately to give info_NB_cleaned an actual value.
info_NB_cleaned = clfNB_cl.most_informative_features(100)


## CLASSIFIER : LOGISTIC CLASSIFIER

In [None]:
#weights after grid search
weights_log = {0: 0.44, 1: 0.56, 2: 0.56, 3: 0.56, 4:0.56}

# One-vs-rest, L2-penalised logistic regression wrapped for NLTK-style feature dicts.
# NOTE(review): recent scikit-learn releases deprecate LogisticRegression's
# `multi_class` argument — check against the installed version before upgrading.
clfLOG_cl = SklearnClassifier(LogisticRegression(random_state = 10,
                                               multi_class='ovr',
                                               penalty='l2',
                                               class_weight=weights_log)).train(train_set)

# Integer class labels, in the same order as the display names in target_names.
# Passing them as `labels` ties each name to its class even if a class happens to
# be absent from a split.
class_labels = [0, 1, 2, 3, 4]

# performance on validation set
y_val = [label for _, label in val_set]
pred_logcl_val = [clfLOG_cl.classify(features) for features, _ in val_set]
f1_logcl_val = f1_score(y_val, pred_logcl_val, average='macro')
print("Logistic Classifier´s F1-Score val_set:", round(f1_logcl_val, 4),"\n")
# confusion_matrix(y_val, pred_logcl_val)
print(classification_report(y_val, pred_logcl_val, labels=class_labels, target_names=target_names))


# performance on test set
y_test = [label for _, label in test_set]
pred_logcl_test = [clfLOG_cl.classify(features) for features, _ in test_set]
f1_logcl_test = f1_score(y_test, pred_logcl_test, average='macro')
print("Logistic Classifier´s F1-Score test_set:", round(f1_logcl_test, 4),"\n")
# confusion_matrix(y_test, pred_logcl_test)
print(classification_report(y_test, pred_logcl_test, labels=class_labels, target_names=target_names))

# # save model
# with open('LOG_cleaned.pickle', 'wb') as f:
#     pickle.dump(clfLOG_cl, f)


## CLASSIFIER : SVM

In [None]:
### SVM linear : SVC + loss = 'hinge'

# weights after grid search
svm_weights = {0: 0.11,1: 0.89, 2: 0.89,3: 0.89, 4: 0.89}

# One-vs-rest linear SVM with hinge loss, wrapped for NLTK-style feature dicts.
clfSVM_cl = SklearnClassifier(LinearSVC(random_state=10,
                                        C=1.8,
                                        penalty = 'l2',
                                        loss = 'hinge',
                                        multi_class='ovr',
                                        class_weight=svm_weights)).train(train_set)

# Integer class labels, in the same order as the display names in target_names.
# Passing them as `labels` ties each name to its class even if a class happens to
# be absent from a split.
class_labels = [0, 1, 2, 3, 4]

# performance on validation set
y_val = [label for _, label in val_set]
pred_svm_val_cl = [clfSVM_cl.classify(features) for features, _ in val_set]
f1_svm_val = f1_score(y_val, pred_svm_val_cl, average='macro')
print("SVM linear´s F1-Score val_set:", round(f1_svm_val, 4),"\n")
# confusion_matrix(y_val, pred_svm_val_cl)
print(classification_report(y_val, pred_svm_val_cl, labels=class_labels, target_names=target_names))

# performance on test set
y_test = [label for _, label in test_set]
pred_svm_test_cl = [clfSVM_cl.classify(features) for features, _ in test_set]
f1_svm_test = f1_score(y_test, pred_svm_test_cl, average='macro')
print("SVM linear´s F1-Score test_set:", round(f1_svm_test, 4),"\n")
# confusion_matrix(y_test, pred_svm_test_cl)
print(classification_report(y_test, pred_svm_test_cl, labels=class_labels, target_names=target_names))

# # save model
# with open('SVM_cleaned.pickle', 'wb') as f:
#     pickle.dump(clfSVM_cl, f)

### subjective evaluation on activities occurring more than 500 times

In [None]:
#%% how many activities occurring equal or more than 500
# are within this sample? 25.15%

load_act500 = pd.read_excel('act_occuring_more500.xlsx')
load_act500.columns

df_500 = load_act500.drop(columns=['Unnamed: 0', 'initial_index'])
df_500.columns

# distinct names of the frequent activities
name_500 = list(set(df_500['Aktivitet navn'].to_list()))
print(len(name_500))

# NOTE(review): `df` is the same object as the cleaned frame (it was aliased, not
# copied, before cleaning), so these are cleaned names. Confirm that the names in
# 'Aktivitet navn' are comparable to them.
df_sample = list(set(df['name_activity'].to_list()))

# Membership is tested against sets; the result lists keep the order of name_500.
sample_names = set(df_sample)
intersection_sample = [name for name in name_500 if name in sample_names]
print(len(intersection_sample))

# how many within the training set? 19.82 %
list_train = list(set(train_content['name_activity'].to_list()))
train_names = set(list_train)
intersection_train = [name for name in name_500 if name in train_names]
print(len(intersection_train))

# how many within the misclassification errors made by svm?
# validation set
# Work on a copy so the new columns are not written onto a slice of the cleaned frame.
val_set_SVM = val_content.copy()
# The SVM validation predictions are stored in `pred_svm_val_cl`; the name
# `pred_svm_val` is never defined.
val_set_SVM['Predictions'] = pred_svm_val_cl
# Despite its name, this column is True where the prediction is CORRECT.
val_set_SVM['SVM Errors'] = val_set_SVM['Predictions'] == val_set_SVM['set_label']
SVM_errors_val = val_set_SVM[~val_set_SVM['SVM Errors']]
len(SVM_errors_val)

SVM_val_list = list(set(SVM_errors_val['name_activity'].to_list()))
val_error_names = set(SVM_val_list)
svm_val_errors_intersection = [name for name in name_500 if name in val_error_names]
print(len(svm_val_errors_intersection))

# test set
test_set_SVM = test_content.copy()
# Same here: the SVM test predictions are stored in `pred_svm_test_cl`.
test_set_SVM['Predictions'] = pred_svm_test_cl
# Column name (with its double space) kept exactly as it was; True means correct.
test_set_SVM['SVM  Errors'] = test_set_SVM['Predictions'] == test_set_SVM['set_label']
SVM_errors_test = test_set_SVM[~test_set_SVM['SVM  Errors']]
len(SVM_errors_test)
SVM_test_list = list(set(SVM_errors_test['name_activity'].to_list()))
test_error_names = set(SVM_test_list)
svm_test_errors_intersection = [name for name in name_500 if name in test_error_names]
print(len(svm_test_errors_intersection))