In [None]:
# import libraries
import os
import re
import pandas as pd
import numpy as np
import regex
import pickle
import datetime
from datetime import datetime
import itertools

import nltk             
from nltk.corpus import stopwords
from nltk import word_tokenize

from nltk.metrics import ConfusionMatrix
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# setting directory
# Raw string literal: '\S' is not a recognised escape sequence, and newer
# Python versions emit a warning for unknown escapes in plain string literals.
# The resulting path is exactly the same as before.
os.chdir(r'K:\Specialemappe_XD1')

In [None]:
# Read the manually labelled sample from the working directory.
load_sample = pd.read_excel('labelled_sample.xlsx')
load_sample.columns

# check whether any cell of the sheet is NaN
load_sample.isnull().values.any()

# Remove the instances flagged 'delete' (187 instances not recognized by
# educational institutions or with mixed classes).
rows_to_delete = load_sample.loc[load_sample['instances_to_discard'] == 'delete']
load_sample.drop(rows_to_delete.index, inplace=True)

# keep only the columns needed for modelling
df = load_sample.drop(columns=['Unnamed: 0', 'index', 'instances_to_discard'])
df.columns

# instances per class before re-labelling
df['set_label'].value_counts()

# Replace the classes of interest by integer labels; the dict order matches
# the order in which the replacements are applied.
label_codes = {'dansk': 1, 'idræt': 2, 'matematik': 3, 'temporary': 4, 'other': 0}
for class_name, class_code in label_codes.items():
    df.loc[df['set_label'] == class_name, 'set_label'] = class_code

# every label that is not one of 1-4 is collapsed into class 0
is_target_class = (
    (df['set_label'] == 1)
    | (df['set_label'] == 2)
    | (df['set_label'] == 3)
    | (df['set_label'] == 4)
)
df['set_label'] = np.where(is_target_class, df['set_label'], 0)
df
# class imbalance
df['set_label'].value_counts()

### Feature extraction on non-preprocessed data

In [None]:
# variable containing name activities
content = df['name_activity']

# text composed by the name of the activities from which word features can be obtained
content_text = []

# variable containing Danish stopwords
dansk_stopwords = set(stopwords.words('danish'))

# Punctuation that is replaced by whitespace before tokenizing.
# The apostrophe is escaped explicitly: the previous literal had a bare ''
# in the middle, which Python parses as two adjacent string literals, so the
# apostrophe was silently left out of the character class.
punctuation_pattern = '[-_()"#@;:`.\'!?*´,<>=+^]'

for name in content:
    # lowercase, then substitute punctuation with whitespace for a better tokenization
    cleaned = re.sub(punctuation_pattern, ' ', name.lower())
    # NOTE(review): only the digits 4, 5, 7 and 8 are stripped — confirm this is intended
    cleaned = re.sub('[4578]', ' ', cleaned)

    # tokenize and keep every non-empty token that is not a Danish stopword
    for token in word_tokenize(cleaned, language='danish'):
        if token not in ('', ' ') and token not in dansk_stopwords:
            content_text.append(token)

print('Example of the output:')
print(content_text[100:150])

# frequency distribution of the collected tokens
distribution_used_words = nltk.FreqDist(w for w in content_text)

# feature selector
def document_features(document, word_features):
    """Build a boolean bag-of-words feature dict for one document.

    Args:
        document: iterable of tokens making up the document.
        word_features: iterable of candidate words to test for.

    Returns:
        dict mapping 'contains (<word>)' to True when <word> is one of the
        document's tokens and False otherwise, one entry per candidate word.
    """
    tokens_present = set(document)
    return {
        'contains ({})'.format(candidate): (candidate in tokens_present)
        for candidate in word_features
    }

# word features: the 400 most frequent tokens.
# FreqDist is a Counter subclass, so iterating it yields tokens in first-seen
# order rather than by frequency; most_common() returns the frequency ranking.
most_fq_words = nltk.FreqDist(content_text)
word_features_400 = [word for word, _ in most_fq_words.most_common(400)]

# additional hand-picked features
# Every entry is separated by an explicit comma: three commas were missing,
# which made Python concatenate neighbouring literals into the unintended
# entries 'konferencevej', 'matvvikar' and 'eksskal ikke'.
# NOTE(review): features are compared against whole tokens, so multi-word
# entries such as 'dansk andet sprog' can presumably never match — confirm.
rule_based_features = [
    'matematikkens', 'geogebra', 'ing', 'svø', 'vø', 'svøm',
    'atletik', 'basket', 'kids', 'kidsvolley', 'volley', 'motion',
    'da2', 'dsa', 'andet', 'andetsprog', 'dansk andet sprog',
    '0.', 'børnhave', 'basis', 'basisdansk',
    'fp9', 'pf9', 'eksam',
    'planlægning', 'a0', '16d', '16b', '§16', '§',
    'ffmat', 'ffdan', 'klassemøde', 'læringssamtale', 'fasa', 'konference',
    'vej', 'vejledning', 'læsevejledere', 'matematikvejleder', 'idrv', 'matv',
    'vikar', 'eks',
    'skal ikke', 'kørsel', 'studietur', 'kommunale', 'fælleskommnunal', 'skoleintro', 'praktik']

# frequency-based features followed by the rule-based ones
word_features_all = list(itertools.chain(word_features_400, rule_based_features))

#save features with noise data
# with open('all_features_noise', 'wb') as fp:
#     pickle.dump(word_features_all, fp)

In [None]:
#%% splitting data and datasets building

def train_val_test(text,cutoffs=[0.8,0.9]):
    """Split a sliceable sequence into consecutive train/validation/test parts.

    Args:
        text: any object supporting len() and slicing.
        cutoffs: two fractions of len(text); the first marks the end of the
            training part, the second the end of the validation part.

    Returns:
        Tuple (train, val, test) of consecutive slices of ``text``; the
        order of the elements is left untouched (no shuffling).
    """
    total = len(text)
    train_end = int(total * cutoffs[0])
    val_end = int(total * cutoffs[1])
    return text[:train_end], text[train_end:val_end], text[val_end:]

def _pair_names_with_labels(data_tuple):
    """Turn a (names, labels) pair of Series into a list of [name, label] items."""
    names, labels = data_tuple
    return [[name, label] for name, label in zip(names, labels)]


def _to_feature_set(labelled_names):
    """Tokenize each lowercased name and pair its feature dict with its label."""
    return [
        (document_features(word_tokenize(name.lower(), language='danish'), word_features_all), label)
        for name, label in labelled_names
    ]


# sequential 80/10/10 split of the labelled frame (default cutoffs)
train_content, val_content, test_content = train_val_test(df)
train_content['set_label'].value_counts()   # number of instances >= 655 per class
val_content['set_label'].value_counts()     # number of instances >= 71 per class
test_content['set_label'].value_counts()    # number of instances >= 85 per class

# (activity names, labels) pairs of Series for each split
train_data_tuple = (train_content['name_activity'], train_content['set_label'])
val_data_tuple = (val_content['name_activity'], val_content['set_label'])
test_data_tuple = (test_content['name_activity'], test_content['set_label'])

# One [name, label] item per row. zip() walks both Series in step, replacing
# the three copy-pasted loops that indexed every row with .iloc and a counter.
set_training = _pair_names_with_labels(train_data_tuple)
set_validation = _pair_names_with_labels(val_data_tuple)
set_test = _pair_names_with_labels(test_data_tuple)

print("Training tuple length:", len(set_training))
print("Validation tuple length:", len(set_validation))
print("Test tuple length:", len(set_test))


# document_features applied to each tokenized name, paired with its class
train_set = _to_feature_set(set_training)
val_set = _to_feature_set(set_validation)
test_set = _to_feature_set(set_test)

### Classifier: Naive Bayes on noisy (non-preprocessed) data

In [None]:
# Train NLTK's Naive Bayes classifier on the (feature dict, label) pairs.
clfNB = nltk.NaiveBayesClassifier.train(train_set)

# classes
target_names = ['0','1','2', '3', '4']

# performance on validation set
y_val = [label for _, label in val_set]
pred_NB_val = [clfNB.classify(features) for features, _ in val_set]
f1_NB_val = f1_score(y_val, pred_NB_val, average='macro')
print("F1-Score val_set:", round(f1_NB_val, 4),"\n")
# A bare expression in the middle of a cell is never displayed, so the
# confusion matrix is printed explicitly.
print(confusion_matrix(y_val, pred_NB_val))
# Print classification report
print(classification_report(y_val, pred_NB_val, target_names=target_names))

# performance on test set
y_test = [label for _, label in test_set]
pred_NB_test = [clfNB.classify(features) for features, _ in test_set]
f1_NB_test = f1_score(y_test, pred_NB_test, average='macro')
print("F1-Score test_set:", round(f1_NB_test, 4),"\n")
print(confusion_matrix(y_test, pred_NB_test))
# Print classification report
print(classification_report(y_test, pred_NB_test, target_names=target_names))

## save model
# with open('NB_notcleaned.pickle', 'wb') as f:
#     pickle.dump(clfNB, f)

### Classifier: Logistic regression on noisy (non-preprocessed) data

In [None]:
#weights after grid search
weights_log = {0: 0.44, 1: 0.56, 2: 0.56, 3: 0.56, 4:0.56}

# One-vs-rest, L2-penalised logistic regression wrapped so that it accepts
# NLTK-style (feature dict, label) pairs.
# NOTE(review): recent scikit-learn releases deprecate the multi_class
# argument of LogisticRegression — confirm against the installed version.
clfLOG = SklearnClassifier(LogisticRegression(random_state = 10,
                                               multi_class='ovr',
                                               penalty='l2',
                                               class_weight=weights_log)).train(train_set)


# performance on validation set
y_val = [label for _, label in val_set]
pred_log_val = [clfLOG.classify(features) for features, _ in val_set]
f1_log_val = f1_score(y_val, pred_log_val, average='macro')
print("F1-Score val_set:", round(f1_log_val, 4),"\n")
# A bare expression in the middle of a cell is never displayed, so the
# confusion matrix is printed explicitly.
print(confusion_matrix(y_val, pred_log_val))
print(classification_report(y_val, pred_log_val, target_names=target_names))


# performance on test set
y_test = [label for _, label in test_set]
pred_log_test = [clfLOG.classify(features) for features, _ in test_set]
f1_log_test = f1_score(y_test, pred_log_test, average='macro')
print("F1-Score test_set:", round(f1_log_test, 4),"\n")
print(confusion_matrix(y_test, pred_log_test))
print(classification_report(y_test, pred_log_test, target_names=target_names))


# # save model
# with open('LOG_notcleaned.pickle', 'wb') as f:
#     pickle.dump(clfLOG, f)


### Classifier: SVM on noisy (non-preprocessed) data

In [None]:
# class weights selected by grid search
weights_svm = {0: 0.11,1: 0.89, 2: 0.89,3: 0.89, 4: 0.89}

# Linear SVM (hinge loss, L2 penalty, one-vs-rest) wrapped so that it can be
# trained on NLTK-style (feature dict, label) pairs.
svm_estimator = LinearSVC(
    random_state=10,
    C=1.8,
    penalty='l2',
    loss='hinge',
    multi_class='ovr',
    class_weight=weights_svm,
)
clfSVM = SklearnClassifier(svm_estimator).train(train_set)

# --- validation set ---
y_val = [gold for _, gold in val_set]
pred_svm_val = [clfSVM.classify(featureset) for featureset, _ in val_set]
f1_svm_val = f1_score(y_val, pred_svm_val, average='macro')
print("F1-Score val_set:", round(f1_svm_val, 4),"\n")
# confusion_matrix(y_val, pred_svm_val)
print(classification_report(y_val, pred_svm_val, target_names=target_names))

# --- test set ---
y_test = [gold for _, gold in test_set]
pred_svm_test = [clfSVM.classify(featureset) for featureset, _ in test_set]
f1_svm_test = f1_score(y_test, pred_svm_test, average='macro')
print("F1-Score test_set:", round(f1_svm_test, 4),"\n")
# confusion_matrix(y_test, pred_svm_test)
print(classification_report(y_test, pred_svm_test, target_names=target_names))

# save model
#with open('SVM_notcleaned.pickle', 'wb') as f:
#    pickle.dump(clfSVM, f)