# Importing libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load
import numpy as np
from matplotlib import pyplot as plt
plt.rcParams["font.family"] = "Times New Roman"
plt.style.use('tableau-colorblind10')

# loading training corpus

In [None]:
import os

def load_corpus(path,n,m=0,letters='',ending='.txt',split=False,lower=False):
    """Read consecutively numbered text files from a directory.

    File names are built as ``letters + <six-digit zero-padded number> + ending``
    for every number in ``range(m, n)``. Numbers without a matching file are
    skipped silently.

    Parameters:
        path: directory containing the files.
        n: exclusive upper bound of the file numbers.
        m: inclusive lower bound of the file numbers.
        letters: file-name prefix placed before the number.
        ending: file-name suffix (extension) placed after the number.
        split: falsy to keep each document as one string; otherwise the
            separator passed to ``str.split`` for each document.
        lower: lower-case each document before the optional split.

    Returns:
        A list with one entry per file found, in ascending file-number order.
        Entries are strings, or lists of strings when ``split`` is given.
    """
    documents = []

    for number in range(m, n):
        candidate = os.path.join(path, letters + format(number, '06') + ending)
        if not os.path.exists(candidate):
            continue

        with open(candidate, 'r', encoding='utf-8') as handle:
            text = handle.read()

        if lower:
            text = text.lower()
        documents.append(text.split(split) if split else text)

    return documents

In [None]:
def getLargeCorpus(loadComponents=False,
                   writeFile=False,
                   openCorpus=True):
    """Build, persist and/or read the combined training corpus.

    Parameters:
        loadComponents: when True, assemble the corpus from its three sources
            (NoReC files via ``load_corpus``, the ``Comments`` column of the
            Kristiania evaluation CSV, and the ``text`` column of the
            Talk-of-Norway CSV).
        writeFile: when True, write the freshly assembled corpus to
            ``resources/trainingData/combinedTrainingCorpus.txt``, one document
            per line with embedded newlines replaced by spaces. Requires
            ``loadComponents=True``.
        openCorpus: when True and nothing was assembled in this call, read the
            corpus back from the combined file.

    Returns:
        A list of documents, or ``None`` when every flag is False.

    Raises:
        ValueError: ``writeFile`` was requested without ``loadComponents``.
    """
    combinedPath = 'resources/trainingData/combinedTrainingCorpus.txt'
    corpus = None

    if loadComponents:
        # NoReC review corpus
        norec = load_corpus("resources/norec/data/train/", 700000)

        # Kristiania course evaluations
        kristiania = pd.read_csv('Input/' + 'Evaluation_2020_2021-1' + '.csv',
                                 low_memory=False).Comments.dropna().tolist()

        # Parliament speeches
        stortinget = pd.read_csv('resources/talk-of-norway/' + 'ton_updated.csv').text.tolist()

        corpus = norec + stortinget + kristiania

    if writeFile:
        # Previously unreachable after loading (early return) and a NameError
        # without loading; now it is an explicit, ordered step.
        if corpus is None:
            raise ValueError('writeFile=True requires loadComponents=True')
        with open(combinedPath, 'w') as file:
            for row in corpus:
                file.write(row.replace('\n', ' '))
                file.write('\n')

    if corpus is None and openCorpus:
        with open(combinedPath, 'r') as file:
            corpus = file.read().split('\n')

    return corpus
# With loadComponents and writeFile left at False and openCorpus=False, none of
# the branches in getLargeCorpus runs, so this binds `corpus` to None.
corpus = getLargeCorpus(openCorpus=False)

In [None]:
class LemmaUpdater:
    """Accumulates a word-form -> lemma dictionary from annotated rows."""

    def __init__(self):
        # Maps a surface form (column 1 of a row) to its lemma (column 2).
        self.lemmaReg = {}

    def update(self, speech):
        """Add the form/lemma pairs found in ``speech`` to the registry.

        ``speech`` is an iterable of rows. Only rows that are plain lists with
        more than three fields are considered. A pair is stored when the form
        differs from the lemma, the form is alphanumeric and the form has not
        been registered before (the first lemma seen wins).
        """
        registry = self.lemmaReg
        for row in speech:
            if type(row) is not list or len(row) <= 3:
                continue
            form, lemma = row[1], row[2]
            if form == lemma:
                continue
            if form in registry or not form.isalnum():
                continue
            registry[form] = lemma

    def getLemmas(self):
        """Return the registry dictionary itself (not a copy)."""
        return self.lemmaReg

In [None]:
#create combined preprocessed training corpus including lemmatization
def getLargeLemmatizer(getLemmas=False,
                       writeToDisk=False,
                       importLemmas=True):
    """Learn, persist and/or load the word-form -> lemma dictionary.

    Parameters:
        getLemmas: when True, learn the dictionary from the annotated
            Talk-of-Norway ``tale######.tsv`` files, read in batches through
            ``load_corpus`` and fed to a ``LemmaUpdater``.
        writeToDisk: when True, write the learned dictionary to
            ``resources/fittedModels/trainedLemmatizer.csv``. Requires
            ``getLemmas=True``.
        importLemmas: when True, read the dictionary from that CSV file (this
            replaces whatever was learned in the same call).

    Returns:
        ``dict`` mapping word form to lemma, or ``None`` if every flag is False.

    Raises:
        ValueError: ``writeToDisk`` was requested without ``getLemmas``.
    """
    lemmaPath = 'resources/fittedModels/trainedLemmatizer.csv'
    lemmaReg = None

    if getLemmas:
        from datetime import datetime
        from tqdm import tqdm

        updater = LemmaUpdater()
        path = 'resources/talk-of-norway/annotations/'
        totalNumber = 251000
        batchSize = 10000
        # Ceiling division so the final, partial batch is processed as well.
        nBatches = (totalNumber + batchSize - 1) // batchSize
        main_bar = tqdm(total=nBatches, desc='Learning lemmatizations', leave=True)

        for batch in range(nBatches):
            # Window [first, last): the old code used [(i-1)*size, i*size),
            # which made batch 0 empty and never reached the last file ids.
            first = batch * batchSize
            last = min(first + batchSize, totalNumber)
            print('loading batch '+str(batch + 1)+' of '+str(nBatches))
            tonAnnot = load_corpus(path,
                                   last,
                                   first,
                                   'tale',
                                   '.tsv',
                                   '\n',
                                   lower=True)

            batch_beg = datetime.now()
            # tqdm wraps the iterable, so the bar advances one step per speech.
            for speech in tqdm(tonAnnot, desc="Processing batch", leave=True):
                updater.update([item.split('\t') for item in speech])
            batch_end = datetime.now()

            lemmaReg = updater.getLemmas()
            print('spent:',batch_end-batch_beg)
            print('n keys:',len(lemmaReg.keys()))
            print('n lemmas:',len(set(lemmaReg.values())))
            main_bar.update(1)
        main_bar.close()

    if writeToDisk:
        if lemmaReg is None:
            raise ValueError('writeToDisk=True requires getLemmas=True')
        print('writing to disk')
        filename = ('trainedLemmatizer'
                    '.csv')
        # Uses the module-level pandas import; a function-local import further
        # down used to make `pd` an unbound local at this point.
        pd.DataFrame(lemmaReg,index=['lemmatized']).T.reset_index().to_csv('resources/fittedModels/'+filename,index=False)
        print('done, and written to disk as '+filename)

    if importLemmas:
        # Parse the CSV once and pair the two columns.
        table = pd.read_csv(lemmaPath)
        lemmaReg = dict(zip(table['index'], table['lemmatized']))

    return lemmaReg

# Default flags: nothing is learned or written; the lemma dictionary is read
# from resources/fittedModels/trainedLemmatizer.csv.
lemmaReg = getLargeLemmatizer()

In [None]:
from tabulate import tabulate
print(tabulate(
    pd.DataFrame(
        lemmaReg,index=['Lemmatized']).T.reset_index().rename(
        columns={'index':'Word'}).set_index('Word').iloc[:10,:],tablefmt='latex'))

# Preprocessing training corpus

In [None]:
#preprocessing
#creating functions for preprocessing, tailoring them to work 
#with norwegian language and this dataset in particular


def preprocessFunction(corpusToProcess):
    """Clean, tokenize, stopword-filter and lemmatize every document.

    The lemma dictionary is loaded through ``getLargeLemmatizer()`` and the
    stopwords from ``resources/fittedModels/stopwords_from_corpus.txt``.

    Parameters:
        corpusToProcess: sized iterable of documents (e.g. a list or a pandas
            Series). String entries are processed; anything else is passed
            through unchanged.

    Returns:
        A list with one entry per input document: the processed tokens joined
        by single spaces, or the original object for non-string entries.
    """
    import re
    from datetime import datetime
    from tqdm import tqdm

    lemmaReg = getLargeLemmatizer()

    with open('resources/fittedModels/stopwords_from_corpus.txt','r') as stopfile:
        # A set gives O(1) membership tests; the list was scanned per token.
        stops = set(stopfile.read().split('\n'))

    # Compiled once instead of once per word.
    removeChars = re.compile('[^A-Za-z0-9.æäöøåÆØÅÄÖ]')
    trailingPunctuation = ('.', ',', '!', '?')

    def stopTester(w):
        """Split trailing punctuation (.,!? in any combination) off a word.

        Returns ``[word, punctuation]``. The punctuation characters come out in
        reverse order because they are popped from the end; this is kept as-is
        so the output matches previously preprocessed data.
        """
        chars = list(w)
        stripped = []
        while chars and chars[-1] in trailingPunctuation:
            stripped.append(chars.pop())
        return [''.join(chars), ''.join(stripped)]

    def tokenizer(doc):
        """Lower-case a document, split on whitespace and drop special characters.

        Words ending in a non-alphanumeric character contribute two tokens: the
        cleaned word and its trailing punctuation (possibly an empty string).
        """
        tokens = []
        for word in doc.lower().split():
            if word[-1:].isalnum():
                tokens.append(removeChars.sub('', word))
            else:
                stem, punctuation = stopTester(word)
                tokens.append(removeChars.sub('', stem))
                tokens.append(punctuation)
        return tokens

    def preprocess(doc, stops, lemmaReg):
        """Return the preprocessed form of one document.

        Empty tokens and stopwords are dropped; remaining tokens are replaced
        by their lemma when one is registered.
        """
        if not isinstance(doc, str):
            return doc
        return ' '.join(str(lemmaReg.get(word, word))
                        for word in tokenizer(doc)
                        if word and word not in stops)

    t0 = datetime.now()
    # Wrapping the iterable lets tqdm advance by one per document; the former
    # manual update(i) calls added the absolute index each time.
    corpus_preprocessed = [
        preprocess(doc, stops, lemmaReg)
        for doc in tqdm(corpusToProcess,
                        total=len(corpusToProcess),
                        desc='cleaning, tokenizing, removing stopwords, lemmatizing',
                        leave=True)
    ]
    t1 = datetime.now()
    print('done, samples:',len(corpusToProcess))
    print('time:',t1-t0)
    return corpus_preprocessed

In [None]:
def getPreprocessedCorpus(preprocessCorpus=False,
                          writePreprocessedCorpus=False,
                          readPreprocessedCorpus=True):
    """Create, persist and/or read the preprocessed training corpus.

    Parameters:
        preprocessCorpus: when True, run ``preprocessFunction`` on the
            module-level ``corpus``.
        writePreprocessedCorpus: when True (and ``preprocessCorpus`` is True),
            write the preprocessed documents, one per line, to
            ``resources/trainingData/combinedTrainingCorpus_preprocessed.txt``.
        readPreprocessedCorpus: when True, read the corpus from that file (this
            replaces whatever was computed in the same call).

    Returns:
        List of preprocessed documents, or ``None`` when nothing was produced.
    """
    preprocessedPath = 'resources/trainingData/combinedTrainingCorpus_preprocessed.txt'
    # A separate local name keeps `corpus` below referring to the module-level
    # variable; assigning to `corpus` here used to shadow it and raise
    # UnboundLocalError on the preprocessFunction call.
    result = None

    if preprocessCorpus:
        result = preprocessFunction(corpus)
        if writePreprocessedCorpus:
            print('writing to disk')
            with open(preprocessedPath,'w') as file:
                # Write the preprocessed rows (the raw corpus was written before).
                for row in result:
                    file.write(row)
                    file.write('\n')
            print('done')

    if readPreprocessedCorpus:
        with open(preprocessedPath,'r') as file:
            result = file.read().split('\n')
            print('read preprocessed corpus for training from file')
            print('corpus length',len(result))

    return result

# Default flags: no preprocessing is run; the preprocessed corpus is read from
# resources/trainingData/combinedTrainingCorpus_preprocessed.txt.
corpus = getPreprocessedCorpus()

# creating models (tfidf)

### Learning a large Norwegian vocabulary

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load
import numpy as np
from scipy.sparse import csr_matrix, lil_array, hstack, vstack

In [None]:
def getLargeVocabulary(learnVocabulary=True,
                  min_df = 1,
                 corpus=corpus,
                 lemmas=lemmaReg):
    """Learn a vocabulary from ``corpus`` and return it in lemmatized form.

    Parameters:
        learnVocabulary: when False nothing is done and ``None`` is returned.
        min_df: forwarded to ``TfidfVectorizer`` as its ``min_df`` setting.
        corpus: iterable of (preprocessed) documents; defaults to the
            module-level ``corpus`` as it was when this function was defined.
        lemmas: word-form -> lemma dictionary applied to every learned term.

    Returns:
        A set of purely alphabetic terms, each replaced by its lemma when one
        is registered; ``None`` if ``learnVocabulary`` is False.
    """
    if not learnVocabulary:
        return None

    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(min_df=min_df)
    # Only the learned vocabulary is needed, so fit() replaces fit_transform():
    # the tf-idf transform and the float32 copy were computed and then deleted.
    vectorizer.fit(corpus)
    print('learnt the vocabulary from the docs')
    del corpus
    return {lemmas.get(word, word)
            for word in vectorizer.get_feature_names_out().tolist()
            if word.isalpha()}
# Learn the vocabulary from the preprocessed corpus. A float min_df is
# interpreted by scikit-learn as a minimum document-frequency proportion.
vocabularyLearned = getLargeVocabulary(learnVocabulary=True, min_df=0.0005)

In [None]:
len(vocabularyLearned)

### Supplementing with **all** words (lemmatized form) except stopwords from the Kristiania data

In [None]:
# Vocabulary size used to locate the stored vocabulary file (passed as lenStr
# to createCombinedVocab below).
vocabSize = 16000
# Number of SVD components used for the reduced tf-idf representation.
nComps = 300 # 300

In [None]:
def createCombinedVocab(createCombinedVocabulary = False,
                        writeCombinedVocabulary = False,
                        importVocabulary = True,
                        vocabularyLearned=vocabularyLearned,
                       lenStr=16000):
    """Create and/or load the total vocabulary and its term -> index mapping.

    Parameters:
        createCombinedVocabulary: when True, union ``vocabularyLearned`` with
            the lemmatized alphabetic words read from
            ``resources/trainingData/vocabulary_from_corpus.txt``.
        writeCombinedVocabulary: when True (and a vocabulary was created),
            write it to ``resources/fittedModels/vocabularyTotal_<size>.txt``.
        importVocabulary: when True, read the vocabulary from
            ``resources/fittedModels/vocabularyTotal_<lenStr>.txt`` (this
            replaces a vocabulary created in the same call).
        vocabularyLearned: set of learned terms; defaults to the module-level
            value at definition time.
        lenStr: size used in the file name when importing; replaced by the
            actual size when a vocabulary is created.

    Returns:
        ``(vocabulary, vocabularyIdx)``: the list of terms and a dictionary
        mapping each term to its position in that list.

    Raises:
        ValueError: neither creation nor import was requested.
    """
    vocabulary = None

    if createCombinedVocabulary:
        with open('resources/trainingData/vocabulary_from_corpus.txt','r') as file:
            vocabularyCorpus = file.read().split('\n')
        vocabularyCorpus = {lemmaReg.get(word,word) for word in vocabularyCorpus if word.isalpha()}
        # Sorted so the term indices are reproducible between runs; iterating a
        # set of strings has no stable order. key=str tolerates non-string
        # lemma values coming out of the lemma table.
        vocabulary = sorted(vocabularyLearned | vocabularyCorpus, key=str)
        del vocabularyLearned, vocabularyCorpus
        lenStr = len(vocabulary)
        print('combined vocabulary, length',lenStr)
        if writeCombinedVocabulary:
            # Same file-name pattern as the import branch; the old name ended
            # in '_.txt', so a written vocabulary could never be read back.
            with open('resources/fittedModels/vocabularyTotal_'+str(lenStr)+'.txt','w') as file:
                file.write('\n'.join(vocabulary))
            print('wrote combined vocabulary to disk')

    if importVocabulary:
        with open('resources/fittedModels/vocabularyTotal_'+
                  str(lenStr)+
                  '.txt','r') as file:
            vocabulary = file.read().split('\n')
            print('read the total vocabulary from file')

    if vocabulary is None:
        raise ValueError('set createCombinedVocabulary or importVocabulary to obtain a vocabulary')

    vocabularyIdx = {v:k for k,v in enumerate(vocabulary)}

    print(str(len(vocabulary)))

    return vocabulary, vocabularyIdx

# Import-only call: the vocabulary is read from
# resources/fittedModels/vocabularyTotal_<vocabSize>.txt and indexed by position.
vocabulary,vocabularyIdx = createCombinedVocab(createCombinedVocabulary = False,
                        writeCombinedVocabulary = False,
                        importVocabulary = True,
                         lenStr=vocabSize)

In [None]:
def plotCumVar(svd,
               modelType = 'Tfidf',
              saveFig=True):
    """Plot the cumulative explained variance of a fitted SVD.

    Parameters:
        svd: fitted decomposition exposing ``n_features_in_``,
            ``n_components`` and ``explained_variance_ratio_``.
        modelType: label embedded in the output file name.
        saveFig: when True, save the figure under ``results and diagrams/`` as
            ``CumVar_svd<n_components>_<modelType>_<n_features>.png``.
    """
    import numpy as np

    lenStr = svd.n_features_in_
    n_components = svd.n_components
    filename=('CumVar_svd'
                +str(n_components)
                +'_'
              +modelType
              +'_'
                +str(lenStr)
                +'.png')

    cumulative_variance = np.cumsum(svd.explained_variance_ratio_)
    fig, cumVar = plt.subplots(1,1,figsize=(5,3.5),dpi=240)
    # The k-th value is the variance explained by the first k components, so
    # the x axis runs from 1 (it used to start at 0, shifting the curve by one).
    cumVar.plot(range(1, len(cumulative_variance) + 1), cumulative_variance)
    cumVar.set_xlabel('Number of Components')
    cumVar.set_ylabel('Cumulative Explained Variance')
    cumVar.set_title('Cumulative Variance Explained')
    cumVar.set_ylim(0,1)
    cumVar.grid(True)
    if saveFig:
        plt.savefig('results and diagrams/'+filename)
        print('saved figure as '+filename)
    plt.show()

In [None]:
def getVectorizerSVDmatrix(trainSVD=False,
                           writeSVD=False,
                           fitVectorizer=False,
                           writeVectorizer=False,
                           readVectorizer=True,
                           writeTfidf=False,
                           readTfidf=False,
                           readSVD=True,
                          vocabularyIdx=vocabularyIdx,
                          corpus=corpus,
                           comps=nComps,
                           lenStr=vocabSize
                          ):
    """Fit, store and/or load the tf-idf vectorizer and its truncated SVD.

    The steps run in a fixed order, each guarded by its flag: fit vectorizer,
    write vectorizer, read vectorizer, write tf-idf matrix, read tf-idf
    matrix, train SVD, write SVD, read SVD. A read step replaces the object
    produced by an earlier step.

    Parameters:
        trainSVD: fit a ``TruncatedSVD`` with ``comps`` components on the
            tf-idf matrix (needs ``fitVectorizer`` or ``readTfidf``).
        writeSVD: dump the fitted SVD with joblib (needs ``trainSVD``).
        fitVectorizer: fit a ``TfidfVectorizer`` with the fixed vocabulary
            ``vocabularyIdx`` on ``corpus``.
        writeVectorizer: dump the fitted vectorizer (needs ``fitVectorizer``).
        readVectorizer: load the vectorizer from disk.
        writeTfidf: dump the tf-idf matrix (needs ``fitVectorizer`` or
            ``readTfidf``).
        readTfidf: load the tf-idf matrix from disk.
        readSVD: load the fitted SVD from disk.
        vocabularyIdx: term -> column index mapping.
        corpus: documents used to fit the vectorizer.
        comps: number of SVD components; also part of the SVD file name.
        lenStr: accepted for backward compatibility but ignored. File names
            always use ``len(vocabularyIdx)``.

    Returns:
        ``(vectorizer, svd_tfidf)``; an element is ``None`` if no step
        produced it.

    Raises:
        ValueError: a write or train step was requested without the step
            that produces its input.
    """
    # The size embedded in every file name is derived from the vocabulary,
    # overriding the lenStr argument (existing artefacts are named this way).
    lenStr = len(vocabularyIdx)
    vectorizerPath = ('resources/fittedModels/fittedTfidfVectorizer_'
                      +str(lenStr)
                      +'.joblib')
    tfidfPath = ('resources/trainingData/tfidf_matrix_training_data_'
                 +str(lenStr)
                 +'.joblib')
    svdPath = ('resources/fittedModels/fittedSVD_'
               +str(lenStr)
               +'_'
               +str(comps)
               +'.joblib')

    vectorizer = None
    tfidf_matrix = None
    svd_tfidf = None

    if fitVectorizer:
        print('fitting tfidf')
        vectorizer = TfidfVectorizer(vocabulary=vocabularyIdx)
        tfidf_matrix = vectorizer.fit_transform(corpus).astype(np.float32)

    if writeVectorizer:
        if vectorizer is None:
            raise ValueError('writeVectorizer=True requires fitVectorizer=True')
        dump(vectorizer, vectorizerPath)
        print('stored trained TfidfVectorizer to disk')

    if readVectorizer:
        vectorizer = load(vectorizerPath)
        print('read trained TfidfVectorizer from disk')

    if writeTfidf:
        # Previously a one-element placeholder array was dumped when no matrix
        # had been computed.
        if tfidf_matrix is None:
            raise ValueError('writeTfidf=True requires fitVectorizer=True')
        dump(tfidf_matrix, tfidfPath)
        print('stored Tfidf matrix to disk')

    if readTfidf:
        tfidf_matrix = load(tfidfPath)
        print('read Tfidf matrix from disk')

    if trainSVD:
        if tfidf_matrix is None:
            raise ValueError('trainSVD=True requires fitVectorizer=True or readTfidf=True')
        from sklearn.decomposition import TruncatedSVD
        print('fitting svd')
        svd_tfidf = TruncatedSVD(n_components=comps)
        tfidf_reduced = svd_tfidf.fit_transform(tfidf_matrix)
        # Drop the local reference to the large matrix as soon as possible.
        del tfidf_matrix
        print('fitted svd to reduce Tfidf Matrix')
        print(tfidf_reduced.shape)
        print(svd_tfidf.explained_variance_ratio_.sum())

    if writeSVD:
        if svd_tfidf is None:
            raise ValueError('writeSVD=True requires trainSVD=True')
        dump(svd_tfidf, svdPath)
        print('stored fitted svd to reduce Tfidf Matrix to disk')

    if readSVD:
        svd_tfidf = load(svdPath)
        print('read fitted svd to reduce Tfidf Matrix from disk')

    return vectorizer, svd_tfidf


# Full retraining run: fit the vectorizer on the corpus, fit the SVD on the
# resulting tf-idf matrix, and store both models; nothing is read from disk and
# the tf-idf matrix itself is not stored.
vectorizer, svd_tfidf = getVectorizerSVDmatrix(
                            trainSVD=True,
                           writeSVD=True,
                           fitVectorizer=True,
                           writeVectorizer=True,
                           readVectorizer=False,
                           writeTfidf=False,
                           readTfidf=False,
                           readSVD=False,
                          vocabularyIdx=vocabularyIdx,
                          corpus=corpus,
                           comps=nComps,
                           lenStr=vocabSize) 

In [None]:
svd_tfidf.n_features_in_

In [None]:
plotCumVar(svd = svd_tfidf,
           modelType='tfidf')

# Specific training

In [None]:
import pandas as pd
# Load the hand-labelled sentences (semicolon-separated), rename the unnamed
# index column and the sentiment column, and drop rows without a raw sentence.
sentencesLabeled = pd.read_csv(
    'resources/trainingData/first_semester_training_data_sentimentTopicLabeled_raw.csv',
                                    delimiter=';',
                        ).rename(
    columns={'Unnamed: 0':'sentIdx',
            #'sentence':'sentencePreprocessed',
            'label':'sentimentLabel'}).dropna(subset = ['sentenceRaw'])
# Sanity check: frame shape and the second column of the last row.
print(sentencesLabeled.shape, sentencesLabeled.iloc[-1,1])

# Apply the same preprocessing as for the training corpus to every raw sentence.

sentencesLabeled['sentencePreprocessed'] = preprocessFunction(sentencesLabeled.sentenceRaw)

def labelCorrect(l):
    """Expand a short sentiment code to its full label name.

    ``'pos'``, ``'neut'`` and ``'neg'`` (case-insensitive, surrounding
    whitespace ignored) map to ``'Positive'``, ``'Neutral'`` and
    ``'Negative'``. Unknown codes and non-string values such as NaN from an
    empty cell return ``None`` instead of raising.
    """
    if not isinstance(l, str):
        return None
    fullNames = {'pos': 'Positive', 'neut': 'Neutral', 'neg': 'Negative'}
    return fullNames.get(l.strip().lower())

# Expand the short sentiment codes to full label names.
sentencesLabeled.sentimentLabel = [labelCorrect(l) for l in sentencesLabeled.sentimentLabel]

# Turn both label columns into lists; cells that are not strings become [''].
sentencesLabeled['topicLabel'] = [item.split(', ') if type(item)==str else [''] for item in sentencesLabeled.topicLabel]
sentencesLabeled['sentimentLabel'] = [item.split(', ') if type(item)==str else [''] for item in sentencesLabeled.sentimentLabel]
# 'Neutral' is treated as "no sentiment label".
sentencesLabeled['sentimentLabel'] = [[''] if item==['Neutral'] else item for item in sentencesLabeled.sentimentLabel]

# 80/20 split by row position. The previous label-based .loc slices include
# both endpoints, so the boundary row ended up in the training AND test set,
# and after dropna the index labels no longer matched row positions.
splitPosition = round(0.8*sentencesLabeled.shape[0])
trainSentencesLabeled = sentencesLabeled.iloc[:splitPosition].copy()
testSentencesLabeled = sentencesLabeled.iloc[splitPosition:].copy()
trainSentencesLabeled

In [None]:
# From here on `vocabulary` refers to the term -> index dictionary.
vocabulary = vocabularyIdx

# Training sentences: tf-idf vectors, then their SVD-reduced representation.
trainSentencesTFIDF = vectorizer.transform(trainSentencesLabeled.sentencePreprocessed.tolist())
trainSentences_reduced_by_svd = svd_tfidf.transform(trainSentencesTFIDF)
print(trainSentencesTFIDF.shape)
print(trainSentences_reduced_by_svd.shape)

In [None]:
# Test sentences: same two representations, using the already fitted models.
testSentencesTFIDF = vectorizer.transform(testSentencesLabeled.sentencePreprocessed.tolist())
testSentences_reduced_by_svd = svd_tfidf.transform(testSentencesTFIDF)
print(testSentencesTFIDF.shape)
print(testSentences_reduced_by_svd.shape)

In [None]:
[testSentencesTFIDF] == [np.array([0])]

In [None]:
#production class

#from sklearn.neighbors import KNeighborsClassifier

class LabelModelTrainer:
    """One-vs-rest model for a single label over document representations.

    Two centroids are kept: the mean representation of the documents that
    carry the label and the mean of all other documents. New documents can be
    scored against them with a dot product (``*Cosine`` methods) or with
    Euclidean distances (``*Euclidean`` methods). A k-nearest-neighbour
    classifier on the same training data is available through ``trainKnn``.

    Construction modes:
        * ``label`` plus non-string ``labels`` and non-list ``representations``:
          centroids are learned from the given training data.
        * ``label`` only: centroids are loaded from the joblib file for
          ``vocabSize`` (SVD centroids unless ``nComps`` is 0).
        * no ``label``: an empty instance is created.
    """
    def __init__(self,
                 label='',
                 labels='',
                 representations=None,
                 vocabSize=16000,
                 nComps=300
                 ):
        # None replaces the former mutable default []; both mean "no training
        # representations supplied".
        if representations is None:
            representations = []

        hasTrainingData = (not isinstance(labels, str)
                           and not isinstance(representations, list))

        if label and hasTrainingData:
            self.label = label
            self.labels = labels
            self.representations = representations
            self.labelCentroid, self.othersCentroid, self.mask = self.learnCentroids()
            # 0/1 targets for kNN. NOTE(review): a bare (non-list) label value
            # is wrapped in a list here, whereas learnCentroids counts non-list
            # entries as "label absent" - confirm which is intended.
            self.boolLabels = [1 if self.label in item else 0
                               for item in [[item] if not type(item)==list else item
                                            for item in self.labels]]
        elif label:
            self.label = label
            self.vocabSize = vocabSize
            self.nComps = nComps
            self.labelCentroid, self.othersCentroid = self.loadCentroids(self.vocabSize,
                                                                         self.nComps,
                                                                         self.label)

    def learnCentroids(self):
        """Compute the label centroid, the others centroid and the row mask.

        Returns:
            ``(labelCentroid, othersCentroid, mask)`` where ``mask`` is a
            boolean array marking the training rows whose label list contains
            ``self.label``. If no row carries the label, the centroids are
            filled with the sentinels +99 and -99.
        """
        from numpy import array, full

        representations = self.representations
        mask = array([(self.label in item) if type(item)==list else False
                      for item in self.labels])

        if not mask.any():
            nFeatures = representations.shape[1]
            labelCentroid = full(nFeatures, 99.0)
            othersCentroid = full(nFeatures, -99.0)
            print('no items found with mask on '+str(self.label))
        else:
            labelCentroid = representations[mask].mean(axis=0)
            othersCentroid = representations[~mask].mean(axis=0)

        return labelCentroid, othersCentroid, mask

    def loadCentroids(self, vocabSize, nComps, label):
        """Load the stored centroids for ``label`` from disk.

        The file ``centroids_vocabSize_<vocabSize>_all.joblib`` is indexed by
        representation (``'svd'`` unless ``nComps`` is 0, then ``'tfidf'``)
        and by label.
        """
        rep = 'svd' if nComps != 0 else 'tfidf'
        # The file is deserialised once; it used to be loaded per centroid.
        stored = load('resources/trainedClassifiers/centroids_vocabSize_'
                      +str(vocabSize)+'_all.joblib')[rep][label]
        return stored['labelCentroid'], stored['othersCentroid']

    def getLabelScoresCosine(self, representations):
        """Score documents by projection onto the centroid difference.

        Each row is dotted with ``labelCentroid - othersCentroid`` and the
        results are min-max scaled over the given batch, so a score depends on
        the other rows passed in the same call. NOTE(review): despite the
        name, no normalisation by vector length is applied here.

        Returns:
            Array of shape ``(n_documents, 1)`` with values in [0, 1]; also
            stored in ``self.probabilities``.
        """
        from sklearn.preprocessing import MinMaxScaler
        import numpy as np

        if not isinstance(representations, np.ndarray):
            representations = representations.toarray()
        direction = (self.labelCentroid - self.othersCentroid).reshape(-1,1)
        labelScores = MinMaxScaler().fit_transform(
            np.asarray(representations.dot(direction)))

        self.probabilities = labelScores

        return labelScores

    def predictCosine(self, representations, threshold=0.5):
        """Predict ``[label]`` where the dot-product score exceeds ``threshold``.

        Rows at or below the threshold get ``[None]``. The predictions and the
        scored representations are kept on the instance.
        """
        labelScores = self.getLabelScoresCosine(representations)
        predictions = [[self.label] if score>threshold else [None] for score in labelScores]
        self.predictions = predictions
        self.representationsTest = representations
        return predictions

    def getLabelScoresEuclidean(self, representations):
        """Score documents by their relative closeness to the two centroids.

        Distances are turned into similarities ``1 / (1 + distance)``; the
        score is the label similarity divided by the sum of both similarities.

        Returns:
            One score per document; also stored in ``self.probabilities``.
        """
        import numpy as np

        if not isinstance(representations, np.ndarray):
            representations = representations.toarray()

        distances_to_label = np.linalg.norm(representations - self.labelCentroid, axis=1)
        distances_to_others = np.linalg.norm(representations - self.othersCentroid, axis=1)

        similarity_label = 1 / (1 + distances_to_label)
        similarity_others = 1 / (1 + distances_to_others)

        scores = similarity_label / (similarity_label + similarity_others)

        self.probabilities = scores

        return scores

    def predictEuclidean(self, representations, threshold=0.5):
        """Predict ``[label]`` where the Euclidean score is at least ``threshold``.

        Other rows get ``[None]``. The predictions and the scored
        representations are kept on the instance.
        """
        scores = self.getLabelScoresEuclidean(representations)

        predictions = [[self.label] if score >= threshold else [None] for score in scores]
        self.representationsTest = representations
        self.predictions = predictions
        return predictions

    def trainKnn(self, k=3):
        """Fit a k-nearest-neighbour classifier on the training data."""
        from sklearn.neighbors import KNeighborsClassifier

        self.knn = KNeighborsClassifier(n_neighbors=k)
        self.knn.fit(self.representations, self.boolLabels)

    def predictKnn(self, representations):
        """Predict ``[label]`` or ``[None]`` per document with the fitted kNN.

        Raises:
            ValueError: ``trainKnn`` has not been called.
        """
        if not hasattr(self, 'knn'):
            raise ValueError("train knn first")

        predictions = self.knn.predict(representations)

        return [[self.label] if pred == 1 else [None] for pred in predictions]

    def predictKnnProb(self, representations):
        """Return the kNN probability of the positive class per document.

        The result is also stored in ``self.probabilities``.

        Raises:
            ValueError: ``trainKnn`` has not been called.
        """
        if not hasattr(self, 'knn'):
            raise ValueError("train knn first")

        predictionsProb = self.knn.predict_proba(representations)[:,1]
        self.probabilities = predictionsProb
        return predictionsProb

### Training on train set, predicting on test set

In [None]:
#production all in one training of model on training data + test of performance on test data
# Train one LabelModelTrainer per label on the training sentences and predict
# on the test sentences, for every combination of label type, representation,
# predictor and threshold / k. Each combination gets its own prediction column
# in testSentencesLabeled.
trainingReps =  {'tfidf':trainSentencesTFIDF, 
        'svd':trainSentences_reduced_by_svd}
testingReps = {'tfidf':testSentencesTFIDF,
              'svd':testSentences_reduced_by_svd}

predictors = ['Cosine','Euclidean','Knn']

# Swept values: thresholds 0.3 ... 0.8 for Cosine/Euclidean (tk/10) and
# k = 3 ... 8 for Knn.
TK = range(3,9)
#sp = split


# Label type -> the labels that get their own one-vs-rest model.
labels = {'sentiment':['Positive','Negative'],
          'topic': ['administrativt','digitalt','eksamen','foreleser','karakter',
                    'korona','pensum','språk','undervisningsopplegget']}


iterationNames = []
numericScoresContainer = {}
for k in labels: 
    for rep in ['tfidf','svd']: 
        for predictor in predictors:
            # NOTE(review): re-copying the training frame here has no visible
            # effect; it is not modified inside this loop.
            trainSentencesLabeled = trainSentencesLabeled.copy()
            for tk in TK: 
                if predictor in ['Cosine','Euclidean']:
                    variString = '_T'
                    tkA = tk/10
                    # The score key carries no threshold suffix.
                    numericScoresContainerLabel = 'predicted'+k.capitalize()+'_'+rep+'_'+predictor                
                if predictor == 'Knn':
                    variString = '_K'
                    tkA = tk
                    numericScoresContainerLabel = 'predicted'+k.capitalize()+'_'+rep+'_'+predictor+variString+str(tkA)
                
                # Column name, e.g. predictedSentiment_tfidf_Cosine_T0.3
                iterationName = 'predicted'+k.capitalize()+'_'+rep+'_'+predictor+variString+str(tkA)
                iterationNames.append(iterationName)
                # Every cell starts as [None]; per-label predictions are appended below.
                testSentencesLabeled[iterationName] = [[None]]*testSentencesLabeled.shape[0]

                for label in labels[k]:
                    #initialize model for the specific label and the specific reps 
                    lmt = LabelModelTrainer(label,
                                            trainSentencesLabeled[k+'Label'],  
                                            trainingReps[rep])

                    # NOTE(review): the score key does not include `label`, so
                    # each label overwrites the scores of the previous one and
                    # only the last label's scores remain per key.
                    if predictor == 'Cosine':
                        predicted = lmt.predictCosine(testingReps[rep],tkA)
                        numericScoresContainer[numericScoresContainerLabel] = (lmt.getLabelScoresCosine(testingReps[rep])).ravel()
                        
                    if predictor == 'Euclidean':
                        predicted = lmt.predictEuclidean(testingReps[rep],tkA)
                        numericScoresContainer[numericScoresContainerLabel] = (lmt.getLabelScoresEuclidean(testingReps[rep])).ravel()
                        
                    if predictor == 'Knn':
                        #training:
                        lmt.trainKnn(k=tkA)
                        #predicting
                        predicted = lmt.predictKnn(testingReps[rep])
                        numericScoresContainer[numericScoresContainerLabel] = (lmt.predictKnnProb(testingReps[rep])).ravel()


                    # Append this label's prediction ([label] or [None]) to each cell.
                    testSentencesLabeled[iterationName] = [item+predicted[i] 
                                                    for i,item 
                                                    in 
                                                    enumerate(
                                                        testSentencesLabeled[iterationName])]


                # Clean the column: rule-based for sentiment, aggregation for topics.
                if k=='sentiment':
                    predicted = []#[None]*testSentencesLabeled.shape[0]
                    for cell in testSentencesLabeled[iterationName]:
                        # Exactly one of Positive/Negative predicted -> that
                        # label; both or neither -> Neutral.
                        result = 'Neutral'
                        if 'Positive' in cell:
                            if not 'Negative' in cell:
                                result = 'Positive'
                        if 'Negative' in cell:
                            if not 'Positive' in cell:
                                result = 'Negative'
                        # split() wraps the single word in a one-element list.
                        predicted.append(result.split())
                    testSentencesLabeled[iterationName] = predicted
                if k=='topic':
                    # Drop the None placeholders, keeping only predicted topics.
                    testSentencesLabeled[iterationName] = [pd.Series(cell).dropna().values 
                                                            for cell in testSentencesLabeled[iterationName]]

# Replace remaining NaN cells with empty strings.
testSentencesLabeled = testSentencesLabeled.replace(np.nan, '');

In [None]:
testSentencesLabeled

### Evaluating predictions on test set

In [None]:
# Build an empty results frame: rows are (labelType, label) pairs, columns are
# (metric, model) pairs.

# Every column that is not an input column is a prediction column.
allModelRuns = testSentencesLabeled.columns.drop(['sentIdx', 'docIdx','sentenceRaw',  'sentencePreprocessed', 'sentimentLabel',
       'topicLabel']).tolist()

# Row index: each label plus one average row per label type and one overall average.
topics = labels['topic']+['averageTopic']
sentiments= labels['sentiment']+['averageSentiment']
avg = ['average']
indexFrame = pd.DataFrame({'labelType':['topic']*len(topics)+['sentiment']*len(sentiments)+['average']*len(avg),
                   'label':topics+sentiments+avg})
mIndex = pd.MultiIndex.from_frame(indexFrame)

metrics = ['accuracy',
           'falseClassRate',
           'falseNassRate']
# Model names with the label type removed, so the sentiment and topic runs of
# the same configuration share one name; duplicates are skipped.
# NOTE(review): if a column name contains neither 'sentiment' nor 'topic',
# modelName keeps its value from the previous iteration (or is undefined on
# the first one).
modelList = []
for item in allModelRuns:
    if 'sentiment' in item.lower():
        modelName = item.replace('Sentiment','')
    if 'topic' in item.lower():
        modelName = item.replace('Topic','')
    if not modelName in modelList:
        modelList.append(modelName)

# Column index: every model name repeated once per metric ...
modelsCol = []
for item in modelList:
    for i in range(len(metrics)):
        modelsCol.append(item)

# ... paired with the metric names cycled once per model.
metricList = []
for item in modelList:
    for i in metrics:
        metricList.append(i)
cIndex = pd.MultiIndex.from_frame(pd.DataFrame({'metrics':metricList,
                                                'model':modelsCol}))
# Built with the axes swapped and then transposed, so the (labelType, label)
# pairs end up as the row index.
resultsFrame = pd.DataFrame(index=cIndex,columns=mIndex).T
# label -> labelType lookup derived from the row index.
lookupLabels = {v:k for k,v in resultsFrame.index}

In [None]:
testSentencesLabeled.loc[2723]

In [None]:
# Position of every metric inside the per-label result vectors built below.
evalParams = {name: position for position, name in enumerate(
    ['accuracy', 'falseClassRate', 'falseNassRate', 'precision', 'recall', 'f1score'])}


def _safeRatio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _labelMetrics(label, predictedValues, actualValues):
    """Compute the evaluation metrics of one label, ordered as in evalParams.

    A row counts as "predicted" / "present" when `label in value` holds for
    the prediction / gold value of that row.

    Parameters:
        label: the label to evaluate.
        predictedValues: per-row predictions (any container supporting `in`).
        actualValues: per-row gold annotations, aligned with predictedValues.

    Returns:
        [accuracy, false positive rate, false negative rate,
         precision, recall, F1 score]
    """
    truePos = falsePos = falseNeg = trueNeg = 0
    for predicted, actual in zip(predictedValues, actualValues):
        isPredicted = label in predicted
        isPresent = label in actual
        if isPredicted and isPresent:
            truePos += 1
        elif isPredicted:
            falsePos += 1
        elif isPresent:
            falseNeg += 1
        else:
            trueNeg += 1

    precision = _safeRatio(truePos, truePos + falsePos)
    recall = _safeRatio(truePos, truePos + falseNeg)
    return [
        # accuracy - share of rows where prediction and gold annotation agree
        _safeRatio(truePos + trueNeg, truePos + falsePos + falseNeg + trueNeg),
        # false positive rate - label predicted although absent in the gold data
        _safeRatio(falsePos, falsePos + trueNeg),
        # false negative rate - label present in the gold data but not predicted
        _safeRatio(falseNeg, falseNeg + truePos),
        precision,
        recall,
        # F1 is the harmonic mean of precision and recall. No "+1" in the
        # denominator: that would cap a perfect classifier at 2/3.
        _safeRatio(2 * precision * recall, precision + recall),
    ]


for run in modelList:
    # NOTE(review): probColumn is maintained as before but is not read in this cell.
    probColumn = run if 'Knn' in run else run[:-5]

    for cat in labels:
        if cat == 'sentiment':
            labelColumn = 'sentimentLabel'
            meanCol = 'averageSentiment'
            predColumn = run.replace('predicted_', 'predictedSentiment_')
            probColumn = probColumn.replace('predicted_', 'predictedSentiment_')
        elif cat == 'topic':
            labelColumn = 'topicLabel'
            meanCol = 'averageTopic'
            predColumn = run.replace('predicted_', 'predictedTopic_')
            probColumn = probColumn.replace('predicted_', 'predictedTopic_')

        # Pull both columns once instead of one .loc lookup per row and metric.
        predictedValues = testSentencesLabeled[predColumn].tolist()
        actualValues = testSentencesLabeled[labelColumn].tolist()

        labelSum = {label: _labelMetrics(label, predictedValues, actualValues)
                    for label in labels[cat]}

        # store in frame: one column per label, one row per metric
        tempFrame = pd.DataFrame(labelSum, index=list(evalParams.keys()))

        # get means for all values across different labels within same label category (sentiment, topic)
        tempFrame[meanCol] = tempFrame.mean(numeric_only=True, axis=1)
        tempFrame = tempFrame.T
        for L in tempFrame.index:
            for metric in tempFrame.columns:
                resultsFrame.loc[(lookupLabels[L], L),
                                 (metric, run)] = tempFrame.loc[L, metric]
        # Copy once per category (not once per written cell) to keep the frame consolidated.
        resultsFrame = resultsFrame.copy()

# calc total average for each column: mean of the topic and sentiment category averages
for col in resultsFrame.columns:
    resultsFrame.loc[('average', 'average'), col] = (
        resultsFrame.loc[('topic', 'averageTopic'), col]
        + resultsFrame.loc[('sentiment', 'averageSentiment'), col]) / 2

### Analysis of model results on the best performing models

In [None]:
# Minimum accuracy and maximum error rates a model must meet to be plotted.
thresholdAccuracy = 0.65
thresholdFalseClass = 0.35
thresholdFalseNass = 0.35


def _qualifyingRuns(rowKey):
    """Return the runs among the 8 best by F1 score on rowKey that pass all thresholds."""
    rankedByF1 = resultsFrame.loc[rowKey, ['f1score']].sort_values(ascending=False)
    selected = set()
    for entry in rankedByF1[:8].index:
        runName = entry[1]
        passes = (
            resultsFrame.loc[rowKey, ('accuracy', runName)] > thresholdAccuracy
            and resultsFrame.loc[rowKey, ('falseClassRate', runName)] < thresholdFalseClass
            and resultsFrame.loc[rowKey, ('falseNassRate', runName)] < thresholdFalseNass
        )
        if passes:
            selected.add(runName)
    return selected


plotPredictions = [('sentiment', 'averageSentiment'), ('topic', 'averageTopic')]

# Union of the qualifying runs of both categories; the K5 Knn run is always included.
filterMaskUnion = sorted(
    _qualifyingRuns(plotPredictions[0])
    | _qualifyingRuns(plotPredictions[1])
    | {'predicted_tfidf_Knn_K5'}
)

plotColumns = ['accuracy', 'falseClassRate', 'falseNassRate',
               'precision', 'recall', 'f1score']


def _metricTable(rowKey):
    """Return a (labelType, model) indexed table of plotColumns for the selected runs."""
    metricRows = [resultsFrame.loc[rowKey, (col, filterMaskUnion)].values
                  for col in plotColumns]
    shortNames = [runName.replace('predicted_', '') for runName in filterMaskUnion]
    table = pd.DataFrame(metricRows, columns=shortNames, index=plotColumns).T
    table['labelType'] = rowKey[0]
    return table.reset_index().set_index(['labelType', 'index'])


plotFrameSentiment = _metricTable(plotPredictions[0])
plotFrameTopic = _metricTable(plotPredictions[1])

# Stack both categories and switch to the column titles used in the figures.
displayNames = {
    'accuracy': 'Accuracy',
    'falseClassRate': 'False Positive Rate',
    'falseNassRate': 'False Negative Rate',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1score': 'F1 Score',
}
plotFrame = pd.concat([plotFrameSentiment, plotFrameTopic])
plotFrame = plotFrame.rename(columns=displayNames)

In [None]:
# Show the sorted list of model runs selected for plotting.
filterMaskUnion

### Visualization for interpretation of results and for figures in report

In [None]:
# When True, the figure and the underlying table are written to 'results and diagrams/'.
saveFig=True
# The adjacent string literals are concatenated by Python, so the trailing space
# after "by use of" is required to keep the words apart in the printed message.
print('Results from a vocabulary of '+str(vocabSize)
      +' words. Decomposition, where applicable, is at '
      +str(nComps)+' dimensions for the Tfidf by use of '
      'truncated SVD')
display(plotFrame)
import matplotlib.pyplot as plt
plt.style.use('tableau-colorblind10')
# Two stacked panels sharing the x axis: sentiment on top, topic below.
fig, (sent,top) = plt.subplots(2,1,figsize=(16,10),dpi=240,sharex=True)#,sharey=True)

sent = plotFrame.loc['sentiment',['Accuracy','Precision','Recall','F1 Score']].plot.barh(ax=sent, stacked=False, legend=True)
sent.set_title('Sentiment prediction results')
sent.grid(True)
sent.set_ylabel('')
# Only the upper panel carries the legend.
top = plotFrame.loc['topic',['Accuracy','Precision','Recall','F1 Score']].plot.barh(ax=top, stacked=False, legend=False)
top.set_title('Topic prediction results')
top.grid(True)
top.set_ylabel('')
if saveFig:
    # The file name encodes vocabulary size and number of SVD components.
    filename = 'ModelPerformances_'+str(vocabSize)+'_'+str(nComps)
    plt.savefig('results and diagrams/'+filename+'.png')
    plotFrame.to_csv('results and diagrams/'+filename+'.csv')
    print('figure and csv saved as '+filename)
plt.show()

In [None]:
# Show the sentiment rows of the plot table.
plotFrame.loc['sentiment']

In [None]:
# When True, the figure and the plot table are written to 'results and diagrams/'.
saveFig = True
shownMetrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

fig, sent = plt.subplots(nrows=1, ncols=1, figsize=(12, 5), dpi=240, sharex=True)

# Horizontal grouped bars: one group per model, one bar per metric.
sent = plotFrame.loc['sentiment', shownMetrics].plot.barh(
    ax=sent,
    stacked=False,
    legend=True,
)
sent.set_title('Sentiment prediction results', fontsize=20)
sent.grid(True)
sent.set_ylabel('')

if saveFig:
    filename = f'ModelPerformances_Sentiment{vocabSize}_{nComps}'
    outputStem = f'results and diagrams/{filename}'
    plt.savefig(f'{outputStem}.png')
    plotFrame.to_csv(f'{outputStem}.csv')
    print(f'figure and csv saved as {filename}')

# LaTeX version of the sentiment table for the report.
from tabulate import tabulate
sentimentLatex = tabulate(plotFrame.loc['sentiment'], tablefmt='latex')
print(sentimentLatex)
plt.show()

In [None]:
# When True, the figure and the plot table are written to 'results and diagrams/'.
saveFig=True
plt.style.use('tableau-colorblind10')
fig, top = plt.subplots(1,1,figsize=(12,5),dpi=240,sharex=True)#,sharey=True)

# Horizontal grouped bars: one group per model, one bar per metric.
top = plotFrame.loc['topic',['Accuracy','Precision','Recall','F1 Score']].plot.barh(ax=top, stacked=False, legend=True)
top.set_title('Topic prediction results',fontsize=20)
top.grid(True)
top.set_ylabel('')

if saveFig:
    filename = 'ModelPerformances_Topic'+str(vocabSize)+'_'+str(nComps)
    plt.savefig('results and diagrams/'+filename+'.png')
    plotFrame.to_csv('results and diagrams/'+filename+'.csv')
    print('figure and csv saved as '+filename)
# Show the topic rows, matching the figure and the LaTeX table of this cell
# (the sentiment rows were displayed here before, which did not fit the rest of the cell).
display(plotFrame.loc['topic'])
from tabulate import tabulate
# LaTeX version of the topic table for the report.
print(tabulate(plotFrame.loc['topic'],tablefmt='latex'))
plt.show()

# Storing essentials from pre-trained models for use in analytics production pipeline

In [None]:
from joblib import dump, load

# Independent switches: each artefact can be written and/or read on its own.
storeCentroids=True
loadCentroids=True
storeKnnIngredients=True
loadKnnIngredients=True

# Both artefact files are keyed by the vocabulary size.
centroidPath = 'resources/trainedClassifiers/centroids_vocabSize_'+str(vocabSize)+'_all.joblib'
knnIngredientsPath = 'resources/trainedClassifiers/knnIngredients_vocabSize_'+str(vocabSize)+'_all.joblib'

if storeCentroids or storeKnnIngredients:
    # Flatten the labels of all categories into one list.
    labelVals = [label for key in labels for label in labels[key]]

    # Both containers are nested as {representation name: {label: stored values}}.
    centroidContainer = {}
    knnIngredients = {}
    for rep in trainingReps:
        centroidContainer[rep]={}
        knnIngredients[rep]={}
        for lab in labelVals:
            # Category of the label; it selects the gold column '<category>Label'.
            if lab in labels['sentiment']:
                K = 'sentiment'
            if lab in labels['topic']:
                K = 'topic'
            lmt = LabelModelTrainer(lab,
                                    trainSentencesLabeled[K+'Label'],
                                    trainingReps[rep])

            knnIngredients[rep][lab] = {'representations':lmt.representations,
                                        'boolLabels':lmt.boolLabels}

            centroidContainer[rep][lab] = {'labelCentroid':lmt.labelCentroid ,
                                           'othersCentroid':lmt.othersCentroid }

# Write the centroids only when requested; previously they were also overwritten
# whenever just storeKnnIngredients was set.
if storeCentroids:
    dump(centroidContainer, centroidPath)
if loadCentroids:
    centroidContainer = load(centroidPath)
if storeKnnIngredients:
    dump(knnIngredients, knnIngredientsPath)
if loadKnnIngredients:
    knnIngredients = load(knnIngredientsPath)

# Production environment simulation

## Steps prior to the cell below:
* ingest, clean, reshape data
* index the rows which hold comments (docIdx)
* extract only rows with comments
* split comments into sentences, keep docIdx

## Steps after the cell below:
* merge with dataframe on docIdx
* reshape dataframe so it is suited for dashboarding/reports
* build reports/dashboards

In [None]:
import pandas as pd
from utils import nlp 
from tabulate import tabulate
from matplotlib import pyplot as plt
plt.rcParams["font.family"] = "Times New Roman"
plt.style.use('tableau-colorblind10')

vectorizer, svd_tfidf = nlp.loadVectorizerSVD()

# Read the labeled sentences and discard the label columns, so that only the
# raw text and its identifiers are left.
rawSentences = pd.read_csv(
    'resources/trainingData/first_semester_training_data_sentimentTopicLabeled_raw.csv',
    delimiter=';',
)
renamedSentences = rawSentences.rename(
    columns={'Unnamed: 0': 'sentIdx', 'label': 'sentimentLabel'})
sentencesLabeled = renamedSentences.drop(columns=['sentimentLabel', 'topicLabel'])

# Rows from the 80 % mark onwards form the test portion.
splitStart = round(0.8 * sentencesLabeled.shape[0])
testSentencesLabeled = sentencesLabeled.loc[splitStart:].copy()

# Preprocess, vectorize and reduce the raw test sentences.
testSentencesLabeled['sentencePreprocessed'] = nlp.preprocessFunction(
    testSentencesLabeled['sentenceRaw'])
tfidfMatrix = vectorizer.transform(testSentencesLabeled['sentencePreprocessed'])
testSentences_reduced_by_svd = svd_tfidf.transform(tfidfMatrix)

# Join the predictions by row position, then restore the original index and
# drop the helper column again.
positionalFrame = testSentencesLabeled.reset_index()
predictionFrame = nlp.productionPreds(testSentences_reduced_by_svd)
mergedFrame = positionalFrame.merge(predictionFrame, left_index=True, right_index=True)
testSentencesLabeled = mergedFrame.set_index(['index']).drop(columns=['sentencePreprocessed'])
display(testSentencesLabeled)

testDocumentsScored = nlp.dfGrouper(testSentencesLabeled)
display(testDocumentsScored)

# Left panel: histogram of sentiment_scored; right panel: summed topic columns.
fig, (p1, p2) = plt.subplots(1, 2, figsize=(10, 3), dpi=240)
testDocumentsScored.sentiment_scored.plot.hist(
    bins=10, title='Distribution of sentiment_scored', ax=p1)
p1.set_xlabel('level of satisfaction')
p1.grid(True)
topicTotals = testDocumentsScored[nlp.Values().validTopics].sum(axis=0)
topicTotals.plot.bar(title='Frequency of topics', ax=p2)
p2.grid(True)
plt.show()