In [39]:
import numpy as np
import json
from pathlib import Path
import scipy
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
from progress.bar import Bar
import random
import torch
from ipywidgets import IntProgress
from IPython.display import display
from torchmetrics.classification import BinaryF1Score
from sklearn.metrics import f1_score
from tqdm import tqdm
from funkisar import *
from sklearn.decomposition import PCA

# Prefer the first CUDA GPU and fall back to the CPU when none is visible.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device:', device)
# Only make CUDA float tensors the default when CUDA is actually present:
# calling this unconditionally raises on CPU-only machines, which would make
# the CPU fallback chosen above unreachable.
if device.type == "cuda":
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

# Data directories, relative to the notebook's working directory.
path_to_training = Path("training")
path_to_test = Path("test")

def flatten(list_of_list):
    """Concatenate the items of every sub-iterable into one flat list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

# Last expression of the cell: its value (whether CUDA is usable) is shown as the cell output.
torch.cuda.is_available()


Device: cuda:0


True

In [38]:
# Meeting identifiers available for this experiment.
all_meetings = [
    'ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010',
    'ES2012', 'ES2013', 'ES2015', 'ES2016',
    'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006',
    'IS1007',
    'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012',
]
# Meetings held out for validation; everything else is used for training.
validation_set = ['IS1000', 'ES2002']
training_set = [meeting for meeting in all_meetings if meeting not in validation_set]
del all_meetings

# For a quick single-meeting run, uncomment:
#training_set=['ES2005']

In [43]:

def get_Xtraining(collection, context_depth=5):
    """Embed every utterance of the given meetings together with its preceding context.

    Each meeting id in `collection` is expanded to its four parts ('a'-'d'),
    the known-missing transcriptions are dropped, and the remaining ids are
    passed to `get_convolineList`. For every utterance, up to `context_depth`
    predecessors are collected by following the relation matrix, joined
    oldest-first into one context string, and the sentence embedding and the
    context embedding are added element-wise.

    Parameters
    ----------
    collection : iterable of str
        Meeting ids without the part suffix (e.g. 'ES2005').
    context_depth : int, default 5
        Maximum number of predecessor utterances used as context.

    Returns
    -------
    numpy.ndarray
        One row per utterance, preceded by a single all-zero placeholder row.
        The placeholder is kept for backward compatibility: existing callers
        strip it with `np.delete(..., 0, axis=0)`.
    """
    missingSet=['IS1002a', 'IS1005d', 'TS3012c']

    transcription_ids = [m_id + s_id for m_id in collection for s_id in 'abcd']
    for transcription_id in missingSet:
        # Explicit membership test instead of a bare `except: pass`.
        if transcription_id in transcription_ids:
            transcription_ids.remove(transcription_id)

    convolinesList = get_convolineList(transcription_ids)
    bert = SentenceTransformer('all-mpnet-base-v2', device=device)
    # Follow the notebook-wide `device` rather than hard-coding "cuda", so the
    # function also runs on CPU-only machines.
    bert.to(device)

    print('encoding')
    blocks = []
    for data in tqdm(convolinesList, desc="Encoding"):
        # NOTE(review): presumably data[0] holds index branches, data[2] the
        # utterance texts and data[4] the reply/predecessor matrix -- confirm
        # against get_convolineList.
        branchesList = data[0]
        seentenceList = data[2]
        matrix = data[4]

        n_utterances = branchesList[-1][-1] + 1
        if n_utterances <= 0:
            continue

        sentences = []
        contexts = []
        for index in range(n_utterances):
            convoIndex = []
            placeholder = index
            for _ in range(context_depth):
                if placeholder == 0:
                    break
                search = matrix[placeholder, :]
                candidates = np.where(search == 1)[0]
                if len(candidates) == 0:
                    # No predecessor recorded for this utterance: stop the
                    # chain instead of raising IndexError.
                    break
                placeholder = candidates[0]
                convoIndex.append(placeholder)

            sentences.append(seentenceList[index])
            # Oldest predecessor first, each followed by a single space.
            contexts.append(''.join(seentenceList[i] + ' ' for i in reversed(convoIndex)))

        # One batched encode per transcription instead of one call per
        # utterance; the first half of the result are the sentences, the
        # second half the matching contexts.
        embeddings = np.asarray(bert.encode(sentences + contexts))
        blocks.append(embeddings[:n_utterances] + embeddings[n_utterances:])

    # Stack once at the end; growing the matrix with np.append per utterance
    # re-copies everything each time.
    width = blocks[0].shape[1] if blocks else 768
    return np.vstack([np.zeros((1, width))] + blocks)

# Last expression of the cell: shows whether CUDA is usable as the cell output.
torch.cuda.is_available()
        

True

In [41]:
def get_PCA(matrix, dim):
    """Standardize the columns of `matrix` and project it onto `dim` principal components.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array with one sample per row.
    dim : int
        Number of principal components to keep.

    Returns
    -------
    numpy.ndarray
        The projected samples, one row per input row and `dim` columns.
    """
    std = matrix.std(axis = 0)
    # Constant columns have zero standard deviation; divide them by 1 so they
    # become all-zero instead of NaN.
    std = np.where(std == 0, 1.0, std)
    matrix = (matrix - matrix.mean(axis = 0)) / std

    pca=PCA(n_components=dim)

    # The estimator must be fitted before it can transform; calling
    # `transform` on a fresh PCA raises NotFittedError.
    return pca.fit_transform(matrix)
      

In [44]:
# Sentence+context embeddings for both splits (slow: runs the encoder over
# every utterance of every transcription).
X_training = get_Xtraining(training_set)
X_validation = get_Xtraining(validation_set)

# Reduced variants that keep only the first 384 embedding columns.
X_trainingCut = X_training[:, :384]
X_validationCut = X_validation[:, :384]

encoding


Encoding:  43%|████▎     | 38/89 [32:59<1:21:01, 95.33s/it] 

KeyboardInterrupt: 

In [28]:
def get_ylabels(set):
    """Return the concatenated labels of all transcriptions of the given meetings.

    Each meeting id is expanded to its four parts ('a'-'d'), the known-missing
    transcriptions are dropped, and the label lists stored in
    "training_labels.json" (read from the current working directory) are
    concatenated in that order.

    Parameters
    ----------
    set : iterable of str
        Meeting ids without the part suffix (e.g. 'ES2005'). The name shadows
        the builtin and is kept only for backward compatibility.

    Returns
    -------
    list
        All labels, in transcription order.

    Raises
    ------
    KeyError
        If a transcription id has no entry in the labels file.
    """
    missingSet=['IS1002a', 'IS1005d', 'TS3012c']

    transcription_ids = [m_id + s_id for m_id in set for s_id in 'abcd']
    for transcription_id in missingSet:
        # Explicit membership test instead of a bare `except: pass`, which
        # would also hide unrelated errors.
        if transcription_id in transcription_ids:
            transcription_ids.remove(transcription_id)

    with open("training_labels.json", "r") as file:
        training_labels = json.load(file)

    y = []
    for transcription_id in transcription_ids:
        y.extend(training_labels[transcription_id])

    return y

In [36]:

# get_Xtraining returns an all-zero placeholder as its first row; drop it so
# that every remaining row corresponds to one utterance.
X_training1 = np.delete(X_training, 0, axis=0)
X_trainingCut1 = np.delete(X_trainingCut, 0, axis=0)
X_validation1 = np.delete(X_validation, 0, axis=0)
X_validationCut1 = np.delete(X_validationCut, 0, axis=0)

y_training = get_ylabels(training_set)
y_validation = get_ylabels(validation_set)

print(X_training1.shape)
print(len(y_training))

# Features and labels must line up one-to-one before any model is fitted.
assert X_training1.shape[0] == len(y_training), "training features/labels are misaligned"
assert X_validation1.shape[0] == len(y_validation), "validation features/labels are misaligned"


def _pca_train_valid(train, valid, dim):
    """Standardize and PCA-project both splits using statistics fitted on `train` only."""
    mean = train.mean(axis=0)
    std = train.std(axis=0)
    std = np.where(std == 0, 1.0, std)  # constant columns: avoid division by zero
    pca = PCA(n_components=dim)
    train_projected = pca.fit_transform((train - mean) / std)
    valid_projected = pca.transform((valid - mean) / std)
    return train_projected, valid_projected


# Fit the scaling and the projection on the training split and reuse them for
# validation: fitting a separate PCA per split would put the two matrices in
# unrelated coordinate systems, so a model trained on one could not be
# evaluated on the other.
X_trainingPCA, X_validationPCA = _pca_train_valid(X_training1, X_validation1, 150)
X_trainingCutPCA, X_validationCutPCA = _pca_train_valid(X_trainingCut1, X_validationCut1, 150)


None


AxisError: axis 0 is out of bounds for array of dimension 0

In [None]:
def get_f1score(X_training, y_training, X_validation, y_validation):
    """Train a default-configured XGBClassifier and return its validation F1 score.

    The classifier is fitted on (`X_training`, `y_training`); its predictions
    for `X_validation` are scored against `y_validation` with
    `sklearn.metrics.f1_score` using that function's default arguments.
    """
    classifier = XGBClassifier()
    classifier.fit(X_training, y_training)
    return sklearn.metrics.f1_score(y_validation, classifier.predict(X_validation))

In [None]:
# Score on the matrices whose all-zero placeholder first row has been removed
# (X_training1 etc.); the unstripped matrices have one more row than there are
# labels, so fitting on them fails or misaligns features and labels.
print(get_f1score(X_training1, y_training, X_validation1, y_validation))
print(get_f1score(X_trainingCut1, y_training, X_validationCut1, y_validation))