# Test basic classification models for topic segmentation

In [1]:
import model.load_data as ld
import model.scoring_metrics as sm
import model_trainer_and_tester as mtt
import importlib
# Re-import the project modules so that edits made to them on disk take effect
# without restarting the kernel.
# NOTE(review): `sm` is reloaded twice (before and after `ld`); the second call
# looks redundant - confirm before removing it.
importlib.reload(sm)
importlib.reload(ld)
importlib.reload(sm)
importlib.reload(mtt)

import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
from itertools import combinations

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

import sys
# Print NumPy arrays in full instead of abbreviating long ones.
np.set_printoptions(threshold=sys.maxsize)
# Do not limit the number of columns shown when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


## Splits

In [28]:
# Feature columns used by the experiments in this notebook.
all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
# Columns actually handed to the models in the split cells below; currently identical to `all_features`.
features_selected = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']

### Jan's "basic" split

In [29]:
# Dataset identifiers handed to the project loader, grouped by prefix for readability.
datasets = (
    "Bed002 Bed003 Bed004 Bed005 Bed006 Bed008 Bed009 Bed010 Bed011 Bed012 "
    "Bed013 Bed014 Bed015 Bed016 Bed017 "
    "Bmr001 Bmr002 Bmr005 Bmr007 Bmr009 Bmr010 Bmr011 Bmr012 Bmr013 Bmr014 "
    "Bmr018 Bmr019 Bmr021 Bmr022 Bmr024 Bmr025 Bmr026 Bmr027 Bmr029 "
    "Bns001 Bns002"
).split()
results_merged_path = "../results_merged_fixedf0_lvl/"

# Third argument: share of the data held out for testing.
X_train, y_train, X_test, y_test = ld.train_test_split(
    datasets,
    results_merged_path,
    0.3,
)

# Restrict both partitions to the selected feature columns.
X_train = X_train[features_selected]
X_test = X_test[features_selected]

### Nic's "constant" split

In [30]:
# Represents the context that is being used for training and for evaluation
shifts = [-2, -1, 1, 2]

# Load the training portion of the "constant" split through the project helper.
X_train_const, y_train_const = mtt.read_in_dataset(features_selected, shifts, to_read='train')

# Baseline model: decision tree with scikit-learn's default hyperparameters.
clf = DecisionTreeClassifier()
clf.fit(X_train_const, y_train_const)

# Score the fitted tree with the project helper.
# NOTE(review): presumably yields one row of Pk / K-k / Windiff per evaluated test set - confirm in model_trainer_and_tester.
results_const = mtt.test_set_evaluate_multiple(clf, features_selected, shifts)

In [31]:
# Mean of each metric over the evaluation results (presumably a DataFrame with one column per metric).
results_const.mean()

Pk         0.573975
K-k       -0.070905
Windiff    0.573975
dtype: float64

## Test on commonly used models for topic segmentation

In [30]:
def print_eval(y_pred,y_true):
    """Print the window size k and the WinDiff, Pk and k-kappa scores of a prediction.

    Args:
        y_pred: predicted labels.
        y_true: reference labels; must support ``len`` and ``sum``.

    The window size is floor((len(y_true) + 1) / (2 * (sum(y_true) + 1))),
    clamped to a minimum of 1. Nothing is returned; all results are printed.
    """
    window_ratio = (len(y_true) + 1) / (2 * (sum(y_true) + 1))
    k = int(max(1, np.floor(window_ratio)))
    print('k =',k)

    # Convert once; the same arrays are reused by all three metrics.
    predicted = np.array(y_pred)
    reference = np.array(y_true)

    windiff = sm.get_windiff(reference, predicted, k)
    print('- windiff:', windiff)
    pk = sm.get_pk(reference, predicted, k)
    print('- pk:', pk)
    kkappa = sm.get_k_kappa(reference, predicted, k)
    print('- kkappa:', kkappa)

### Decision Tree classifier with/without hyperparameter tuning

In [2]:
def DecTree(X_train, X_test, y_train,tuning=False, best_criterion=None,best_max_depth=None,best_min_sample_leaf=None):
    """Fit a decision tree on the training data and predict labels for ``X_test``.

    Args:
        X_train: training features.
        X_test: features to predict on.
        y_train: training labels.
        tuning: when truthy, the three ``best_*`` values are passed to the
            classifier; otherwise scikit-learn's defaults are used.
        best_criterion: value for ``criterion`` (only used when tuning).
        best_max_depth: value for ``max_depth`` (only used when tuning).
        best_min_sample_leaf: value for ``min_samples_leaf`` (only used when tuning).

    Returns:
        The predictions of the fitted classifier for ``X_test``.
    """
    tree_kwargs = {}
    if tuning:
        tree_kwargs = {
            'criterion': best_criterion,
            'max_depth': best_max_depth,
            'min_samples_leaf': best_min_sample_leaf,
        }

    model = DecisionTreeClassifier(**tree_kwargs)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [32]:
def DecTree_hyperparam(X,y):
    """Grid-search decision-tree hyperparameters and return the winning combination.

    A pipeline of ``StandardScaler`` followed by ``DecisionTreeClassifier`` is
    fitted for every combination of criterion, maximum depth and minimum
    samples per leaf listed below.

    Args:
        X: training features.
        y: training labels.

    Returns:
        Tuple ``(best_criterion, best_max_depth, best_min_samples_leaf)``.
        The three values are also printed.
    """
    pipe = Pipeline(steps=[
        ('std_slc', StandardScaler()),
        ('dec_tree', DecisionTreeClassifier()),
    ])

    # Keys are prefixed with the pipeline step name so they reach the tree.
    parameters = {
        'dec_tree__criterion': ['gini', 'entropy', 'log_loss'],
        'dec_tree__max_depth': [2, 5, 10, 15, 20, 30, 50, 100],
        'dec_tree__min_samples_leaf': [5, 10, 20, 50, 100],
    }

    # No `scoring` is given, so candidates are ranked by the estimator's own
    # `score` method (accuracy for classifiers).
    clf_GS = GridSearchCV(pipe, parameters)
    clf_GS.fit(X, y)

    # `best_params_` holds the winning setting directly; no need to go through
    # the refitted estimator's full parameter dictionary.
    best_params = clf_GS.best_params_
    best_criterion = best_params['dec_tree__criterion']
    best_max_depth = best_params['dec_tree__max_depth']
    best_min_samples_leaf = best_params['dec_tree__min_samples_leaf']

    print('Best criterion:', best_criterion)
    print('Best max_depth:', best_max_depth)
    print('Best min_sample_leaf:', best_min_samples_leaf)

    return best_criterion,best_max_depth,best_min_samples_leaf

In [33]:
# Tune the decision tree on the training partition; the three values are passed to DecTree(..., True, ...) in the "Tuned DST" run.
best_criterion,best_max_depth,best_min_sample_leaf = DecTree_hyperparam(X_train,y_train)

Best criterion: gini
Best max_depth: 2
Best min_sample_leaf: 5


In [34]:
# Baseline: decision tree with scikit-learn's default hyperparameters.
print("-------------Normal DST")
DT_y_predicted = DecTree(X_train, X_test, y_train, tuning=False)
print(sum(DT_y_predicted))  # sum of the predicted labels
print_eval(DT_y_predicted, y_test)

# Same model, configured with the grid-search winners.
print("-------------Tuned DST")
DT_y_predicted = DecTree(
    X_train,
    X_test,
    y_train,
    tuning=True,
    best_criterion=best_criterion,
    best_max_depth=best_max_depth,
    best_min_sample_leaf=best_min_sample_leaf,
)
print(sum(DT_y_predicted))
print_eval(DT_y_predicted, y_test)

-------------Normal DST
23.0
k = 61
- windiff: 0.3716451959205582
- pk: 0.35165056360708535
- kkappa: 0.1170771803750981
-------------Tuned DST
0.0
k = 61
- windiff: 0.3778180354267311
- pk: 0.3778180354267311
- kkappa: 0.0


### SVM with/without hyperparameter tuning

In [36]:
def SVM(X_train,X_test,y_train,tuning=False,best_gamma=None,best_C=None):
    """Fit a support vector classifier and predict labels for ``X_test``.

    Args:
        X_train: training features.
        X_test: features to predict on.
        y_train: training labels.
        tuning: when truthy, ``best_gamma`` and ``best_C`` are passed to
            ``SVC``; otherwise scikit-learn's defaults are used.
        best_gamma: value for ``gamma`` (only used when tuning).
        best_C: value for ``C`` (only used when tuning).

    Returns:
        The predictions of the fitted classifier for ``X_test``.
    """
    svc_kwargs = {'gamma': best_gamma, 'C': best_C} if tuning else {}

    model = SVC(**svc_kwargs)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [13]:
def SVM_hyperparam(X,y):
    """Grid-search ``gamma`` and ``C`` for an ``SVC`` and return the best pair.

    Uses 2-fold cross-validation over a 3 x 3 grid, running on all available
    cores (``n_jobs=-1``).

    Args:
        X: training features.
        y: training labels.

    Returns:
        Tuple ``(best_gamma, best_C)``; both values are also printed.
    """
    grid_search = GridSearchCV(
        SVC(),
        dict(gamma=[0.1, 1.0, 10], C=[0.1, 1.0, 10]),
        cv=2,
        verbose=1,
        n_jobs=-1,
    )
    grid_search.fit(X, y)

    # Parameters of the estimator refitted with the best setting.
    tuned = grid_search.best_estimator_.get_params()
    best_gamma, best_C = tuned['gamma'], tuned['C']

    print('Best gamma:', best_gamma)
    print('Best C:', best_C)

    return best_gamma,best_C

In [14]:
# Tune the SVM on the training partition; the pair is passed to SVM(..., True, ...) in the "Tuned SVM" run.
best_gamma,best_C = SVM_hyperparam(X_train,y_train)



Fitting 2 folds for each of 9 candidates, totalling 18 fits
Best gamma: 0.1
Best C: 0.1


In [11]:
# Baseline: SVC with scikit-learn's default hyperparameters.
print("-------------Normal SVM")
SVM_y_predicted = SVM(X_train, X_test, y_train, tuning=False)
print(sum(SVM_y_predicted))  # sum of the predicted labels
print_eval(SVM_y_predicted, y_test)

# Same model, configured with the grid-search winners.
print("-------------Tuned SVM")
SVM_y_predicted = SVM(
    X_train,
    X_test,
    y_train,
    tuning=True,
    best_gamma=best_gamma,
    best_C=best_C,
)
print(sum(SVM_y_predicted))
print_eval(SVM_y_predicted, y_test)

-------------Normal SVM
0.0
k = 58
- windiff: 0.39771241830065357
- pk: 0.39771241830065357
- kkappa: 0.0
-------------Tuned SVM
0.0
k = 58
- windiff: 0.39771241830065357
- pk: 0.39771241830065357
- kkappa: 0.0


### Naive Bayes for binary class with/without hyperparameter tuning

In [3]:
def naiveBayes(X_train,X_test,y_train,tuning=False,best_var_smoothing=None):
    """Fit a Gaussian naive Bayes classifier and predict labels for ``X_test``.

    Args:
        X_train: training features.
        X_test: features to predict on.
        y_train: training labels.
        tuning: when truthy, ``best_var_smoothing`` is passed to
            ``GaussianNB``; otherwise scikit-learn's default is used.
        best_var_smoothing: value for ``var_smoothing`` (only used when tuning).

    Returns:
        The predictions of the fitted classifier for ``X_test``.
    """
    nb_kwargs = {'var_smoothing': best_var_smoothing} if tuning else {}

    model = GaussianNB(**nb_kwargs)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [25]:
def naiveBayes_hyperparam(X,y):
    """Grid-search ``var_smoothing`` for ``GaussianNB`` and return the best value.

    Tries 100 log-spaced values from 1 down to 1e-9 with 10-fold
    cross-validation, running on all available cores (``n_jobs=-1``).

    Args:
        X: training features.
        y: training labels.

    Returns:
        The selected ``var_smoothing``; the value is also printed.
    """
    smoothing_grid = {'var_smoothing': np.logspace(0,-9, num=100)}
    search = GridSearchCV(
        estimator=GaussianNB(),
        param_grid=smoothing_grid,
        verbose=1,
        cv=10,
        n_jobs=-1,
    )
    search.fit(X, y)

    # Read the winner from the estimator refitted with the best setting.
    tuned = search.best_estimator_.get_params()
    best_var_smoothing = tuned['var_smoothing']
    print('Best var_smoothing:', best_var_smoothing)

    return best_var_smoothing

In [26]:
# Tune Gaussian NB on the training partition; the value is passed to naiveBayes(..., True, ...) in the "Tuned NB" run.
best_var_smoothing = naiveBayes_hyperparam(X_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best var_smoothing: 1.0


In [28]:
# Baseline: Gaussian NB with scikit-learn's default smoothing.
print("-------------Normal NB")
NB_y_predicted = naiveBayes(X_train, X_test, y_train, tuning=False)
print(sum(NB_y_predicted))  # sum of the predicted labels
print_eval(NB_y_predicted, y_test)

# Same model, configured with the grid-search winner.
print("-------------Tuned NB")
NB_y_predicted = naiveBayes(
    X_train,
    X_test,
    y_train,
    tuning=True,
    best_var_smoothing=best_var_smoothing,
)
print(sum(NB_y_predicted))
print_eval(NB_y_predicted, y_test)

-------------Normal NB
167.0
k = 48
- windiff: 0.4680751799520128
- pk: 0.3826312983204479
- kkappa: 0.17552406762815045
-------------Tuned NB
37.0
k = 48
- windiff: 0.42721940815782455
- pk: 0.4073580378565716
- kkappa: 0.039813012811309854


## Multiple runs of each algorithm with different splits

In [4]:
def get_eval(y_pred,y_true):
    """Compute the WinDiff, Pk and k-kappa scores of a prediction.

    Args:
        y_pred: predicted labels.
        y_true: reference labels; must support ``len`` and ``sum``.

    Returns:
        Tuple ``(windiff, pk, kappa)``.

    The window size is floor((len(y_true) + 1) / (2 * (sum(y_true) + 1))),
    clamped to a minimum of 1.
    """
    window_ratio = (len(y_true) + 1) / (2 * (sum(y_true) + 1))
    k = int(max(1, np.floor(window_ratio)))

    def score(metric):
        # Each metric receives freshly built arrays of the two label sequences.
        return metric(np.array(y_true), np.array(y_pred), k)

    return score(sm.get_windiff), score(sm.get_pk), score(sm.get_k_kappa)

def get_avg_eval(eval):
    """Print the per-run WinDiff, Pk and k-kappa values collected over several runs.

    Args:
        eval: sequence of ``(windiff, pk, kappa)`` triples, one per run.

    Despite the name, no mean or variance is computed: the three score lists
    are only printed, one metric per line. Nothing is returned.
    """
    # Transpose the triples into one list per metric before printing anything.
    per_metric = [[run[position] for run in eval] for position in range(3)]

    labels = ('-> windiff:', '-> pk:', '-> k-kappa:')
    for label, values in zip(labels, per_metric):
        print(label, values)

def get_summary(DT_all,NB_all,features_selected):
    """Print the collected scores of both models for one feature combination.

    Args:
        DT_all: list of ``(windiff, pk, kappa)`` triples from the decision tree runs.
        NB_all: list of ``(windiff, pk, kappa)`` triples from the naive Bayes runs.
        features_selected: the feature names the runs were trained on.
    """
    print('-> Features =',features_selected)
    sections = (
        ('----- DT:', DT_all),
        ('----- Naive Bayes:', NB_all),
    )
    for heading, scores in sections:
        print(heading)
        get_avg_eval(scores)

In [6]:
# NOTE(review): this cell rebinds notebook-wide names (`datasets`,
# `results_merged_path`, the feature lists and X_train / X_test / y_train /
# y_test). Cells run afterwards see the last split of the last feature
# combination, not the "basic" split defined earlier.
datasets = (
    "Bed002 Bed003 Bed004 Bed005 Bed006 Bed008 Bed009 Bed010 Bed011 Bed012 "
    "Bed013 Bed014 Bed015 Bed016 Bed017 "
    "Bmr001 Bmr002 Bmr005 Bmr007 Bmr009 Bmr010 Bmr011 Bmr012 Bmr013 Bmr014 "
    "Bmr018 Bmr019 Bmr021 Bmr022 Bmr024 Bmr025 Bmr026 Bmr027 Bmr029 "
    "Bns001 Bns002"
).split()
results_merged_path = "../results_merged_fixedf0/"

all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
features_selected = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']

split = 0.3       # share of the data held out for testing
iterations = 10   # fresh train/test splits drawn per feature combination

print('EXPERIMENTING ON DTC AND NB WITH SPLIT',split,'(of testing) AND',iterations,'ITERATIONS PER MODEL TESTING')
for i in range(1, len(all_features) + 1):
    print('------ALL COMBINATIONS OF LENGTH',i,'------')
    features_combinations = combinations(all_features, i)
    for feature_comb in map(list, features_combinations):
        DT_scores, NB_scores = [], []
        for j in range(iterations):
            # Draw a new split for every run, then keep only the columns under test.
            X_train, y_train, X_test, y_test = ld.train_test_split(
                datasets, results_merged_path, split
            )
            X_train = X_train[feature_comb]
            X_test = X_test[feature_comb]

            # Both models use default hyperparameters here (no tuning).
            DT_scores.append(get_eval(DecTree(X_train, X_test, y_train, False), y_test))
            NB_scores.append(get_eval(naiveBayes(X_train, X_test, y_train, False), y_test))

        get_summary(DT_scores, NB_scores, feature_comb)
        print()
    print()



EXPERIMENTING ON DTC AND NB WITH SPLIT 0.3 (of testing) AND 10 ITERATIONS PER MODEL TESTING
------ALL COMBINATIONS OF LENGTH 1 ------
-> Features = ['pause']
----- DT:
-> windiff: [0.3691443673604229, 0.40075178053284094, 0.41973596961842186, 0.41562902533276086, 0.4127805402499664, 0.4121193666260658, 0.4207356983898416, 0.40034629893018364, 0.3721383014434374, 0.4046204620462046]
-> pk: [0.3588371324743971, 0.3936955948298602, 0.4120802941708361, 0.3953873520211004, 0.3901357344442951, 0.40133982947624847, 0.41314308155517737, 0.3945952631253479, 0.35824102047667, 0.3934653465346535]
-> k-kappa: [0.07221862921609884, 0.03404547870644593, -0.04019169647798376, -0.0005250871846378609, 0.0688368838347317, 0.03132910615058468, -0.02006955722460435, 0.030480545123441068, 0.059758429937738174, 0.03356049744567974]
----- Naive Bayes:
-> windiff: [0.406012553683515, 0.4570034291743603, 0.41629995780336365, 0.5651107158191744, 0.4372396183308695, 0.4419001218026797, 0.44868438277261424, 0.469

#### Reading the saved output of the code
Place the output file (`simple_models.txt`) in the notebook's working directory before running the next cell.

In [38]:
# Locate the saved experiment log in the current working directory.
path = os.path.realpath(os.path.join(os.getcwd(), "simple_models.txt"))

# The context manager closes the handle as soon as the lines are read
# (the file was previously left open).
with open(path, 'r') as file:
    lines = file.readlines()

# Feature combination whose scores should be extracted from the log.
combination=['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']

# The log contains "-> Features = [...]", then per model a header line
# followed by the "-> windiff: [...]", "-> pk: [...]" and "-> k-kappa: [...]"
# lines. If the combination occurs more than once, the last match wins.
combination_label = str(combination)
for i, line in enumerate(lines):
    if combination_label in line:
        # The slices drop the line prefix (12 / 7 / 12 characters) and the
        # trailing newline; the scores are kept as strings.
        DT_windiff = lines[i+2][12:-1].strip('][').split(', ')
        DT_pk = lines[i+3][7:-1].strip('][').split(', ')
        DT_kappa = lines[i+4][12:-1].strip('][').split(', ')

        NB_windiff = lines[i+6][12:-1].strip('][').split(', ')
        NB_pk = lines[i+7][7:-1].strip('][').split(', ')
        NB_kappa = lines[i+8][12:-1].strip('][').split(', ')


['0.5068850229500765', '0.574698402347571', '0.4960989156307855', '0.5528111600942086', '0.5143441313654079', '0.5543728423475259', '0.5456642953278216', '0.5395055744062045', '0.5091479700854701', '0.5137648809523809']
