# Test basic classification models for topic segmentation

In [57]:
import model.load_data as ld
import model.scoring_metrics as sm
import model_trainer_and_tester as mtt
import importlib
importlib.reload(sm)
importlib.reload(ld)
importlib.reload(sm)
importlib.reload(mtt)

import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
from itertools import combinations

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

import sys
# Print NumPy arrays in full instead of summarising long arrays with "...".
np.set_printoptions(threshold=sys.maxsize)
# Show every DataFrame column when pandas renders a frame (no column truncation).
pd.set_option('display.max_columns', None)


## Splits

In [39]:
# Dataset identifiers, obtained by splitting the string on single spaces
# (presumably one ID per recorded meeting -- TODO confirm).
datasets = """Bed002 Bed003 Bed004 Bed005 Bed006 Bed008 Bed009 Bed010 Bed011 Bed012 Bed013 Bed014 Bed015 Bed016 Bed017 Bmr001 Bmr002 Bmr005 Bmr007 Bmr009 Bmr010 Bmr011 Bmr012 Bmr013 Bmr014 Bmr018 Bmr019 Bmr021 Bmr022 Bmr024 Bmr025 Bmr026 Bmr027 Bmr029 Bns001 Bns002 Bns003 Bro003 Bro004 Bro005 Bro007 Bro008 Bro010 Bro011 Bro012 Bro013 Bro014 Bro015 Bro016 Bro017 Bro018 Bro019 Bro021 Bro022 Bro023 Bro024 Bro025 Bro026 Bro027 Bro028 Bsr001 Btr001 Btr002""".split(" ")
# Directory handed to ld.train_test_split together with the dataset IDs.
results_merged_path = "../results_merged_f0_stds_fixed/"

# Full feature list and the subset actually passed to the loaders (identical here).
all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
features_selected = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
# Split ratio passed to the loaders; later print statements label it "(of testing)".
split=0.3

# Represents the context that is being used for training and for evaluation
shifts = [-3,-2,-1, 1,2,3]

# NOTE(review): the recorded run of mtt.read_in_dataset_all_together with this value
# ended in FileNotFoundError ('../topic_boundaries/Bro016_topic_boundaries_lvl_3.csv');
# the later experiment cells only use levels 0-2 -- confirm that level 3 data exists.
subtopic_lvl = 3

### Basic random split

In [37]:
# Split without context features; the helper receives the dataset IDs, the data directory and the split ratio.
X_train, y_train, X_test, y_test = ld.train_test_split(datasets,results_merged_path,split)

### Basic random split + context

In [40]:
# Overwrites the X_train/y_train/X_test/y_test produced by the basic split above.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for the level-3 boundary file.
X_train, y_train, X_test, y_test = mtt.read_in_dataset_all_together(features_selected,shifts,split,subtopic_lvl)

FileNotFoundError: [Errno 2] No such file or directory: '../topic_boundaries/Bro016_topic_boundaries_lvl_3.csv'

### Constant split + context (to compare with biLSTM)

In [58]:
# Read the 'train' portion with the selected features and context shifts.
X_train_const, y_train_const = mtt.read_in_dataset(features_selected, shifts, to_read='train')

# Untuned decision tree; no random_state is set, so repeated fits are not guaranteed to match.
clf = DecisionTreeClassifier()
clf.fit(X_train_const, y_train_const)

# Evaluation is delegated to mtt; the next cell averages the returned result with .mean().
results_const = mtt.test_set_evaluate_multiple(clf, features_selected, shifts)

In [59]:
# Average of each metric returned by the evaluation (Pk, K-k and Windiff in the recorded output).
results_const.mean()

Pk         0.572100
K-k       -0.066113
Windiff    0.572100
dtype: float64

## Test on commonly used models for topic segmentation

In [20]:
def print_eval(y_pred,y_true):
    """Print the window size and the three segmentation scores of a prediction.

    Args:
        y_pred: Predicted labels, one per sample.
        y_true: Reference labels, one per sample; their sum is used as the
            number of positive labels when deriving the window size.

    The window size is ``floor((len(y_true) + 1) / (2 * (sum(y_true) + 1)))``,
    clamped to at least 1. Scores come from ``sm.get_windiff``, ``sm.get_pk``
    and ``sm.get_k_kappa``, each called with (reference, prediction, k).
    """
    positives = sum(y_true)
    k = int(max(1, np.floor((len(y_true) + 1) / (2 * (positives + 1)))))
    print('k =',k)

    # Each scorer gets freshly built arrays, exactly one pair per call.
    report = (
        ('- windiff:', sm.get_windiff),
        ('- pk:', sm.get_pk),
        ('- kkappa:', sm.get_k_kappa),
    )
    for label, scorer in report:
        print(label, scorer(np.array(y_true), np.array(y_pred), k))

### Decision Tree classifier with/without hyperparameter tuning

In [8]:
def DecTree(X_train, X_test, y_train,tuning=False, best_criterion=None,best_max_depth=None,best_min_sample_leaf=None):
    """Fit a decision tree on the training data and predict the test data.

    Args:
        X_train: Training features.
        X_test: Features to predict.
        y_train: Training labels.
        tuning: When true, the three ``best_*`` values are forwarded to the
            classifier; otherwise scikit-learn defaults are used.
        best_criterion: ``criterion`` used when ``tuning`` is true.
        best_max_depth: ``max_depth`` used when ``tuning`` is true.
        best_min_sample_leaf: ``min_samples_leaf`` used when ``tuning`` is true.

    Returns:
        The predictions of the fitted tree for ``X_test``.
    """
    tree_kwargs = {}
    if tuning:
        tree_kwargs = {
            'criterion': best_criterion,
            'max_depth': best_max_depth,
            'min_samples_leaf': best_min_sample_leaf,
        }

    model = DecisionTreeClassifier(**tree_kwargs)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [62]:
def DecTree_hyperparam(X,y):
    """Grid-search decision-tree hyperparameters and return the best combination.

    A pipeline of ``StandardScaler`` followed by ``DecisionTreeClassifier`` is
    tuned with ``GridSearchCV`` (default cross-validation and scoring) over
    ``criterion``, ``max_depth`` and ``min_samples_leaf``.

    Args:
        X: Training features.
        y: Training labels.

    Returns:
        Tuple ``(best_criterion, best_max_depth, best_min_samples_leaf)``.
    """
    pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                           ('dec_tree', DecisionTreeClassifier())])

    # NOTE(review): 'log_loss' is only accepted by recent scikit-learn releases --
    # confirm the installed version supports it.
    parameters = {
        'dec_tree__criterion': ['gini', 'entropy', 'log_loss'],
        'dec_tree__max_depth': [2, 5, 10, 15, 20, 30, 50, 100],
        'dec_tree__min_samples_leaf': [5, 10, 20, 50, 100],
    }

    clf_GS = GridSearchCV(pipe, parameters)
    clf_GS.fit(X, y)

    # best_params_ holds exactly the winning grid point, keyed like `parameters`.
    best = clf_GS.best_params_
    best_criterion = best['dec_tree__criterion']
    best_max_depth = best['dec_tree__max_depth']
    best_min_samples_leaf = best['dec_tree__min_samples_leaf']

    print('Best criterion:', best_criterion)
    print('Best max_depth:', best_max_depth)
    print('Best min_sample_leaf:', best_min_samples_leaf)

    return best_criterion,best_max_depth,best_min_samples_leaf

In [21]:
# Tune on the training split, then refit a tree with the winning values and score it on the test split.
# NOTE(review): the recorded output is a NameError -- the cell defining DecTree_hyperparam
# has to be executed before this one.
best_criterion,best_max_depth,best_min_sample_leaf = DecTree_hyperparam(X_train,y_train)

print("-------------Tuned DST")
DT_y_predicted = DecTree(X_train,X_test,y_train,True,best_criterion,best_max_depth,best_min_sample_leaf)
# Sum of the predicted labels (the count of predicted 1s if labels are 0/1).
print(sum(DT_y_predicted))
print_eval(DT_y_predicted,y_test)

NameError: name 'DecTree_hyperparam' is not defined

In [23]:
# Baseline: decision tree with scikit-learn defaults.
print("-------------Normal DST")
DT_y_predicted = DecTree(X_train,X_test,y_train,False)
# Sum of predicted labels next to the sum of reference labels.
print(sum(DT_y_predicted),sum(y_test))
print_eval(DT_y_predicted,y_test)
# Positions where prediction and reference are both 1; not used further in the visible cells.
indexes = [index for index in range(len(DT_y_predicted)) if (DT_y_predicted[index]==1)and(y_test[index]==1)]

-------------Normal DST
417 312
k = 44
- windiff: 0.5588575560375995
- pk: 0.47736804049168474
- kkappa: 0.03370331277192754


### SVM with/without hyperparameter tuning

In [9]:
def SVM(X_train,X_test,y_train,tuning=False,best_gamma=None,best_C=None):
    """Fit a support vector classifier and predict the test data.

    Args:
        X_train: Training features.
        X_test: Features to predict.
        y_train: Training labels.
        tuning: When true, ``best_gamma`` and ``best_C`` are forwarded to
            ``SVC``; otherwise scikit-learn defaults are used.
        best_gamma: ``gamma`` used when ``tuning`` is true.
        best_C: ``C`` used when ``tuning`` is true.

    Returns:
        The predictions of the fitted classifier for ``X_test``.
    """
    svc_kwargs = {'gamma': best_gamma, 'C': best_C} if tuning else {}

    model = SVC(**svc_kwargs)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
def SVM_hyperparam(X,y):
    """Grid-search ``gamma`` and ``C`` for an ``SVC`` and return the best pair.

    The search uses 2-fold cross-validation, all available cores
    (``n_jobs=-1``) and prints progress (``verbose=1``).

    Args:
        X: Training features.
        y: Training labels.

    Returns:
        Tuple ``(best_gamma, best_C)``.
    """
    candidates = [0.1, 1.0, 10]
    search_space = {"gamma": list(candidates), "C": list(candidates)}

    grid_search = GridSearchCV(SVC(), search_space, cv=2, verbose=1, n_jobs=-1)
    grid_search.fit(X, y)

    winner = grid_search.best_estimator_.get_params()
    best_gamma, best_C = winner['gamma'], winner['C']
    print('Best gamma:', best_gamma)
    print('Best C:', best_C)

    return best_gamma,best_C

In [None]:
# Tune gamma and C on the training split, then refit with the winners and score on the test split.
best_gamma,best_C = SVM_hyperparam(X_train,y_train)

print("-------------Tuned SVM")
SVM_y_predicted = SVM(X_train,X_test,y_train,True,best_gamma,best_C)
# Sum of the predicted labels (the count of predicted 1s if labels are 0/1).
print(sum(SVM_y_predicted))
print_eval(SVM_y_predicted,y_test)



Fitting 2 folds for each of 9 candidates, totalling 18 fits
Best gamma: 0.1
Best C: 0.1


In [41]:
# Baseline: SVC with scikit-learn defaults.
# NOTE(review): in the recorded output the predicted labels sum to 0.0 against 288.0 in the reference.
print("-------------Normal SVM")
SVM_y_predicted = SVM(X_train,X_test,y_train,False)
print(sum(SVM_y_predicted),sum(y_test))
print_eval(SVM_y_predicted,y_test)

-------------Normal SVM
0.0 288.0
k = 46
- windiff: 0.4033554051037869
- pk: 0.4033554051037869
- kkappa: 0.0


### Naive Bayes for binary class with/without hyperparameter tuning

In [10]:
def naiveBayes(X_train,X_test,y_train,tuning=False,best_var_smoothing=None):
    """Fit a Gaussian naive Bayes classifier and predict the test data.

    Args:
        X_train: Training features.
        X_test: Features to predict.
        y_train: Training labels; flattened with ``np.ravel`` before fitting.
        tuning: When true, ``best_var_smoothing`` is forwarded to
            ``GaussianNB``; otherwise scikit-learn defaults are used.
        best_var_smoothing: ``var_smoothing`` used when ``tuning`` is true.

    Returns:
        The predictions of the fitted classifier for ``X_test``.
    """
    nb_kwargs = {'var_smoothing': best_var_smoothing} if tuning else {}

    model = GaussianNB(**nb_kwargs)
    model.fit(X_train, np.ravel(y_train))
    return model.predict(X_test)

In [None]:
def naiveBayes_hyperparam(X,y):
    """Grid-search ``var_smoothing`` for ``GaussianNB`` and return the best value.

    100 candidates, log-spaced from 1 down to 1e-9, are compared with 10-fold
    cross-validation on all available cores (``n_jobs=-1``).

    Args:
        X: Training features.
        y: Training labels.

    Returns:
        The ``var_smoothing`` value of the best estimator.
    """
    smoothing_candidates = np.logspace(0,-9, num=100)
    nbModel_grid = GridSearchCV(
        estimator=GaussianNB(),
        param_grid={'var_smoothing': smoothing_candidates},
        verbose=1,
        cv=10,
        n_jobs=-1,
    )
    nbModel_grid.fit(X, y)

    winner = nbModel_grid.best_estimator_.get_params()
    best_var_smoothing = winner['var_smoothing']
    print('Best var_smoothing:', best_var_smoothing)

    return best_var_smoothing

In [70]:
# Tune var_smoothing on the training split, then refit with the winner and score on the test split.
# NOTE(review): the recorded output is a NameError -- the cell defining naiveBayes_hyperparam
# has to be executed before this one.
best_var_smoothing = naiveBayes_hyperparam(X_train,y_train)

print("-------------Tuned NB")
NB_y_predicted = naiveBayes(X_train,X_test,y_train,True,best_var_smoothing)
# Sum of the predicted labels (the count of predicted 1s if labels are 0/1).
print(sum(NB_y_predicted))
print_eval(NB_y_predicted,y_test)

NameError: name 'naiveBayes_hyperparam' is not defined

In [30]:
# Baseline: GaussianNB with scikit-learn defaults.
print("-------------Normal NB")
NB_y_predicted = naiveBayes(X_train,X_test,y_train,False)
# Sum of predicted labels next to the sum of reference labels.
print(sum(NB_y_predicted),sum(y_test))
print_eval(NB_y_predicted,y_test)

-------------Normal NB
216 98
k = 121
- windiff: 0.5901898734177216
- pk: 0.4794720186542305
- kkappa: 0.010325750790288572


## Multiple runs of each algo with different random splits

In [59]:
def get_eval(y_pred,y_true):
    """Compute the three segmentation scores of a prediction.

    Args:
        y_pred: Predicted labels, one per sample.
        y_true: Reference labels, one per sample; their sum is used as the
            number of positive labels when deriving the window size.

    Returns:
        Tuple ``(windiff, pk, kappa)`` as returned by ``sm.get_windiff``,
        ``sm.get_pk`` and ``sm.get_k_kappa``, each called with
        (reference, prediction, k) where
        ``k = max(1, floor((len(y_true) + 1) / (2 * (sum(y_true) + 1))))``.
    """
    positives = sum(y_true)
    k = int(max(1, np.floor((len(y_true) + 1) / (2 * (positives + 1)))))

    # Same order as the returned tuple; each scorer gets freshly built arrays.
    scorers = (sm.get_windiff, sm.get_pk, sm.get_k_kappa)
    return tuple(scorer(np.array(y_true), np.array(y_pred), k) for scorer in scorers)

def _mean_and_variance(values):
    """Return (mean, sample variance) of ``values``, using NaN where undefined."""
    undefined = float('nan')
    if not values:
        return undefined, undefined
    # statistics.variance raises StatisticsError for fewer than two data points.
    spread = statistics.variance(values) if len(values) > 1 else undefined
    return statistics.mean(values), spread


def get_avg_eval(eval):
    """Print the per-run scores plus their mean and sample variance.

    Args:
        eval: Sequence of ``(windiff, pk, kappa)`` rows, one per run (the
            shape produced by ``get_eval``). The name shadows the builtin
            but is kept for compatibility with existing callers.

    A single run (e.g. ``iterations = 1``) is reported with a variance of
    ``nan`` instead of raising ``statistics.StatisticsError``; an empty
    sequence reports ``nan`` for both mean and variance.
    """
    windiffs = [row[0] for row in eval]
    pks = [row[1] for row in eval]
    kappas = [row[2] for row in eval]

    print('-> windiff:',windiffs)
    print('-> pk:',pks)
    print('-> k-kappa:',kappas)
    print()
    for label, values in (('windiff', windiffs), ('pk', pks), ('k-kappa', kappas)):
        mean, variance = _mean_and_variance(values)
        print('-> ' + label + ' - mean:',mean,', - var:',variance)


def run_model(iterations,features,shift,split,subtopic_lvl):
    """Evaluate the untuned decision tree and naive Bayes on repeated splits.

    Args:
        iterations: Number of times the data is re-read and both models are
            trained and scored.
        features: Feature names passed to the dataset loader.
        shift: Context shifts passed to the dataset loader. This argument is
            now actually used; previously the module-level ``shifts`` was read
            instead, so callers varying the context had no effect.
        split: Split ratio passed to the dataset loader.
        subtopic_lvl: Subtopic level passed to the dataset loader.

    Returns:
        Tuple ``(DT_scores, NB_scores)``; each is a list with one
        ``get_eval`` result per iteration.
    """
    DT_scores = []
    NB_scores = []
    # SVM evaluation is intentionally left out of this loop (it was commented out in the original).
    for _ in range(iterations):
        X_train, y_train, X_test, y_test = mtt.read_in_dataset_all_together(features,shift,split,subtopic_lvl)

        DT_scores.append(get_eval(DecTree(X_train,X_test,y_train,False),y_test))
        NB_scores.append(get_eval(naiveBayes(X_train,X_test,y_train,False),y_test))

    return DT_scores,NB_scores
    
    

### Test different context levels

In [60]:
# Experiment: shrink the context window symmetrically, one shift per side at a time.
all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
features_selected = list(all_features)
split = 0.3
subtopic_lvl = 2

# Every offset from -10 to 10 except 0.
shifts = [offset for offset in range(-10, 11) if offset != 0]

iterations = 1
print('EXPERIMENTING ON CONTEXT LEVEL WITH SPLIT:',split,'(of testing), ITERATIONS:',iterations,', SUBTOPIC LVL:',subtopic_lvl,', AND FEATURES:',features_selected,)

# One pass per value of i with i < len(shifts)/2.
for i in range((len(shifts) + 1) // 2):
    # Drop the i outermost shifts on each side.
    curr_shifts = shifts[i:len(shifts) - i]

    DT_scores, NB_scores = run_model(iterations,features_selected,curr_shifts,split,subtopic_lvl)

    print('-> Context level =',len(curr_shifts)/2)
    print('----- DT:')
    get_avg_eval(DT_scores)
    print()
    # SVM reporting is disabled; the blank line it was followed by is kept.
    print()
    print('----- NB:')
    get_avg_eval(NB_scores)
    print()
    print()

EXPERIMENTING ON CONTEXT LEVEL WITH SPLIT: 0.3 (of testing), ITERATIONS: 1 , SUBTOPIC LVL: 2 , AND FEATURES: ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']


KeyboardInterrupt: 

### Test different subtopic levels

In [61]:
# Experiment: same features and context, one run per subtopic level.
all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
features_selected = list(all_features)
split = 0.3
shifts = [-2, -1, 1, 2]

subtopic_lvls = [0,1,2]

iterations = 5
print('EXPERIMENTING ON SUBTOPIC LEVEL WITH SPLIT:',split,'(of testing), ITERATIONS:',iterations,', SHIFTS:',shifts,', AND FEATURES:',features_selected,)

for curr_subtopic_lvl in subtopic_lvls:
    DT_scores, NB_scores = run_model(iterations,features_selected,shifts,split,curr_subtopic_lvl)

    print('-> Subtopic level =',curr_subtopic_lvl)
    print('----- DT:')
    get_avg_eval(DT_scores)
    print()
    # SVM reporting is disabled; the blank line it was followed by is kept.
    print()
    print('----- NB:')
    get_avg_eval(NB_scores)
    print()
    print()

EXPERIMENTING ON SUBTOPIC LEVEL WITH SPLIT: 0.3 (of testing), ITERATIONS: 5 , SHIFTS: [-2, -1, 1, 2] , AND FEATURES: ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']


KeyboardInterrupt: 

### Test different combinations of features

In [58]:
# Experiment: every non-empty subset of the features, grouped by subset size.
datasets = """Bed002 Bed003 Bed004 Bed005 Bed006 Bed008 Bed009 Bed010 Bed011 Bed012 Bed013 Bed014 Bed015 Bed016 Bed017 Bmr001 Bmr002 Bmr005 Bmr007 Bmr009 Bmr010 Bmr011 Bmr012 Bmr013 Bmr014 Bmr018 Bmr019 Bmr021 Bmr022 Bmr024 Bmr025 Bmr026 Bmr027 Bmr029 Bns001 Bns002 Bns003 Bro003 Bro004 Bro005 Bro007 Bro008 Bro010 Bro011 Bro012 Bro013 Bro014 Bro015 Bro016 Bro017 Bro018 Bro019 Bro021 Bro022 Bro023 Bro024 Bro025 Bro026 Bro027 Bro028 Bsr001 Btr001 Btr002""".split(" ")
results_merged_path = "../results_merged_f0_stds_fixed/"

all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
features_selected = list(all_features)

split = 0.3
shifts = [-2, -1, 1, 2]
subtopic_lvl = 1

iterations = 2

print('EXPERIMENTING ON FEATURES COMBINATIONS WITH SPLIT:',split,'(of testing), ITERATIONS:',iterations,', SHIFTS:',shifts,', AND SUBTOPIC LVL:',subtopic_lvl)
for i in range(1, len(all_features) + 1):
    print('------ALL COMBINATIONS OF LENGTH',i,'------')
    # combinations() yields tuples; each one is turned into a list before use.
    for feature_comb in map(list, combinations(all_features, i)):
        DT_scores, NB_scores = run_model(iterations,feature_comb,shifts,split,subtopic_lvl)

        print('-> Features =',feature_comb)
        print('----- DT:')
        get_avg_eval(DT_scores)
        print()
        # SVM reporting is disabled; the blank line it was followed by is kept.
        print()
        print('----- NB:')
        get_avg_eval(NB_scores)
        print()
        print()
    print()



EXPERIMENTING ON FEATURES COMBINATIONS WITH SPLIT: 0.3 (of testing), ITERATIONS: 2 , SHIFTS: [-2, -1, 1, 2] , AND SUBTOPIC LVL: 1
------ALL COMBINATIONS OF LENGTH 1 ------
-> Features = ['pause']
----- DT:
-> windiff: [0.5622854340362923, 0.5340374763141273]
-> pk: [0.47218524486793245, 0.464804547687557]
-> k-kappa: [0.0594864010078124, 0.03817162002451795]

-> windiff - mean: 0.5481614551752099 , - var: 0.00039897355773660864
-> pk - mean: 0.46849489627774477 , - var: 2.72373454342009e-05
-> k-kappa - mean: 0.04882901051616517 , - var: 0.0002271599441829054


----- NB:
-> windiff: [0.5265536327331325, 0.45368096006737313]
-> pk: [0.4567715266587263, 0.400835146326058]
-> k-kappa: [0.0043759265917455775, 0.07519984148327805]

-> windiff - mean: 0.4901172964002528 , - var: 0.0026552132107254558
-> pk - mean: 0.4288033364923921 , - var: 0.0015644393223604599
-> k-kappa - mean: 0.03978788403751181 , - var: 0.0025080134602815172


-> Features = ['speakerChange']
----- DT:
-> windiff: [0.3

KeyboardInterrupt: 

## GridSearch for combinations of features, context lvl, subtopic lvl

In [67]:
# Experiment: exhaustive sweep over feature subsets x context windows x subtopic levels.
all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
features_selected = list(all_features)
split = 0.3
iterations = 10

subtopic_lvls = [0,1,2]
# Every offset from -10 to 10 except 0.
shifts = [offset for offset in range(-10, 11) if offset != 0]

print('EXPERIMENTING ON ALL PARAMETERS (combinations of features, context lvl (shifts), subtopic lvl) WITH SPLIT:',split,'(of testing), ITERATIONS:',iterations)

for i in range(1, len(all_features) + 1):
    # combinations() yields tuples; each one is turned into a list before use.
    for feature_comb in map(list, combinations(all_features, i)):
        print('------FEATURES:',feature_comb,'------')

        # One pass per value of j with j < len(shifts)/2.
        for j in range((len(shifts) + 1) // 2):
            # Drop the j outermost shifts on each side.
            curr_shifts = shifts[j:len(shifts) - j]
            print('------SHIFTS:',curr_shifts,'------')

            for curr_subtopic_lvl in subtopic_lvls:
                print('------SUBTOPIC LVL:',curr_subtopic_lvl,'------')

                DT_scores, NB_scores = run_model(iterations,feature_comb,curr_shifts,split,curr_subtopic_lvl)

                for heading, scores in (('----- DT:', DT_scores), ('----- NB:', NB_scores)):
                    print(heading)
                    get_avg_eval(scores)
                    print()
            print()
    print()

EXPERIMENTING ON ALL PARAMETERS (combinations of features, context lvl (shifts), subtopic lvl) WITH SPLIT: 0.3 (of testing), ITERATIONS: 10
------FEATURES: ['pause'] ------
------SHIFTS: [-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] ------
------SUBTOPIC LVL: 0 ------
----- DT:
-> windiff: [0.5233619344773791, 0.5031118776244751, 0.5547329250329482, 0.5772557172557172, 0.5659848365187174, 0.5385362694300518, 0.5791221542031769, 0.551570423687224, 0.5617176297329768, 0.5434145282131094]
-> pk: [0.45842433697347895, 0.4361127774445111, 0.4862004806574153, 0.4907692307692308, 0.4765834781235192, 0.46377104922279794, 0.5088224702916817, 0.49799607394078194, 0.4771925745068403, 0.46002388794382715]
-> k-kappa: [0.004865370229170394, 0.057704898741927545, -0.0016739910559795108, -0.02758593756725326, 0.05374553172494793, 0.031682726798075024, -0.05921612210242258, -0.06650699292914379, 0.04474415963341396, 0.039273906957996284]

-> windiff - mean: 0.549880829617577

KeyboardInterrupt: 

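#### Reading the saved output of the experiments
Please place the output file (`simple_models.txt`) in the project folder, i.e. the directory the notebook is run from, before executing the next cell.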

In [8]:
# Load the saved experiment log from the current working directory.
path = os.path.realpath(os.path.join(os.getcwd(), "simple_models.txt"))
# The context manager closes the handle as soon as the lines are read
# (the original left the file open).
with open(path, 'r') as file:
    lines = file.readlines()

combination=['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']

# The log is searched for the printed form of the feature list.
combination_text = str(combination)

# NOTE(review): the line offsets (+2..+8) and character offsets (12 / 7) below assume a
# fixed record layout in simple_models.txt, which is not visible here -- confirm.
# Every match overwrites the variables, so the last matching record wins;
# the parsed values stay strings (no float conversion is done).
for i, line in enumerate(lines):
    if combination_text in line:
        DT_windiff=lines[i+2][12:-1].strip('][').split(', ')
        DT_pk=lines[i+3][7:-1].strip('][').split(', ')
        DT_kappa=lines[i+4][12:-1].strip('][').split(', ')

        NB_windiff=lines[i+6][12:-1].strip('][').split(', ')
        NB_pk=lines[i+7][7:-1].strip('][').split(', ')
        NB_kappa=lines[i+8][12:-1].strip('][').split(', ')


['0.10658901085020546', '0.10266616909897681', '-0.04659279581705039', '0.029284132290540693', '0.12114758465146477', '0.05468937802760812', '-0.04198029181586795', '0.0025401207878630134', '0.1894085782008286', '0.07655725318534783']
