# Test basic classification models for topic segmentation

In [28]:
import model.load_data as ld
import model.scoring_metrics as sm
import importlib
importlib.reload(sm)
importlib.reload(ld)

import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import xgboost as xgb
xgb.set_config(verbosity=0) 

import sys
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_columns', None)


In [2]:
# Dataset identifiers handed to the loader, laid out by identifier prefix.
datasets = (
    "Bed002 Bed003 Bed004 Bed005 Bed006 Bed008 Bed009 Bed010 Bed011 Bed012 "
    "Bed013 Bed014 Bed015 Bed016 Bed017 "
    "Bmr001 Bmr002 Bmr005 Bmr007 Bmr009 Bmr010 Bmr011 Bmr012 Bmr013 Bmr014 "
    "Bmr018 Bmr019 Bmr021 Bmr022 Bmr024 Bmr025 Bmr026 Bmr027 Bmr029 "
    "Bns001 Bns002"
).split(" ")
results_merged_path = "../results_merged_fixedf0/"

# The third argument is the split value (presumably the test fraction -- TODO confirm in load_data).
X_train, y_train, X_test, y_test = ld.train_test_split(
    datasets,
    results_merged_path,
    0.3,
)

# `all_features` lists every available column; edit `features_selected` to try a subset.
all_features = [
    'pause',
    'speakerChange',
    'similarity',
    'f0_diff',
    'f0_baseline_diff',
]
features_selected = [
    'pause',
    'speakerChange',
    'similarity',
    'f0_diff',
    'f0_baseline_diff',
]

# Keep only the selected feature columns in both splits.
X_train = X_train[features_selected]
X_test = X_test[features_selected]

## Test on commonly used models

In [3]:
def print_eval(y_pred,y_true):
    """Print the window size and the three segmentation scores for a prediction.

    The window size ``k`` is ``floor((len(y_true) + 1) / (2 * (sum(y_true) + 1)))``,
    clamped to a minimum of 1.  Both label sequences are converted to numpy
    arrays and scored with ``sm.get_windiff``, ``sm.get_pk`` and
    ``sm.get_k_kappa`` (reference first, prediction second).

    Parameters
    ----------
    y_pred : sequence of predicted labels.
    y_true : sequence of reference labels.
    """
    window_ratio = (len(y_true) + 1) / (2 * (sum(y_true) + 1))
    k = int(max(1, np.floor(window_ratio)))
    print('k =', k)

    hypothesis = np.array(y_pred)
    reference = np.array(y_true)

    print('- windiff:', sm.get_windiff(reference, hypothesis, k))
    print('- pk:', sm.get_pk(reference, hypothesis, k))
    print('- kkappa:', sm.get_k_kappa(reference, hypothesis, k))

### Decision Tree classifier

In [4]:
def DecTree(X_train, X_test, y_train, random_state=None):
    """Fit a decision tree on the training data and predict the test labels.

    Parameters
    ----------
    X_train : training features, passed to ``DecisionTreeClassifier.fit``.
    X_test : features to predict on.
    y_train : training labels.
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``.  The default ``None`` keeps
        the estimator unseeded (the previous behaviour); pass an integer to
        make repeated runs reproducible.

    Returns
    -------
    The predictions of the fitted tree for ``X_test``.
    """
    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)

    # Optional diagnostics, kept disabled: feature importances and a plot of the tree.
    # # get importance
    # importance = clf.feature_importances_
    # # summarize feature importance
    # for i,v in enumerate(importance):
    #     print('Feature:',features_selected[i],'->',v)
    # # plot feature importance
    # plt.bar(features_selected, importance)
    # plt.title('Feature Importance')
    # plt.show()

    # # plot tree
    # plt.figure(figsize=(15,10))  # set plot size (denoted in inches)
    # tree.plot_tree(clf, max_depth=3, fontsize=10,feature_names=features_selected)
    # plt.show()

    return y_predicted

In [18]:
# Fit the decision tree on the training split and predict the test split.
DT_y_predicted = DecTree(X_train,X_test,y_train)
# Sum of the predicted labels (the count of positive predictions if labels are 0/1 -- TODO confirm).
print(sum(DT_y_predicted))
# Print k and the windiff / pk / kkappa scores of the predictions against y_test.
print_eval(DT_y_predicted,y_test)

137.0
k = 54
- windiff: 0.5154753643303261
- pk: 0.4587786259541985
- kkappa: 0.05070403514991029


### XGB Classifier for binary class

In [6]:
def XGB_class(X_train, X_test, y_train):
    """Fit an XGBoost classifier on the training data and predict the test labels.

    The classifier is built with ``seed=24`` and ``use_label_encoder=False``.

    Parameters
    ----------
    X_train : training features.
    X_test : features to predict on.
    y_train : training labels.

    Returns
    -------
    The classifier's predictions for ``X_test``.
    """
    booster = xgb.XGBClassifier(seed = 24, use_label_encoder =False)
    booster.fit(X_train, y_train)
    return booster.predict(X_test)

In [19]:
# Fit the XGBoost classifier on the training split and predict the test split.
XGBc_y_predicted = XGB_class(X_train,X_test,y_train)
# Sum of the predicted labels (the count of positive predictions if labels are 0/1 -- TODO confirm).
print(sum(XGBc_y_predicted))
# Print k and the windiff / pk / kkappa scores of the predictions against y_test.
print_eval(XGBc_y_predicted,y_test)



3
k = 54
- windiff: 0.4111727966689799
- pk: 0.4074947952810548
- kkappa: 0.005407708984989378


### SVM for binary class

In [8]:
def SVMc(X_train,X_test,y_train):
    """Fit a linear-kernel SVM on the training data and predict the test labels.

    Parameters
    ----------
    X_train : training features.
    X_test : features to predict on.
    y_train : training labels.

    Returns
    -------
    The classifier's predictions for ``X_test``.
    """
    # ``probability=True`` is intentionally not set: only ``predict`` is used
    # here, and enabling probability estimates makes ``fit`` run an additional
    # internal cross-validation without affecting the predicted labels.
    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)

    return y_predicted

In [20]:
# Fit the linear SVM on the training split and predict the test split.
SVM_y_predicted = SVMc(X_train,X_test,y_train)
# Sum of the predicted labels (the count of positive predictions if labels are 0/1 -- TODO confirm).
print(sum(SVM_y_predicted))
# Print k and the windiff / pk / kkappa scores of the predictions against y_test.
print_eval(SVM_y_predicted,y_test)

0.0
k = 54
- windiff: 0.4076335877862595
- pk: 0.4076335877862595
- kkappa: 0.0


### Random Forest classifier for binary class

In [10]:
def RFc(X_train,X_test,y_train,random_state=None):
    """Fit a random forest on the training data and predict the test labels.

    Parameters
    ----------
    X_train : training features.
    X_test : features to predict on.
    y_train : training labels.
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier``.  The default ``None`` keeps
        the estimator unseeded (the previous behaviour); pass an integer to
        make repeated runs reproducible.

    Returns
    -------
    The forest's predictions for ``X_test``.
    """
    model = RandomForestClassifier(random_state=random_state).fit(X_train,y_train)
    y_predicted = model.predict(X_test)

    return y_predicted

In [21]:
# Fit the random forest on the training split and predict the test split.
RFc_y_predicted = RFc(X_train,X_test,y_train)
# Sum of the predicted labels (the count of positive predictions if labels are 0/1 -- TODO confirm).
print(sum(RFc_y_predicted))
# Print k and the windiff / pk / kkappa scores of the predictions against y_test.
print_eval(RFc_y_predicted,y_test)

2.0
k = 54
- windiff: 0.41512838306731437
- pk: 0.41512838306731437
- kkappa: -0.014938850491822761


### Logistic Regression classifier for binary class

In [12]:
def LR(X_train,X_test,y_train):
    """Fit a logistic regression on the training data and predict the test labels.

    The model is created with ``random_state=0``.

    Parameters
    ----------
    X_train : training features.
    X_test : features to predict on.
    y_train : training labels.

    Returns
    -------
    The predictions of the fitted model for ``X_test``.
    """
    estimator = LogisticRegression(random_state=0)
    fitted = estimator.fit(X_train, y_train)
    return fitted.predict(X_test)

In [22]:
# Fit the logistic regression on the training split and predict the test split.
LR_y_predicted = LR(X_train,X_test,y_train)
# Sum of the predicted labels (the count of positive predictions if labels are 0/1 -- TODO confirm).
print(sum(LR_y_predicted))
# Print k and the windiff / pk / kkappa scores of the predictions against y_test.
print_eval(LR_y_predicted,y_test)

0.0
k = 54
- windiff: 0.4076335877862595
- pk: 0.4076335877862595
- kkappa: 0.0


### Naive Bayes for binary class

In [39]:
def naiveBayes(X_train,X_test,y_train):
    """Tune Gaussian naive Bayes by grid search and predict the test labels.

    ``var_smoothing`` is searched over 100 log-spaced values from 1 down to
    1e-9 with 10-fold cross-validation.  The selected value is printed.

    Parameters
    ----------
    X_train : training features.
    X_test : features to predict on.
    y_train : training labels.

    Returns
    -------
    The predictions of the best estimator for ``X_test``.
    """
    param_grid_nb = {
        'var_smoothing': np.logspace(0,-9, num=100)
    }
    # refit=True (the GridSearchCV default, spelled out here) retrains the best
    # candidate on the whole training set, so no second manual fit is needed.
    nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1, refit=True)
    nbModel_grid.fit(X_train, y_train)

    best_var_smoothing = nbModel_grid.best_params_['var_smoothing']
    print('Best var_smoothing:', best_var_smoothing)

    y_predicted = nbModel_grid.best_estimator_.predict(X_test)

    return y_predicted

In [40]:
# Grid-search Gaussian naive Bayes on the training split and predict the test split.
NB_y_predicted = naiveBayes(X_train,X_test,y_train)
# Sum of the predicted labels (the count of positive predictions if labels are 0/1 -- TODO confirm).
print(sum(NB_y_predicted))
# Print k and the windiff / pk / kkappa scores of the predictions against y_test.
print_eval(NB_y_predicted,y_test)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
45.0
k = 54
- windiff: 0.44802220680083277
- pk: 0.4178348369188064
- kkappa: 0.02655901544051302


## Multiple runs of each algorithm with different splits

In [14]:
# Dataset identifiers and results directory used by split_data, laid out by identifier prefix.
datasets = (
    "Bed002 Bed003 Bed004 Bed005 Bed006 Bed008 Bed009 Bed010 Bed011 Bed012 "
    "Bed013 Bed014 Bed015 Bed016 Bed017 "
    "Bmr001 Bmr002 Bmr005 Bmr007 Bmr009 Bmr010 Bmr011 Bmr012 Bmr013 Bmr014 "
    "Bmr018 Bmr019 Bmr021 Bmr022 Bmr024 Bmr025 Bmr026 Bmr027 Bmr029 "
    "Bns001 Bns002"
).split(" ")
results_merged_path = "../results_merged_fixedf0/"

def split_data(split,features_selected):
    """Load a train/test split and return cleaned feature frames.

    Calls ``ld.train_test_split(datasets, results_merged_path, split)`` and
    cleans both feature frames: the ``similarity`` column is made numeric,
    missing values are replaced by 0, and only ``features_selected`` columns
    are kept.  The frames returned by the loader are not modified.

    Parameters
    ----------
    split : value forwarded to the loader as its third argument
        (presumably the test fraction -- TODO confirm in load_data).
    features_selected : list of column names to keep.

    Returns
    -------
    ``(X_train, y_train, X_test, y_test)`` with the cleaned feature frames
    and the label objects exactly as returned by the loader.
    """
    X_train, y_train, X_test, y_test = ld.train_test_split(datasets,results_merged_path,split)

    def _clean(data):
        # Work on a copy so the loader's frame is left untouched.
        data = data.copy()

        similarity = data['similarity']
        if not pd.api.types.is_numeric_dtype(similarity):
            # Trim the first two and last two characters of every value.  The
            # ``.str`` accessor is required: slicing the Series directly
            # selects rows, which would blank the first and last two rows.
            # NOTE(review): presumably the raw strings carry two wrapper
            # characters on each side -- confirm against the data files.
            similarity = similarity.str[2:-2]
        data['similarity'] = pd.to_numeric(similarity)

        data = data.fillna(0)

        return data[features_selected]

    X_train = _clean(X_train)
    X_test = _clean(X_test)

    return X_train, y_train, X_test, y_test

In [15]:
def get_eval(y_pred,y_true):
    """Return the ``(windiff, pk, k_kappa)`` scores of a prediction.

    The window size ``k`` is ``floor((len(y_true) + 1) / (2 * (sum(y_true) + 1)))``,
    clamped to a minimum of 1.  Each metric from ``sm`` receives freshly
    built numpy arrays (reference first, prediction second) and ``k``.

    Parameters
    ----------
    y_pred : sequence of predicted labels.
    y_true : sequence of reference labels.
    """
    window_ratio = (len(y_true) + 1) / (2 * (sum(y_true) + 1))
    k = int(max(1, np.floor(window_ratio)))

    def fresh_arrays():
        # New arrays for every metric call, so no metric sees another's input object.
        return np.array(y_true), np.array(y_pred)

    windiff = sm.get_windiff(*fresh_arrays(), k)
    pk = sm.get_pk(*fresh_arrays(), k)
    k_kappa = sm.get_k_kappa(*fresh_arrays(), k)
    return windiff, pk, k_kappa

def get_avg_eval(eval):
    """Print the mean of each score column over a list of score rows.

    Parameters
    ----------
    eval : sequence of rows; positions 0, 1 and 2 of every row are averaged
        and printed under the labels windiff, pk and k-kappa respectively.
    """
    labels = ('- windiff:', '- pk:', '- k-kappa:')
    for column, label in enumerate(labels):
        values = [row[column] for row in eval]
        print(label, sum(values) / len(values))

In [16]:
# Experiment settings: number of repeated runs, split value and feature columns.
iterations = 10
split = 0.3

all_features = [
    'pause',
    'speakerChange',
    'similarity',
    'f0_diff',
    'f0_baseline_diff',
]
features_selected = [
    'pause',
    'speakerChange',
    'similarity',
    'f0_diff',
    'f0_baseline_diff',
]

# One list per model; every run appends the tuple returned by get_eval.
DT_y_predicted, XGBc_y_predicted, SVM_y_predicted = [], [], []
RFc_y_predicted, LR_y_predicted = [], []

# (fit-and-predict function, score list) pairs, in evaluation order.
model_runs = (
    (DecTree, DT_y_predicted),
    (RFc, RFc_y_predicted),
    (XGB_class, XGBc_y_predicted),
    (SVMc, SVM_y_predicted),
    (LR, LR_y_predicted),
)

for i in range(iterations):
    # A split is loaded on every iteration and shared by all models in that run.
    X_train, y_train, X_test, y_test = split_data(split,features_selected)

    for fit_predict, scores in model_runs:
        scores.append(get_eval(fit_predict(X_train,X_test,y_train),y_test))



In [17]:
# Echo the experiment settings, then the per-model score averages.
print('-> Parameters: nbr iterations =',iterations,', split =',split,', features =',features_selected)

for header, scores in (
    ('----- DT:', DT_y_predicted),
    ('----- RF:', RFc_y_predicted),
    ('----- XGB:', XGBc_y_predicted),
    ('----- SVM:', SVM_y_predicted),
    ('----- LR:', LR_y_predicted),
):
    print(header)
    get_avg_eval(scores)

-> Parameters: nbr iterations = 1 , split = 0.3 , features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
----- DT:
- windiff: 0.5326856349757113
- pk: 0.47300485773768214
- k-kappa: 0.022641456563010094
----- RF:
- windiff: 0.41512838306731437
- pk: 0.41512838306731437
- k-kappa: -0.014938850491822761
----- XGB:
- windiff: 0.4111727966689799
- pk: 0.4074947952810548
- k-kappa: 0.005407708984989378
----- SVM:
- windiff: 0.4076335877862595
- pk: 0.4076335877862595
- k-kappa: 0.0
----- LR:
- windiff: 0.4076335877862595
- pk: 0.4076335877862595
- k-kappa: 0.0
