# Test basic classification models for topic segmentation

In [1]:
import model.load_data as ld
import model.scoring_metrics as sm

from sklearn import metrics
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import poisson
from sklearn.svm import SVC 
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import xgboost as xgb
xgb.set_config(verbosity=0)
import matplotlib
from sklearn.model_selection import train_test_split 
pd.set_option('display.max_columns', None)
import sys
np.set_printoptions(threshold=sys.maxsize)

In [2]:
datasets = """Bed002 Bed003 Bed004 Bed005 Bed006 Bed008 Bed009 Bed010 Bed011 Bed012 Bed013 Bed014 Bed015 Bed016 Bed017 Bmr001 Bmr002 Bmr005 Bmr007 Bmr009 Bmr010 Bmr011 Bmr012 Bmr013 Bmr014 Bmr018 Bmr019 Bmr021 Bmr022 Bmr024 Bmr025 Bmr026 Bmr027 Bmr029 Bns001 Bns002""".split(" ")
results_merged_path = "../results_merged_fixedf0/"

X_train, y_train, X_test, y_test = ld.train_test_split(datasets,results_merged_path,0.3)

all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
features_selected = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']

def filter(data):
    data['similarity'] = data['similarity'][2:-2]
    data['similarity'] = pd.to_numeric(data['similarity'])

    data.fillna(0,inplace=True)

    data = data[features_selected]
    
    return data

X_train = filter(X_train)
X_test = filter(X_test)

## Test on commonly used models

In [3]:
def print_eval(y_pred,y_true,k):
    int_y_pred = (np.array(y_pred))
    int_y_true = (np.array(y_true))

    print('- windiff:',sm.get_windiff(int_y_pred,int_y_true,k))
    print('- pk:',sm.get_pk(int_y_pred,int_y_true,k))
    print('- kkappa:',sm.get_k_kappa(int_y_pred,int_y_true,k))

### Decision Tree classifier

In [18]:
def DecTree(X_train, X_test, y_train, y_test):
    
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    # tree.plot_tree(clf)
    
    return y_predicted

In [22]:
DT_y_predicted = DecTree(X_train,X_test,y_train,y_test)
k = int(max(1,np.floor((len(DT_y_predicted)+1)/(2*(sum(DT_y_predicted)+1)))))
print('k =',k)
print_eval(DT_y_predicted,y_test,k)

k = 59
- windiff: 0.05744176865377023
- pk: 0.049019607843137254
- kkappa: 0.8947321572860929


### XGB Classifier for binary class

In [6]:
def XGB_class(X_train, X_test, y_train, y_test):
    
    clf = xgb.XGBClassifier(seed = 24, use_label_encoder =False)
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    
    return clf.predict(X_test)

In [7]:
XGBc_y_predicted = XGB_class(X_train,X_test,y_train,y_test)
k = int(max(1,np.floor((len(XGBc_y_predicted)+1)/(2*(sum(DT_y_predicted)+1)))))
print('k =',k)
print_eval(XGBc_y_predicted,y_test,k)



k = 58
- windiff: 0.2861040732800624
- pk: 0.2546612096407458
- kkappa: 0.41230707556624585


### SVM for binary class

In [8]:
def SVMc(X_train,X_test,y_train,y_test):
    clf = SVC(kernel='linear',probability=True) 
    clf.fit(X_train, y_train) 
    y_predicted = clf.predict(X_test)
    
    return y_predicted

In [9]:
SVM_y_predicted = SVMc(X_train,X_test,y_train,y_test)
k = int(max(1,np.floor((len(SVM_y_predicted)+1)/(2*(sum(DT_y_predicted)+1)))))
print('k =',k)
print_eval(SVM_y_predicted,y_test,k)

k = 58
- windiff: 0.4047294224647567
- pk: 0.4047294224647567
- kkappa: 0.0


### Random Forest classifier for binary class

In [10]:
def RFc(X_train,X_test,y_train,y_test):
    model = RandomForestClassifier().fit(X_train,y_train)
    y_predicted = model.predict(X_test)
    
    return y_predicted

In [11]:
RFc_y_predicted = RFc(X_train,X_test,y_train,y_test)
k = int(max(1,np.floor((len(RFc_y_predicted)+1)/(2*(sum(DT_y_predicted)+1)))))
print('k =',k)
print_eval(RFc_y_predicted,y_test,k)

k = 58
- windiff: 0.03891379198336906
- pk: 0.03891379198336906
- kkappa: 0.9179782079497437


### Logistic Regressor for binary class

In [12]:
def LR(X_train,X_test,y_train,y_test):
    model = LogisticRegression(random_state=0).fit(X_train,y_train)
    y_predicted = model.predict(X_test)
    
    return y_predicted

In [13]:
LR_y_predicted = LR(X_train,X_test,y_train,y_test)
k = int(max(1,np.floor((len(LR_y_predicted)+1)/(2*(sum(DT_y_predicted)+1)))))
print('k =',k)
print_eval(LR_y_predicted,y_test,k)

k = 58
- windiff: 0.4047294224647567
- pk: 0.40154615734424737
- kkappa: 0.009894052547075002


## Multiple runs of each algo with different splits

In [14]:
datasets = """Bed002 Bed003 Bed004 Bed005 Bed006 Bed008 Bed009 Bed010 Bed011 Bed012 Bed013 Bed014 Bed015 Bed016 Bed017 Bmr001 Bmr002 Bmr005 Bmr007 Bmr009 Bmr010 Bmr011 Bmr012 Bmr013 Bmr014 Bmr018 Bmr019 Bmr021 Bmr022 Bmr024 Bmr025 Bmr026 Bmr027 Bmr029 Bns001 Bns002""".split(" ")
results_merged_path = "../results_merged_fixedf0/"

def split_data(split,features_selected):
    X_train, y_train, X_test, y_test = ld.train_test_split(datasets,results_merged_path,split)

    def filter(data):
        data['similarity'] = data['similarity'][2:-2]
        data['similarity'] = pd.to_numeric(data['similarity'])

        data.fillna(0,inplace=True)

        data = data[features_selected]
        
        return data

    X_train = filter(X_train)
    X_test = filter(X_test)

    return X_train, y_train, X_test, y_test

In [15]:
def get_eval(y_pred,y_true):
    return sm.get_windiff(np.array(y_pred),np.array(y_true),k),sm.get_pk(np.array(y_pred),np.array(y_true),k),sm.get_k_kappa(np.array(y_pred),np.array(y_true),k)

def get_avg_eval(eval):
    print('- windiff:',sum([row[0] for row in eval])/len([row[0] for row in eval]))
    print('- pk:',sum([row[1] for row in eval])/len([row[1] for row in eval]))
    print('- k-kappa:',sum([row[2] for row in eval])/len([row[2] for row in eval]))

In [24]:
iterations = 5

all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
features_selected = ['pause', 'speakerChange', 'similarity','f0_diff', 'f0_baseline_diff']
split = 0.3

DT_y_predicted=[]
XGBc_y_predicted=[]
SVM_y_predicted=[]
RFc_y_predicted=[]
LR_y_predicted=[]
for i in range(iterations):
    X_train, y_train, X_test, y_test = split_data(split,features_selected)
    
    DT_y_predicted.append(get_eval(DecTree(X_train,X_test,y_train,y_test),y_test))
    RFc_y_predicted.append(get_eval(RFc(X_train,X_test,y_train,y_test),y_test))
    XGBc_y_predicted.append(get_eval(XGB_class(X_train,X_test,y_train,y_test),y_test))
    SVM_y_predicted.append(get_eval(SVMc(X_train,X_test,y_train,y_test),y_test))
    LR_y_predicted.append(get_eval(LR(X_train,X_test,y_train,y_test),y_test))



In [26]:
print('-> Parameters: nbr iterations =',iterations,', split =',split,', features =',features_selected)
print('----- DT:')
get_avg_eval(DT_y_predicted)
print('----- RF:')
get_avg_eval(RFc_y_predicted)
print('----- XGB:')
get_avg_eval(XGBc_y_predicted)
print('----- SVM:')
get_avg_eval(SVM_y_predicted)
print('----- LR:')
get_avg_eval(LR_y_predicted)

-> Parameters: nbr iterations = 5 , split = 0.3 , features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
----- DT:
- windiff: 0.04338976093875578
- pk: 0.03844717220919065
- k-kappa: 0.9185155891895278
----- RF:
- windiff: 0.037573016925369115
- pk: 0.036076733770856864
- k-kappa: 0.9217906162539723
----- XGB:
- windiff: 0.2851183481857893
- pk: 0.24518739658170477
- k-kappa: 0.42934124093649945
----- SVM:
- windiff: 0.39608136419331136
- pk: 0.39608136419331136
- k-kappa: 0.0
----- LR:
- windiff: 0.3953568497290764
- pk: 0.3937333499988463
- k-kappa: 0.008847898138169216
