In [1]:
# --- Standard library ---
import importlib
import os
import sys

# --- Third-party ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# --- Project modules ---
import model.load_data as ld
import model.scoring_metrics as sm

# Reload the project modules so that edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(sm)
importlib.reload(ld)

# Display options: print arrays in full and show every DataFrame column.
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_columns', None)

In [2]:
# Names of the datasets to load, laid out by name prefix for readability.
datasets = (
    "Bed002 Bed003 Bed004 Bed005 Bed006 Bed008 Bed009 Bed010 Bed011 Bed012 "
    "Bed013 Bed014 Bed015 Bed016 Bed017 "
    "Bmr001 Bmr002 Bmr005 Bmr007 Bmr009 Bmr010 Bmr011 Bmr012 Bmr013 Bmr014 "
    "Bmr018 Bmr019 Bmr021 Bmr022 Bmr024 Bmr025 Bmr026 Bmr027 Bmr029 "
    "Bns001 Bns002"
).split(" ")
results_merged_path = "../results_merged_fixedf0/"

# NOTE(review): the third argument (0.3) is presumably the test fraction --
# confirm against model.load_data.train_test_split.
X_train, y_train, X_test, y_test = ld.train_test_split(
    datasets, results_merged_path, 0.3
)

# Full data (both splits, all columns), built before the feature selection below.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

all_features = ['pause', 'speakerChange', 'similarity', 'f0_diff', 'f0_baseline_diff']
# Currently every feature is selected; trim this list to try a subset.
features_selected = list(all_features)

# Keep only the selected feature columns in both splits.
X_train = X_train[features_selected]
X_test = X_test[features_selected]

# def count_duplicate(df1,df2,feature):
#     duplicate_count = 0
#     for index, row in df2.iterrows():
#         if df1.iloc[index][feature]==df2.iloc[index][feature]:
#             duplicate_count+=1
#     return duplicate_count

# for feature in features_selected:
#     print(count_duplicate(X_train,X_test,feature))


In [3]:
def print_eval(y_pred, y_true):
    """Print the window size k and three segmentation scores for a prediction.

    k is computed from the reference labels as
    floor((len(y_true) + 1) / (2 * (sum(y_true) + 1))) and clamped to be at
    least 1. The scores are obtained from ``sm.get_windiff``, ``sm.get_pk`` and
    ``sm.get_k_kappa``, each called as ``(reference, prediction, k)``.

    Args:
        y_pred: Predicted labels; converted with ``np.array``.
        y_true: Reference labels; must support ``len`` and ``sum`` and is
            converted with ``np.array``.

    Returns:
        None. Everything is written to stdout.
    """
    half_span = np.floor((len(y_true) + 1) / (2 * (sum(y_true) + 1)))
    k = int(max(1, half_span))
    print('k =', k)

    predicted = np.array(y_pred)
    reference = np.array(y_true)

    # (label, scoring function) pairs, reported in this order.
    scorers = (
        ('- windiff:', sm.get_windiff),
        ('- pk:', sm.get_pk),
        ('- kkappa:', sm.get_k_kappa),
    )
    for label, scorer in scorers:
        print(label, scorer(reference, predicted, k))

In [10]:
def DecTree(X_train, X_test, y_train, tuning, best_criterion=None, best_max_depth=None,
            best_min_sample_leaf=None, random_state=None):
    """Fit a decision tree on the training data and predict the test data.

    Args:
        X_train: Training features.
        X_test: Features to predict on.
        y_train: Training labels.
        tuning: If truthy, build the tree with the ``best_*`` hyperparameters;
            otherwise use scikit-learn's defaults.
        best_criterion: Split criterion to use when ``tuning`` is set. ``None``
            keeps the classifier's default.
        best_max_depth: Maximum depth to use when ``tuning`` is set. ``None``
            means unlimited depth (scikit-learn's own convention).
        best_min_sample_leaf: ``min_samples_leaf`` to use when ``tuning`` is
            set. ``None`` keeps the classifier's default.
        random_state: Optional seed forwarded to the classifier so repeated
            runs build the same tree. ``None`` (default) leaves it unseeded,
            as before.

    Returns:
        The predictions of the fitted tree for ``X_test``.
    """
    tree_kwargs = {'random_state': random_state}
    if tuning:
        # Forward only hyperparameters that were actually supplied:
        # criterion=None and min_samples_leaf=None are not valid values for
        # DecisionTreeClassifier, so passing the defaults through would make
        # fit() fail. max_depth=None is valid and is forwarded unchanged.
        if best_criterion is not None:
            tree_kwargs['criterion'] = best_criterion
        tree_kwargs['max_depth'] = best_max_depth
        if best_min_sample_leaf is not None:
            tree_kwargs['min_samples_leaf'] = best_min_sample_leaf

    clf = DecisionTreeClassifier(**tree_kwargs)
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

In [12]:
def DecTree_hyperparam(X, y, scoring=None):
    """Grid-search decision-tree hyperparameters and return the best ones.

    A ``StandardScaler`` + ``DecisionTreeClassifier`` pipeline is tuned with
    ``GridSearchCV`` over ``criterion``, ``max_depth`` and ``min_samples_leaf``.
    The winning values are printed and returned.

    Args:
        X: Feature matrix used for the cross-validated search.
        y: Labels aligned with ``X``.
        scoring: Optional ``GridSearchCV`` scoring spec (e.g. ``'f1'``).
            ``None`` (default) keeps the estimator's own score, as before.
            NOTE(review): with the default score the recorded run selected a
            tree that predicted no positive labels at all; a metric suited to
            rare positives may be a better choice -- confirm with the data.

    Returns:
        Tuple ``(best_criterion, best_max_depth, best_min_samples_leaf)``.
    """
    pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                           ('dec_tree', DecisionTreeClassifier())])

    # 'log_loss' is only accepted by newer scikit-learn releases.
    parameters = dict(
        dec_tree__criterion=['gini', 'entropy', 'log_loss'],
        dec_tree__max_depth=[2, 5, 10, 15, 20, 30, 50, 100],
        dec_tree__min_samples_leaf=[5, 10, 20, 50, 100],
    )

    clf_GS = GridSearchCV(pipe, parameters, scoring=scoring)
    clf_GS.fit(X, y)

    # best_params_ holds exactly the grid values that won the search.
    best_params = clf_GS.best_params_
    best_criterion = best_params['dec_tree__criterion']
    best_max_depth = best_params['dec_tree__max_depth']
    best_min_samples_leaf = best_params['dec_tree__min_samples_leaf']

    print('Best criterion:', best_criterion)
    print('Best max_depth:', best_max_depth)
    print('Best min_sample_leaf:', best_min_samples_leaf)

    return best_criterion, best_max_depth, best_min_samples_leaf

In [13]:
# Run the hyperparameter search on the training split only.
(best_criterion,
 best_max_depth,
 best_min_sample_leaf) = DecTree_hyperparam(X_train, y_train)

Best criterion: gini
Best max_depth: 2
Best min_sample_leaf: 10


In [14]:
# Evaluate the untuned tree first, then the tuned one, on the test split.
# Each entry: (banner to print, arguments following y_train in the DecTree call).
runs = (
    ("-------------Normal DST", (False,)),
    ("-------------Tuned DST",
     (True, best_criterion, best_max_depth, best_min_sample_leaf)),
)
for banner, tree_args in runs:
    print(banner)
    DT_y_predicted = DecTree(X_train, X_test, y_train, *tree_args)
    # Sum of the predicted labels, printed before the scores.
    print(sum(DT_y_predicted))
    print_eval(DT_y_predicted, y_test)

-------------Normal DST
147.0
k = 61
- windiff: 0.5313353358820316
- pk: 0.448798470780994
- kkappa: 0.07130356481851277
-------------Tuned DST
0.0
k = 61
- windiff: 0.3718596395412343
- pk: 0.3718596395412343
- kkappa: 0.0


In [1]:
from model_trainer_and_tester import read_in_dataset, test_set_evaluate_multiple
from sklearn.tree import DecisionTreeClassifier

# Feature columns to use; edit this list to try other combinations.
features = [
    'pause',
    'speakerChange',
    'similarity',
    'f0_diff',
    'f0_baseline_diff',
]

# Context offsets used both for training and for evaluation.
shifts = [-2, -1, 1, 2]

X_train, y_train = read_in_dataset(features, shifts, to_read='train')

# fit() returns the estimator itself, so building and training can be chained.
clf = DecisionTreeClassifier().fit(X_train, y_train)

results = test_set_evaluate_multiple(clf, features, shifts)

In [3]:
# Average the evaluation results; the recorded output shows one mean value
# per metric (Pk, K-k, Windiff).
results.mean()

Pk         0.570373
K-k       -0.066525
Windiff    0.570373
dtype: float64