In [1]:
%matplotlib inline
from swda import Transcript
from swda import CorpusReader
from sklearn.feature_extraction.text import CountVectorizer
import random
import pickle as pkl
import numpy as np
import re
import matplotlib.pyplot as plt
import csv
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import metrics
from nltk import FreqDist
from nltk import word_tokenize
corpus = CorpusReader('swda')

In [2]:
# read the tag description
tag_detail = {}
with open('./swda/tag_description.csv', 'r') as csv_file:
    tag_reader = csv.reader(csv_file)
    for row in tag_reader:
        tag_detail[row[1]] = (row[0], row[2])
        
tag_map = {}
with open('./swda/tag_mapping.csv', 'r') as csv_file:
    tag_reader = csv.reader(csv_file)
    for row in tag_reader:
        tag_map[row[1]] = row[2]

In [3]:
def plot_confusion_matrix(cm, tag_list, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(tag_list))
    plt.xticks(tick_marks, tag_list, rotation=45)
    plt.yticks(tick_marks, tag_list)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
def norm_tokens(tokens):
    result = []
    for t in tokens:
        if not ('[' in t or ']' in t or
               '{' in t or '}' in t or 
               '+' in t or '/' in t or 
               '--' in t or "#" in t):
            result.append(t)
    return result
    
def should_append(utt):
    if utt.damsl_act_tag() == '+':
        return True
    else:
        return False
    
def norm_label(utt):
    label = utt.damsl_act_tag()
    if (label.startswith("fo")):
        label = 'fo_o_fw_by_bc'
    #return tag_map[label]
    return label

def remove_tags(utt):
    # remove tags
    result = re.sub(r"<(.*?)>", "", utt)
    # remove multiple space
    result = re.sub(r" +", " ", result)
    result = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", result)
    result = result.strip()
    return result

def get_vocab(utts):
    vocab = []
    for utt in utts:
        vocab.extend([w for w in utt.split()])
    return set(vocab)

def get_words(utts):
    words = []
    for utt in utts:
        words.extend([w for w in utt.split()])
    return words

In [4]:
all_utts = []
all_targets = []
all_callers = []
tag_set = set()
cnt = 0;
last_idx = {'A':-1, 'B':-1}
black_list = []

for utt in corpus.iter_utterances(display_progress=False):
    tokens = utt.text_words()
    caller = utt.caller
    label = norm_label(utt)
    b_should_append = should_append(utt)
    
    # check for merging
    if b_should_append:
        idx = last_idx.get(caller)
        if idx >= 0:
            all_utts[idx] = all_utts[idx] + ' ' + ' '.join(norm_tokens(tokens))
            continue
        else:
            print "ERROR"
            break
    
    # check if empty
    norm_text = remove_tags(' '.join(norm_tokens(tokens)))
    if not norm_text:
        if label == 'x' and utt.text:
            norm_text = utt.text
        else:       
            continue
        
    # update previous speaker utt
    last_idx[caller] = cnt
    
    # save
    all_utts.append(norm_text)
    all_targets.append(label)
    all_callers.append(caller)
    tag_set.add(label)
    cnt += 1

In [93]:
all_yes = [utt for utt, tag in zip(all_utts, all_targets) if tag in ["aa", "ny" "na"] and "no" not in utt.lower()]
print len(all_yes)
all_no = [utt for utt, tag in zip(all_utts, all_targets) if tag in ["nn", "ar", "ng"]]
print len(all_no)
all_unknown = [utt for utt, tag in zip(all_utts, all_targets) if tag in ["aap_am", "no"]]
print len(all_unknown)
all_correct = ['correct', "you won", "yeah that's who I am thinking", "your guess is correct"]
print len(all_correct)
all_wrong = ["wrong", "incorrect", "your guess is wrong", "that's not who I am thinking"]
print len(all_wrong)

10062
2023
390
4
4


In [94]:
#all_yes = list(np.random.choice(all_yes, 100))
print len(all_yes)
#all_no = list(np.random.choice(all_no, 100))
print len(all_no)
#all_unknown = list(np.random.choice(all_unknown, 100))
print len(all_unknown)

10062
2023
390


In [95]:
# get vocabular size
vocab = get_vocab(all_yes).union(get_vocab(all_no).union(get_vocab(all_unknown).union(get_vocab(all_correct).union(get_vocab(all_wrong)))))
print len(vocab)
all_words = get_words(all_yes) + get_words(all_no) + get_words(all_unknown) + get_words(all_correct) + get_words(all_wrong)
vocab_dist = FreqDist(all_words)
vocab_dist.most_common(10)

1460


[(',', 7822),
 ('.', 6405),
 ('Yeah', 2872),
 ('I', 1222),
 ('right', 1024),
 ('No', 996),
 ('yeah', 972),
 ('Right', 915),
 ("That's", 871),
 ('true', 638)]

## Conduct Classificaiton to make sure uni/bigram is good enough

In [96]:
order_utts = all_yes + all_no + all_unknown + all_correct + all_wrong
order_labels = [0] * len(all_yes) + [1] * len(all_no) + [2] * len(all_unknown) + [3] * len(all_correct) + [4] * len(all_wrong)
random_idx = np.random.permutation(len(order_utts))

utts = [order_utts[idx] for idx in random_idx]
labels = np.array([order_labels[idx] for idx in random_idx])
print len(utts)
print len(labels)

12483
12483


In [97]:
bigram = CountVectorizer(min_df=0, ngram_range=(1,2), lowercase=False)
unigram = CountVectorizer(min_df=0, ngram_range=(1,1), lowercase=False)
train_utts = bigram.fit(utts)
unigram.fit(utts)
ngram_train = bigram.transform(utts)
print ngram_train.shape

(12483, 5415)


In [98]:
#clf = SVC(C=1.0, kernel='rbf', gamma=10.0).fit(ngram_train, labels)
clf = LinearSVC(C=1.0).fit(ngram_train, labels)
predicted = clf.predict(ngram_train)

In [99]:
print metrics.classification_report(labels, predicted)
print metrics.confusion_matrix(labels, predicted)
#plot_confusion_matrix(cm=metrics.confusion_matrix(labels, predicted), tag_list=['yes','no', 'unkown'])

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     10062
          1       0.98      0.99      0.98      2023
          2       0.99      0.93      0.96       390
          3       1.00      1.00      1.00         4
          4       1.00      1.00      1.00         4

avg / total       0.99      0.99      0.99     12483

[[10014    46     2     0     0]
 [   17  2004     2     0     0]
 [   26     3   361     0     0]
 [    0     0     0     4     0]
 [    0     0     0     0     4]]


In [105]:
cnt = 0
for idx in range(len(labels)):
    if not labels[idx] == predicted[idx]:
        print (utts[idx], labels[idx], predicted[idx])
        cnt += 1
print cnt

('Huh-uh .', 0, 1)
("I don't think so .", 0, 1)
('Huh-uh', 0, 1)
('Huh-uh .', 0, 1)
('I guess so  .', 2, 0)
('Huh-uh .', 0, 1)
("Um , that's probably true .", 2, 0)
('Huh-uh .', 0, 1)
('Well , yes ,', 1, 0)
("That's probably true .", 2, 0)
('I guess so .', 2, 0)
("that's probably true .", 2, 0)
('Huh-uh ,', 0, 1)
('Yes ,', 1, 0)
('they would .', 1, 0)
('uh-huh ,', 1, 0)
('Huh-uh .', 0, 1)
('Yeah ,', 2, 0)
('huh-uh .', 0, 1)
("That's probably right .", 2, 0)
("I don't know ,", 1, 2)
('huh-uh .', 0, 1)
("I don't ,", 2, 1)
('Huh-uh .', 0, 1)
('uh-huh ,', 1, 0)
('Could be .', 2, 0)
('Huh-uh ,', 0, 1)
("I don't think ,", 0, 1)
('I suppose .', 2, 0)
("I don't think so .", 0, 1)
('Um .', 2, 0)
('I think so .', 2, 0)
('I , I , I guess so .', 2, 0)
('Huh-uh .', 0, 1)
('huh-uh .', 0, 1)
('N- , -', 1, 0)
("I don't think so .", 2, 1)
('I know ,', 1, 2)
('Huh-uh .', 0, 1)
('Huh-uh .', 0, 1)
('Uh-huh .', 1, 0)
('Huh-uh .', 0, 1)
('huh-uh .', 0, 1)
('That could be .', 0, 2)
('I guess so .', 2, 0)
('H

## save the feature as pickle

In [101]:
bigram_yes = bigram.transform(all_yes)
bigram_no = bigram.transform(all_no)
bigram_unkown = bigram.transform(all_unknown)
bigram_correct = bigram.transform(all_correct)
bigram_wrong = bigram.transform(all_wrong)

unigram_yes = unigram.transform(all_yes)
unigram_no = unigram.transform(all_no)
unigram_unkown = unigram.transform(all_unknown)
unigram_correct = unigram.transform(all_correct)
unigram_wrong = unigram.transform(all_wrong)

In [102]:
pkl.dump({"yes":bigram_yes, "no": bigram_no, "unknown": bigram_unkown, "correct": bigram_correct, "wrong": bigram_wrong}, open('bigram_usr_resp.pkl', 'w'))

In [103]:
pkl.dump({"yes":unigram_yes, "no": unigram_no, "unknown": unigram_unkown, "correct": unigram_correct, "wrong": unigram_wrong}, open('unigram_usr_resp.pkl', 'w'))

In [104]:
pkl.dump({"yes":all_yes, "no": all_no, "unknown": all_unknown, "correct": all_correct, "wrong": all_wrong}, open('utts.pkl', 'w'))