In [1]:
import numpy as np
import pandas as pd

from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
data_dir = "/Users/soniamannan/Documents/DATA401/capstone/DigitalDemocracyCapstone/data/"
target_col = 'transition_value'

In [3]:
training_output_filename = data_dir + "training/training_utterances_binary.csv"

In [4]:
training_output_binary_filename = data_dir + "training/training_utterances_binary.csv"

In [5]:
# split dataset evenly based on labels
def split_test_train(total, stratify_col):
    transition_rows = total[total[stratify_col] != 0]
    non_transition_rows = total[total[stratify_col] == 0]
    
    # first split transitions into training/testing
    X_train1, X_test1, y_train1, y_test1 = train_test_split(transition_rows, 
                                                    transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # assert there are only transition labels in this dataframe
    assert len(X_train1[X_train1[target_col] == 0]) == 0
    assert len(X_test1[X_test1[target_col] == 0]) == 0
    
    train_len = len(X_train1) # number of non-transitions to add to training set
    test_len = len(X_test1) # number of non-transitions to add to testing set
    
    
    # next split non-transitions into training/testing
    X_train2, X_test2, y_train2, y_test2 = train_test_split(non_transition_rows, 
                                                    non_transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # pick train_len random rows from non-transition training set
    X_train2 = X_train2.sample(n = train_len, axis=0)
    
    # pick test_len random rows from non_transitions testing set
    X_test2 = X_test2.sample(n = test_len, axis=0)
    
    # assert there are no transition utterances in non-transition training and testing set
    assert len(X_train2[X_train2[target_col] != 0]) == 0
    assert len(X_test2[X_test2[target_col] != 0]) == 0
    
    # final result, concat the dataframe
    X_train_final = pd.concat([X_train1, X_train2])
    X_test_final = pd.concat([X_test1, X_test2])
    
    return X_train_final['text'], X_test_final['text'], X_train_final[target_col], X_test_final[target_col]
    

In [44]:
# assert training/testing split is balanced
def verify_train_test_split(train, x_train, y_train, x_test, y_test):
    transition_rows = train[train[target_col] != 0]
    assert len(x_train) == len(y_train)
    assert len(x_test) == len(y_test)
    assert len(x_train) == int(len(transition_rows) * 0.7) * 2
    assert len(x_test) == (len(transition_rows) * 2) - (int(len(transition_rows) * 0.7) * 2)
    assert len(y_train[y_train == 0]) == len(y_train[y_train != 0])
    assert len(y_test[y_test == 0]) == len(y_test[y_test != 0])
    print ("{0}% of utterances are transitions".format((sum(y_train) / len(x_train)) * 100))

In [6]:
# extract bag of words features from text for a model
def bag_of_words_features(x_train, x_test):
    count_vect = CountVectorizer()
    count_vect.fit(np.hstack((x_train)))
    X_train_counts = count_vect.transform(x_train)
    X_test_counts = count_vect.transform(x_test)
    
    assert X_train_counts.shape[1] == X_test_counts.shape[1]
    
    return X_train_counts, X_test_counts

In [7]:
# output accuracy for a naive bayes model
# return the trained model
def model(X_train_counts, X_test_counts, y_train, y_test):
    clf = MultinomialNB()
    clf.fit(X_train_counts, y_train)
    
    assert X_test_counts.shape[0] == y_test.shape[0]
    
    acc = clf.score(X_test_counts, y_test, sample_weight=None)
    print("Model accuracy {0}".format(acc))
    
    return clf

In [8]:
def remove_transcript(train, n):
    total = len(np.unique(train['video_id']))
    ids = np.unique(train['video_id'])[:n]
    rows = train[train['video_id'].isin(ids)]
    train = train[~(train['video_id'].isin(ids))]
    
    assert len(np.unique(rows['video_id'])) == n
    assert len(np.unique(train['video_id'])) == total - n
    
    return train, rows

In [152]:
# adds the prefix POST to all utterances n after
# adds the prefix PRE to all utterances n before
# a transition phrase
def add_context(n):
    n_range = pd.read_csv(training_output_binary_filename, sep="~")
    
    transition_text = n_range['text']
    labels = n_range['transition_value']
    
    new_transition_labels = []
    new_transition_text = []

    length = len(n_range)
    
    for i in range(length):
        # get the phrases in the window
        text = ''
        for x in range(-n, n+1):
            # window is within range of the dataframe
            if (i + x >= 0 and i + x < length):
                if (x > 0):
                    text += ' '.join(["POST-" + x for x in transition_text[i+x].split()])
                if (x < 0):
                    text += ' '.join(["PRE-" + x for x in transition_text[i+x].split()])
                else:
                    text += ' ' + transition_text[i+x] + ' '
                    
        new_transition_text.append(text)
    
    print ("Number of new phrases {0}".format(len(new_transition_text)))
    print ("Number of labels {0}".format(len(n_range['transition_value'])))
    
    return pd.DataFrame({'text':new_transition_text,'transition_value':n_range['transition_value']}, 
     columns=['text', target_col])

### Read in data

In [10]:
train = pd.read_table(training_output_filename, sep="~")[['text', target_col, 'video_id']]

In [11]:
print("Number of transitions in the dataset {0}".format(len(train[train['transition_value'] != 0])))

Number of transitions in the dataset 2869


In [12]:
train.head()

Unnamed: 0,text,transition_value,video_id
0,please call the roll our anderson no,0,4161
1,nobel by very he'll know block i,0,4161
2,can my daily all bipolar no games.,0,4161
3,"no, ...only ...",0,4161
4,know paul.,0,4161


### Remove top 5 videos from dataset

In [11]:
train, transcripts = remove_transcript(train, 5)

In [12]:
print("Number of transitions in dataset after removing top 5 transcripts {0}"
.format(len(train[train['transition_value'] != 0])))

Number of transitions in dataset after removing top 5 transcripts 2857


### Split into training and testing sets

In [13]:
x_train, x_test, y_train, y_test = split_test_train(train[['text', target_col]], target_col)

In [14]:
transition_rows = train[train[target_col] != 0]

### Assert training and testing splits are the correct dimensions
### After splitting, training and testing sets should each have 50% transitions and 50% non-transitions
### training dimensions should be 2 * 70% of the number of transitions in the data set
### testing dimensions should be 2 * 30% of the number of transitions in the data set

In [15]:
assert len(x_train) == len(y_train)

In [16]:
assert len(x_test) == len(y_test)

In [17]:
assert len(x_train) == int(len(transition_rows) * 0.7) * 2

In [18]:
assert len(x_test) == (len(transition_rows) * 2) - (int(len(transition_rows) * 0.7) * 2)

In [19]:
assert len(y_train[y_train == 0]) == len(y_train[y_train != 0])

In [20]:
assert len(y_test[y_test == 0]) == len(y_test[y_test != 0])

In [21]:
print ("{0}% of utterances are transitions".format((sum(y_train) / len(x_train)) * 100))

50.0% of utterances are transitions


In [22]:
x_train.head()

80328     We have a quorum, like to ask our guests and v...
85190                 [UNKNOWN], our first hearing of 2016.
204322                                    No. >> No, Bates?
141781                                       Aye,Hernandez?
126973                   Motions, resolutions, and notices.
Name: text, dtype: object

### Vectorize utterances with bag of words features

### Pass vectorized utterances into a Naive Bayes model

### Output accuracy on testing set

In [23]:
X_train_counts, X_test_counts = bag_of_words_features(x_train, x_test)
bag_of_words_model = model(X_train_counts, X_test_counts, y_train, y_test)

Model accuracy 0.5477855477855478


In [24]:
def compare_predicted_to_actual(clf, X_test_counts, x_test, y_test, outfilename):
    # get predicted values
    preds = clf.predict(X_test_counts)
    
    print("% predictions that were 1's {0}\n".format(sum(preds) / len(preds)))
    
    # add predicted values to original dataframe
    total = pd.concat([x_test, y_test], axis=1)
    total['predicted'] = preds
    
    # get the incorrect predictions and write to a csv
    wrongs = total[total['transition_value'] != total['predicted']]
    wrongs.to_csv(outfilename)
    
    print ("Example of an incorrect transition\n")
    print (list(wrongs['text'])[0])
    print ("Actual {0}".format(list(wrongs['transition_value'])[0]))
    print ("Predicted {0}".format(list(wrongs['predicted'])[0]))
    
    return wrongs

### Look at what the wrong predictions actually are

In [25]:
wrongs = compare_predicted_to_actual(bag_of_words_model, X_test_counts, 
 x_test, y_test, data_dir+'predictions/wrong_predictions.csv')

% predictions that were 1's 0.5442890442890443

Example of an incorrect transition

I ask for an aye vote on this resolution.
Actual 1
Predicted 0


### Vectorize utterances with tf-idf

In [26]:
def transform_tfidf(x_train, x_test):
    X_train_counts, X_test_counts = bag_of_words_features(x_train, x_test)
    
    transformer = TfidfTransformer(smooth_idf=True)
    Xtrain_tfidf = transformer.fit_transform(X_train_counts)
    Xtest_tfidf = transformer.fit_transform(X_test_counts)
    
    assert Xtrain_tfidf.shape[1] == Xtest_tfidf.shape[1]
    
    return Xtrain_tfidf, Xtest_tfidf

In [27]:
Xtrain_tfidf, Xtest_tfidf = transform_tfidf(x_train, x_test)

In [28]:
tf_idf_model = model(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)

Model accuracy 0.539044289044289


### Vectorize utterances with n-gram features
### Best accuracy when combining unigram and bigrams

In [157]:
def transform_ngram(start, stop, x_train, x_test):
    ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(start, stop))
    counts = ngram_vectorizer.fit(np.hstack((x_train)))
    
    print ("Number of transformed features {0}\n"
     .format(len(ngram_vectorizer.get_feature_names())))
    
    print ("First 10 features\n{0}"
     .format('\n'.join(ngram_vectorizer.get_feature_names()[-10:])))
    
    X_train_counts = counts.transform(x_train)
    X_test_counts = counts.transform(x_test)
    
    assert X_train_counts.shape[1] == X_test_counts.shape[1]
    
    return X_train_counts, X_test_counts

In [30]:
X_train_ngram_counts, X_test_ngram_counts = transform_ngram(1, 2, x_train, x_test)

Number of transformed features 26940

First 10 features
zero for
zero measure
zero please
zero the
zero there
zero vote
zevs
zevs vehicle
zone
zone even


In [31]:
ngram_model = model(X_train_ngram_counts, X_test_ngram_counts, y_train, y_test)

Model accuracy 0.5402097902097902


### For utterances in a transcript, tag what the model predicts the utterance to be

In [42]:
def predict_entire_transcript(transcripts, x_train, x_test, y_train, y_test):
    print("{0}\n".format(transcripts.head()))
    
    count_vect = CountVectorizer()
    count_vect.fit(np.hstack((x_train)))
    transcripts_test = count_vect.transform(transcripts['text'])
    label = transcripts['transition_value']
    
    X_train_counts, X_test_counts = bag_of_words_features(x_train, x_test)
    bag_of_words_model = model(X_train_counts, X_test_counts, y_train, y_test)
    
    preds = bag_of_words_model.predict(transcripts_test)
    
    assert len(preds) == transcripts_test.shape[0]
    
    return preds

In [47]:
preds = predict_entire_transcript(transcripts, x_train, x_test, y_train, y_test)

                                   text  transition_value  video_id
0  Please call the roll our anderson no                 0      4161
1      Nobel by very he'll know block I                 0      4161
2    can my daily all bipolar no games.                 0      4161
3                       No, ...only ...                 0      4161
4                            Know paul.                 0      4161

Model accuracy 0.5477855477855478


In [50]:
res = transcripts.copy()
res['predicted'] = preds
res['actual'] = transcripts['transition_value']
res = res.drop(['transition_value'], axis=1)
res.head()

Unnamed: 0,text,video_id,predicted,actual
0,Please call the roll our anderson no,4161,1,0
1,Nobel by very he'll know block I,4161,1,0
2,can my daily all bipolar no games.,4161,1,0
3,"No, ...only ...",4161,0,0
4,Know paul.,4161,0,0


In [51]:
res.to_csv('/Users/soniamannan/Documents/DATA401/capstone/DigitalDemocracyCapstone/data/predictions/binary_predicted_transcript.csv')

### Add a context prefix to surrounding utterances
### Collapse the context (with prefix) and train on bag of words

In [160]:
n_range = add_context(5)

Number of new phrases 656444
Number of labels 656444


In [161]:
transitions = n_range[n_range['transition_value'] != 0]
non_transitions = n_range[n_range['transition_value'] == 0]

In [162]:
print ("Number of transition phrases {0}".format(len(transitions)))

Number of transition phrases 2869


In [163]:
print ("Total number of utterances {0}".format(len(collapsed_n_range)))

Total number of utterances 631713


In [164]:
print ("Example transition\n\n{0}".format(list(transitions['text'])[0]))

Example transition

PRE-...runner PRE-no PRE-snow.PRE-no, PRE-...act.PRE-no, PRE-why PRE-cassie PRE-i PRE-...PRE-...the PRE-apps PRE-members PRE-please PRE-now.PRE-...eyes PRE-twenty-four PRE-knows PRE-fourteen PRE-the PRE-measure PRE-passes PRE-we're PRE-going to start at the top of the file colleagues which is file item eighty eight. POST-senator POST-jackson. senator jackson. POST-file POST-out POST-of POST-eighty-eight POST-pass POST-on POST-file POST-item POST-eighty POST-nine. file out of eighty-eight pass on file item eighty nine. POST-so, POST-the POST-bill POST-three POST-thirteen POST-together POST-gianni POST-perform POST-file. so, the bill three thirteen together gianni perform file. POST-file POST-item POST-ninety POST-two POST-<BILL_ID> POST-for POST-three POST-four POST-center file item ninety two <BILL_ID> for three four center POST-allen. allen. 


In [165]:
print ("Example non-transitions\n\n{0}".format((list(non_transitions['text'])[10])))

Example non-transitions

PRE-by PRE-hancock PRE-...PRE-...i PRE-...where PRE-...hilton PRE-highway PRE-so.PRE-i PRE-half.PRE-no, PRE-jackson.PRE-lever PRE-by PRE-loop PRE-i'm PRE-acquire ...but i mean there's a time it's so high POST-...by POST-more POST-lock. ...by more lock. POST-no, POST-...you POST-know, POST-when? no, ...you know, when? POST-no, POST-nielsen POST-no POST-... no, nielsen no ... POST-i'd POST-have POST-lee. i'd have lee. POST-...runner POST-no POST-snow. ...runner no snow. 


### Make a new training/testing split

In [166]:
x_train_context, x_test_context, \
 y_train_context, y_test_context = split_test_train(n_range[['text', target_col]], target_col)

In [167]:
verify_train_test_split(n_range, x_train_context, y_train_context, x_test_context, y_test_context)

50.0% of utterances are transitions


In [168]:
X_train_counts, X_test_counts = bag_of_words_features(x_train_context, x_test_context)
bag_of_words_model = model(X_train_counts, X_test_counts, y_train_context, y_test_context)

Model accuracy 0.6225319396051103


In [169]:
X_train_ngram_counts, X_test_ngram_counts = transform_ngram(1, 2, x_train_context, x_test_context)

Number of transformed features 159334

First 10 features
zoning and
zoning our
zoning post
zoning pre
zoning standards
zoo
zoo as
zoo post
zoo pre
zoo they


In [170]:
ngram_model = model(X_train_ngram_counts, X_test_ngram_counts, y_train_context, y_test_context)

Model accuracy 0.6451800232288037


### Use WordNet features

In [52]:
# replace words in an utterance with their synset
def get_synset_from_text(utterance):
    for word in utterance.split():
        syn = wordnet.synsets(word)
        lemmas = set([s.lemmas()[0].name() for s in syn])
        if syn: utterance = utterance.replace(word, ' '.join(lemmas))
        
    return utterance

### Replace words with their synsets

In [139]:
x_train_word_net = [get_synset_from_text(x) for x in x_train]
x_test_word_net = [get_synset_from_text(x) for x in x_test]

In [140]:
x_train_word_net[0]

'alight ignite light idle easy unhorse Inner_Light abstemious lightly luminosity light_up lighter unaccented fall faint sparkle clean all_all_over over complete all_over over complete complete dark iniquity darkness and good all_all_over over complete all_over over complete complete corruption. '

In [141]:
x_test_word_net[0]

'Mister electric_chair moderate chair president professorship and members, George Thompson along, on along behalf of the California aesculapian medical checkup Association along, '

### Vectorize synsets with bag of words

In [142]:
count_vect = CountVectorizer()
count_vect.fit(np.hstack((x_train)))
X_train_counts = count_vect.transform(x_train)
X_test_counts = count_vect.transform(x_test)

In [143]:
assert X_train_counts.shape[1] == X_test_counts.shape[1]

### Train classifier

In [144]:
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Get accuracy

In [145]:
assert X_test_counts.shape[0] == y_test.shape[0]

In [146]:
clf.score(X_test_counts, y_test, sample_weight=None)

0.55986316989737739

### Sample utterance and synset

In [28]:
utterance = 'SB 1008 would extend the current CEQA exemption deadline for'
for word in utterance.split():
    print (word)
    syn = wordnet.synsets(word)
    lemmas = set([s.lemmas()[0].name() for s in syn])
    print (lemmas)
    print (syn)
    utterance = utterance.replace(word, ' '.join(lemmas))
print (utterance)

SB
{'Bachelor_of_Science', 'antimony'}
[Synset('antimony.n.01'), Synset('bachelor_of_science.n.01')]
1008
set()
[]
would
set()
[]
extend
{'strain', 'stretch', 'gallop', 'unfold', 'run', 'cover', 'offer', 'carry', 'prolong', 'exsert', 'extend', 'widen'}
[Synset('widen.v.04'), Synset('run.v.03'), Synset('cover.v.03'), Synset('extend.v.04'), Synset('exsert.v.01'), Synset('extend.v.06'), Synset('offer.v.05'), Synset('stretch.v.02'), Synset('extend.v.09'), Synset('prolong.v.01'), Synset('unfold.v.03'), Synset('gallop.v.03'), Synset('extend.v.13'), Synset('strain.v.03'), Synset('extend.v.15'), Synset('carry.v.09'), Synset('extend.v.17')]
the
set()
[]
current
{'current', 'stream'}
[Synset('current.n.01'), Synset('current.n.02'), Synset('stream.n.02'), Synset('current.a.01')]
CEQA
set()
[]
exemption
{'exemption'}
[Synset('exemption.n.01'), Synset('exemption.n.02'), Synset('exemption.n.03')]
deadline
{'deadline'}
[Synset('deadline.n.01')]
for
set()
[]
Bachelor_of_Science antimony   strain stret

In [97]:
syn = wordnet.synsets('carry')
lemmas = set([s.lemmas()[0].name() for s in syn])
print (lemmas)

{'impart', 'carry', 'stock', 'transport', 'post', 'have_a_bun_in_the_oven', 'behave', 'hold', 'dribble'}
