In [168]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [50]:
data_dir = "/Users/soniamannan/Documents/DATA401/capstone/digitaldemocracy_ds_capstone_2018/"
target_col = 'utterance_transition_values'

In [51]:
training_output_filename = data_dir + "training/" + "training_utterances.csv"

In [118]:
# split dataset evenly based on labels
def split_test_train(total, stratify_col):
    transition_rows = total[total[stratify_col] != 0]
    non_transition_rows = total[total[stratify_col] == 0]
    
    # first split transitions into training/testing
    X_train1, X_test1, y_train1, y_test1 = train_test_split(transition_rows, 
                                                    transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # assert there are only transition labels in this dataframe
    assert len(X_train1[X_train1[target_col] == 0]) == 0
    assert len(X_test1[X_test1[target_col] == 0]) == 0
    
    train_len = len(X_train1) # number of non-transitions to add to training set
    test_len = len(X_test1) # number of non-transitions to add to testing set
    
    
    # next split non-transitions into training/testing
    X_train2, X_test2, y_train2, y_test2 = train_test_split(non_transition_rows, 
                                                    non_transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # pick train_len random rows from non-transition training set
    X_train2 = X_train2.sample(n = train_len, axis=0)
    
    # pick test_len random rows from non_transitions testing set
    X_test2 = X_test2.sample(n = test_len, axis=0)
    
    # assert there are no transition utterances in non-transition training and testing set
    assert len(X_train2[X_train2[target_col] != 0]) == 0
    assert len(X_test2[X_test2[target_col] != 0]) == 0
    
    # final result, concat the dataframe
    X_train_final = pd.concat([X_train1, X_train2])
    X_test_final = pd.concat([X_test1, X_test2])
    
    return X_train_final['text'], X_test_final['text'], X_train_final[target_col], X_test_final[target_col]
    

In [175]:
train = pd.read_table(training_output_filename, sep="~")[['text', target_col]]

In [176]:
train.head()

Unnamed: 0,text,utterance_transition_values
0,We don't have a quorum yet I don't believe.,1
1,We don't have a quorum yet.,0
2,We'll ask the sergeants to please call the mem...,0
3,that we can establish a quorum for this partic...,0
4,This is the Assembly's 2nd Extraordinary Sessi...,0


In [177]:
x_train, x_test, y_train, y_test = split_test_train(train, target_col)

In [178]:
transition_rows = train[train[target_col] != 0]

### Assert training and testing splits are the correct dimensions
### After splitting, training and testing sets should each have 50% transitions and 50% non-transitions
### training dimensions should be 2 * 70% of the number of transitions in the data set
### testing dimensions should be 2 * 30% of the number of transitions in the data set

In [179]:
assert len(x_train) == len(y_train)

In [180]:
assert len(x_test) == len(y_test)

In [181]:
assert len(x_train) == int(len(transition_rows) * 0.7) * 2

In [182]:
assert len(x_test) == (len(transition_rows) * 2) - (int(len(transition_rows) * 0.7) * 2)

In [183]:
assert len(y_train[y_train == 0]) == len(y_train[y_train != 0])

In [184]:
assert len(y_test[y_test == 0]) == len(y_test[y_test != 0])

### Vectorize utterances with bag of words features

In [195]:
count_vect = CountVectorizer()
count_vect.fit(np.hstack((x_train, x_test)))
X_train_counts = count_vect.transform(x_train)
X_test_counts = count_vect.transform(x_test)

In [198]:
assert X_train_counts.shape[1] == X_test_counts.shape[1]

### Pass vectorized utterances into a Naive Bayes model

In [199]:
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Output accuracy on testing set

In [200]:
assert X_test_counts.shape[0] == y_test.shape[0]

In [201]:
clf.score(X_test_counts, y_test, sample_weight=None)

0.74196428571428574