## Loading train and test

In [1]:
import numpy as np
import pandas as pd


Data: https://drive.google.com/drive/u/1/folders/1R64LrYygM5-97wNSGyx86tMy5YeswxJ7

In [2]:
train = pd.read_json('data\\runon_train_swift.json', orient='values', compression=None, typ='series')
test = pd.read_json('data\\runon_test_swift.json', orient='values', compression=None, typ='series')
print(len(train), len(test))

10000 2000


## Getting the features

In [3]:
train[:10]

0    [[did, False], [you, False], [see, False], [th...
1    [[resurrect, False], [the, False], [passed, Fa...
2    [[Hey, False], [guys, True], [thank, False], [...
3    [[It, False], ['s, False], [alright, False], [...
4    [[when, False], [my, False], [oldest, False], ...
5    [[Yes, False], [,, False], [in, False], [2007,...
6    [[http://www.hulu.com/tiger-and-bunny, False],...
7    [[i, False], [struggled, False], [,, False], [...
8    [[um, False], [..., False], [HOLY, False], [SH...
9    [[i, False], [ca, False], [n’t, False], [belie...
dtype: object

## Logistic regression classifier

In [4]:
import spacy
from spacy.tokens import Doc
nlp = spacy.load('en')

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

class FillingNans(object):
    '''
    Custom function for assembling into the pipeline object 
    '''
    def transform(self, X):
        XX = X.copy()
        XX[XX==np.nan] = 0
        return XX

    def fit(self, X, y=None):
        return self
    
def default_pipe_params():
    vec = DictVectorizer()   
    clf = LogisticRegression(solver='sag', max_iter=300, random_state=42, multi_class='multinomial')
    return [('vectorize', vec),
            ('fill_nans', FillingNans()),
    #                                  ('variance', variance),
            ('clf', clf)]   
     
    
def make_classifier(train, feature_analyzer, nlp, pipe_default = None):
    Fx, y = feature_analyzer(train, nlp)
    if pipe_default == None:
        pipe = Pipeline(default_pipe_params())
    else:
        pipe = pipe_default
    pipe.fit(Fx, y)
    return pipe

def classify(test, pipe, feature_analyzer, nlp):
    Fx, y = feature_analyzer(test, nlp)
    predicted = pipe.predict(Fx)
    print(classification_report(y, predicted, pipe.named_steps['clf'].classes_))

## Results on test set

** Baseline - is the word titlecase**

In [5]:
def prepare_baseline(data, nlp):
    dim = 0
    for x in data:
        dim += len(x)
    features = np.ndarray(dim, dtype='object')
    y = np.zeros(dim)
    i = 0
    for sent in data:
        for j in range(len(sent)):
            y[i] = sent[j][1]
            features[i] = {}
            features[i]['word'] = sent[j][0]
            features[i]['word_lower'] = sent[j][0].lower()
            i += 1
    return features, y  

pipe = make_classifier(train, prepare_baseline, nlp)
classify(test, pipe, prepare_baseline, nlp)

             precision    recall  f1-score   support

        0.0       0.94      1.00      0.97     82412
        1.0       0.38      0.02      0.04      5391

avg / total       0.91      0.94      0.91     87803



**With spacy features on uningrams**

In [6]:
def prepare_features_pos(data, nlp):
    dim = 0
    for x in data:
        dim += len(x)
    features = np.ndarray(dim, dtype='object')
    y = np.zeros(dim)
    i = 0
    for sent in data:
        sent_tokens = []
        for x in sent:
            sent_tokens.append(x[0])
        doc = Doc(nlp.vocab, words=sent_tokens)
        nlp.tagger(doc)
        for j in range(len(sent)):
            y[i] = sent[j][1]
            features[i] = {}
            features[i]['word'] = doc[j].text
            features[i]['word_lower'] = doc[j].lower_
            features[i]['word_lemma'] = doc[j].lemma_
#             features[i]['word_is_punct'] = doc[j].is_punct
#             features[i]['word_shape'] = doc[j].shape_
            features[i]['pos'] = doc[j].pos_
            i += 1
    return features, y

def show_features(train, nlp, analyzer):
    for x in train[:1]:
        print(x)
    xx, yy = analyzer(train[:1], nlp)
    print(xx.shape)
    print(xx)
    print(yy)    
    
show_features(train, nlp, prepare_features_pos)


pipe = make_classifier(train, prepare_features_pos, nlp)
classify(test, pipe, prepare_features_pos, nlp)

[['did', False], ['you', False], ['see', False], ['that', True], ['See', False], ['what', False], ['he', False], ['did', False], ['there', True], ['He', False], ['acknowledged', False], ['that', False], ['I', False], ['am', False], ['his', False], ['mom', False], ['and', False], ['that', False], ['I', False], ['am', False], ['supposed', False], ['to', False], ['love', False], ['him', False], ['just', False], ['like', False], ['every', False], ['other', False], ['mom', False], ['in', False], ['the', False], ['world', True]]
(32,)
[{'word': 'did', 'word_lower': 'did', 'word_lemma': 'do', 'pos': 'VERB'}
 {'word': 'you', 'word_lower': 'you', 'word_lemma': '-PRON-', 'pos': 'PRON'}
 {'word': 'see', 'word_lower': 'see', 'word_lemma': 'see', 'pos': 'VERB'}
 {'word': 'that', 'word_lower': 'that', 'word_lemma': 'that', 'pos': 'DET'}
 {'word': 'See', 'word_lower': 'see', 'word_lemma': 'see', 'pos': 'VERB'}
 {'word': 'what', 'word_lower': 'what', 'word_lemma': 'what', 'pos': 'NOUN'}
 {'word': 'he'

**With spacy features and 3-grams; without dependency parsing**

In [7]:
def prepare_features_pos_3gram(data, nlp):
    dim = 0
    for x in data:
        dim += len(x)
    features = np.ndarray(dim, dtype='object')
    y = np.zeros(dim)
    i = 0
    for sent in data:
        sent_tokens = []
        for x in sent:
            sent_tokens.append(x[0])
        doc = Doc(nlp.vocab, words=sent_tokens)
        nlp.tagger(doc)
        for j in range(len(sent)):
            y[i] = sent[j][1]
            features[i] = {}
            features[i]['word'] = doc[j].text
            features[i]['word_lower'] = doc[j].lower_
            features[i]['word_lemma'] = doc[j].lemma_
#             features[i]['word_is_punct'] = doc[j].is_punct
            features[i]['word_shape'] = doc[j].shape_
            features[i]['pos'] = doc[j].pos_
            for k in [1, -1]:
                prefix = '_'+str(k)
                if j+k < 0:
                    features[i]['word'+prefix] = '<S>'
                    features[i]['word_lower'+prefix] = '<S>'
                elif j+k >= len(sent):
                    features[i]['word'+prefix] = '</S>'
                    features[i]['word_lower'+prefix] = '</S>'
                else:
                    features[i]['word'+prefix] = doc[j+k].text
                    features[i]['word_is_title'+prefix] = doc[j+k].is_title
                    features[i]['word_lemma'+prefix] = doc[j+k].lemma_
                    features[i]['word_lower'+prefix] = doc[j+k].lower_
                    features[i]['pos'+prefix] = doc[j+k].pos_
                    features[i]['word_shape'+prefix] = doc[j+k].shape_
                    features[i]['word_is_punct'+prefix] = doc[j+k].is_punct
                    
            i += 1
    return features, y

show_features(train, nlp, prepare_features_pos_3gram)

pipe = make_classifier(train, prepare_features_pos_3gram, nlp)
classify(test, pipe, prepare_features_pos_3gram, nlp)


[['did', False], ['you', False], ['see', False], ['that', True], ['See', False], ['what', False], ['he', False], ['did', False], ['there', True], ['He', False], ['acknowledged', False], ['that', False], ['I', False], ['am', False], ['his', False], ['mom', False], ['and', False], ['that', False], ['I', False], ['am', False], ['supposed', False], ['to', False], ['love', False], ['him', False], ['just', False], ['like', False], ['every', False], ['other', False], ['mom', False], ['in', False], ['the', False], ['world', True]]
(32,)
[{'word': 'did', 'word_lower': 'did', 'word_lemma': 'do', 'word_shape': 'xxx', 'pos': 'VERB', 'word_1': 'you', 'word_is_title_1': False, 'word_lemma_1': '-PRON-', 'word_lower_1': 'you', 'pos_1': 'PRON', 'word_shape_1': 'xxx', 'word_is_punct_1': False, 'word_-1': '<S>', 'word_lower_-1': '<S>'}
 {'word': 'you', 'word_lower': 'you', 'word_lemma': '-PRON-', 'word_shape': 'xxx', 'pos': 'PRON', 'word_1': 'see', 'word_is_title_1': False, 'word_lemma_1': 'see', 'word_l

 {'word': 'world', 'word_lower': 'world', 'word_lemma': 'world', 'word_shape': 'xxxx', 'pos': 'NOUN', 'word_1': '</S>', 'word_lower_1': '</S>', 'word_-1': 'the', 'word_is_title_-1': False, 'word_lemma_-1': 'the', 'word_lower_-1': 'the', 'pos_-1': 'DET', 'word_shape_-1': 'xxx', 'word_is_punct_-1': False}]
[0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1.]
             precision    recall  f1-score   support

        0.0       0.97      0.99      0.98     82412
        1.0       0.75      0.47      0.58      5391

avg / total       0.95      0.96      0.95     87803



## Loading non case sensitive 3-grams from https://www.ngrams.info

In [8]:
import io, zipfile

ngrams_coca = {}
with zipfile.ZipFile("data\\w3_.zip") as myzip:
    with myzip.open("w3_.txt", 'r') as file_:
        f = io.TextIOWrapper(file_, encoding='ansi')
        line = f.readline()
        while line:
            try:
                p = line.split('\t')
                if len(p) == 4:
                    p[3] = p[3].strip()
                    d1 = {}
                    if p[1] in ngrams_coca: 
                        d1 = ngrams_coca[p[1]]
                    else:
                        ngrams_coca[p[1]] = d1
                    d2 = {}
                    if p[2] in d1: 
                        d2 = d1[p[2]]
                    else: 
                        d1[p[2]] = d2
                    d2[p[3]] = int(p[0])
            except Exception as e:
                print(line, str(e))
            line = f.readline()

In [9]:
def get_count(ngrams, words):
    d = ngrams
    i = 0
    n = len(words)
    while i < n and words[i] in d:
        d = d[words[i]]
        i += 1
    if i >= n: 
        return d
    return 0

words = ['did', 'you', 'see']
print(get_count(ngrams_coca, words))
ngrams_coca['did']['you']['see']


2452


2452

In [11]:
def quant(freq):
    return int(np.log(freq+1))

def prepare_with_ngram(data, nlp):
    dim = 0
    for x in data:
        dim += len(x)
    features = np.ndarray(dim, dtype='object')
    y = np.zeros(dim)
    i = 0
    for sent in data:
        sent_tokens = []
        for x in sent:
            sent_tokens.append(x[0])
        doc = Doc(nlp.vocab, words=sent_tokens)
        nlp.tagger(doc)
        for j in range(len(sent)):
            y[i] = sent[j][1]
            features[i] = {}
            features[i]['word'] = doc[j].text
            features[i]['word_lower'] = doc[j].lower_
            features[i]['word_lemma'] = doc[j].lemma_
#             features[i]['word_is_punct'] = doc[j].is_punct
            features[i]['word_shape'] = doc[j].shape_
            features[i]['pos'] = doc[j].pos_
            for k in [1, -1]:
                prefix = '_'+str(k)
                if j+k < 0:
                    features[i]['word'+prefix] = '<S>'
                    features[i]['word_lower'+prefix] = '<S>'
                elif j+k >= len(sent):
                    features[i]['word'+prefix] = '</S>'
                    features[i]['word_lower'+prefix] = '</S>'
                else:
                    features[i]['word'+prefix] = doc[j+k].text
                    features[i]['word_is_title'+prefix] = doc[j+k].is_title
                    features[i]['word_lemma'+prefix] = doc[j+k].lemma_
                    features[i]['word_lower'+prefix] = doc[j+k].lower_
                    features[i]['pos'+prefix] = doc[j+k].pos_
                    features[i]['word_shape'+prefix] = doc[j+k].shape_
                    features[i]['word_is_punct'+prefix] = doc[j+k].is_punct
            words = [features[i]['word_lower_-1'], features[i]['word_lower'], features[i]['word_lower_1']]
            nfreq = get_count(ngrams_coca, words)
            features[i]['ngram_freq'] = quant(nfreq)
                    
            i += 1
    return features, y

show_features(train, nlp, prepare_with_ngram)


[['did', False], ['you', False], ['see', False], ['that', True], ['See', False], ['what', False], ['he', False], ['did', False], ['there', True], ['He', False], ['acknowledged', False], ['that', False], ['I', False], ['am', False], ['his', False], ['mom', False], ['and', False], ['that', False], ['I', False], ['am', False], ['supposed', False], ['to', False], ['love', False], ['him', False], ['just', False], ['like', False], ['every', False], ['other', False], ['mom', False], ['in', False], ['the', False], ['world', True]]
(32,)
[{'word': 'did', 'word_lower': 'did', 'word_lemma': 'do', 'word_shape': 'xxx', 'pos': 'VERB', 'word_1': 'you', 'word_is_title_1': False, 'word_lemma_1': '-PRON-', 'word_lower_1': 'you', 'pos_1': 'PRON', 'word_shape_1': 'xxx', 'word_is_punct_1': False, 'word_-1': '<S>', 'word_lower_-1': '<S>', 'ngram_freq': 0}
 {'word': 'you', 'word_lower': 'you', 'word_lemma': '-PRON-', 'word_shape': 'xxx', 'pos': 'PRON', 'word_1': 'see', 'word_is_title_1': False, 'word_lemma_1

 {'word': 'world', 'word_lower': 'world', 'word_lemma': 'world', 'word_shape': 'xxxx', 'pos': 'NOUN', 'word_1': '</S>', 'word_lower_1': '</S>', 'word_-1': 'the', 'word_is_title_-1': False, 'word_lemma_-1': 'the', 'word_lower_-1': 'the', 'pos_-1': 'DET', 'word_shape_-1': 'xxx', 'word_is_punct_-1': False, 'ngram_freq': 0}]
[0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1.]


In [12]:
pipe = make_classifier(train, prepare_with_ngram, nlp)
classify(test, pipe, prepare_with_ngram, nlp)

             precision    recall  f1-score   support

        0.0       0.97      0.99      0.98     82412
        1.0       0.75      0.48      0.59      5391

avg / total       0.95      0.96      0.95     87803



In [15]:
from sklearn.externals import joblib
joblib.dump(pipe, 'pipe_swift.pkl') 

['pipe_swift.pkl']

In [26]:
def top_features(pipe, n):
    """Prints features with the highest coefficient values, per class"""
    vectorizer =  pipe.named_steps['vectorize']
    clf =  pipe.named_steps['clf']
    print(clf.coef_.shape)
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(clf.classes_):
        if i >= clf.coef_.shape[0]:
            break
        top = np.argsort(clf.coef_[i])
        reversed_top = top[::-1]
        print("%s:\n%s" % (class_label,
              "\n".join(feature_names[j] for j in reversed_top[:n])))

In [28]:
top_features(pipe, 20)

(1, 332087)
0.0:
word_lower_1=</S>
word_1=</S>
word_1=And
word_1=jesus
word_1=john
word_1=A
word_1=At
word_shape_1=Xx
word_shape=.xx
word_1=christmas
word_shape_1=xx'xx
word_1=With
word_lemma_1=i
word=WHAT
word_1=In
word_1=ah
word_1=jim
word_1=OH
word_1=vi
word_shape_1=x.


## Results on run-on-test.json

In [14]:
validation_data = pd.read_json('data\\run-on-test.json', orient='values', compression=None, typ='series')
classify(validation_data, pipe, prepare_features_pos_3gram, nlp)

             precision    recall  f1-score   support

        0.0       0.99      0.98      0.99      4542
        1.0       0.56      0.66      0.61       155

avg / total       0.97      0.97      0.97      4697

