In [1]:
import json
import spacy
nlp = spacy.load('en_core_web_md')

In [20]:
from sklearn.metrics import classification_report

In [6]:
def parse_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        return json.loads(f.read())

In [7]:
train = parse_data("train_data.json")
test = parse_data("test.json")

In [16]:
import string
punc = list(string.punctuation)

In [29]:
def baseline(data):
    predicted = []
    true = []
    for tokens in data:
        for i in range(len(tokens)):
            true.append(int(tokens[i][1]))
            if i == 0 or tokens[i] in punc :
                predicted.append(0)
                continue
            if i < len(tokens) - 1 and tokens[i + 1] in punc:
                predicted.append(0)
                continue
            if  tokens[i][0].isupper():
                predicted.append(1)
                continue
            predicted.append(0)
    return true, predicted

In [42]:
x,y = baseline(train)
x1,y1 = baseline(test)

In [43]:
print (classification_report(x+x1,y+y1))

             precision    recall  f1-score   support

          0       0.98      0.98      0.98     31390
          1       0.02      0.01      0.02       799

avg / total       0.95      0.96      0.95     32189



In [92]:
def proces_tokens(tokens):
    doc = spacy.tokens.doc.Doc(nlp.vocab, words=tokens)
    processed = nlp.tagger(doc)
    return nlp.parser(processed)

In [93]:
for item in proces_tokens([x[0] for x in train[0]]):    
    print(item.text, item.lemma_, item.pos_, item.tag_)

Where where ADV WRB
are be VERB VBP
you -PRON- PRON PRP
going go VERB VBG
you -PRON- PRON PRP
'll 'll VERB MD
freeze freeze VERB VB
out out PART RP
there there ADV RB
you -PRON- PRON PRP
do do VERB VBP
n't n't ADV RB
even even ADV RB
have have VERB VB
a a DET DT
coat coat NOUN NN
. . PUNCT .


In [101]:
left_punc = ['!','...',',','.',':',';','?']

In [114]:
def word2features(doc, i):    
        word = doc[i]
        features = {
            'word' : word.text,
            'lemma': word.lemma_,
            'postag': word.pos_,
            'tag': word.tag_,
            'word[-3:]': word.text[-3:],
            'word[-2:]': word.text[-2:],
            'word.isupper()': word.text.isupper(),
            'word.istitle()': word.text.istitle(),
            'word.isdigit()': word.text.isdigit(),
        }
        if i > 0:
            word1 = doc[i-1]
            features.update({
                '-1:word' : word1.text,
                '-1:lemma': word1.lemma_,
                '-1:postag': word1.pos_,
                '-1:tag': word1.tag_,
                '-1:word.istitle()': word1.text.istitle(),
                '-1:word.isupper()': word1.text.isupper(),
                '-1:word.isdigit()': word1.text.isdigit(),
                '-1:ngram': word1.lemma_ + '_' + word.lemma_
            })
        else:
            features['BOS'] = True
        if i > 1:
            word2 = doc[i-2]
            features.update({
                '-2:word' : word2.text,
                '-2:lemma': word2.lemma_,
                '-2:postag': word2.pos_,
                '-2:tag': word2.tag_,
                '-2:word.istitle()': word2.text.istitle(),
                '-2:word.isupper()': word2.text.isupper(),
                '-2:word.isdigit()': word2.text.isdigit(),
                '-2:ngram': word2.lemma_ + '_' + word1.lemma_ + '_' + word.lemma_
            })
        if i < len(doc)-1:
            word1 = doc[i+1]
            features.update({
                '+1:word' : word1.text,
                '+1:lemma': word1.lemma_,
                '+1:postag': word1.pos_,
                '+1:tag': word1.tag_,
                '+1:word.istitle()': word1.text.istitle(),
                '+1:word.isupper()': word1.text.isupper(),
                '+1:word.isdigit()': word1.text.isdigit(),
                '+1:ngram': word.lemma_ + '_' + word1.lemma_
            })
        else:
            features['EOS'] = True
        if i < len(doc)-2:
            word2 = doc[i+2]
            features.update({
                '+2:word' : word2.text,
                '+2:lemma': word2.lemma_,
                '+2:postag': word2.pos_,
                '+2:tag': word2.tag_,
                '+2:word.istitle()': word2.text.istitle(),
                '+2:word.isupper()': word2.text.isupper(),
                '+2:word.isdigit()': word2.text.isdigit(),
                '+2:ngram': word2.lemma_ + '_' + word1.lemma_ + '_' + word.lemma_
            })
        else:
            features['EOS'] = True        
        return features

In [115]:
tokens = [x[0] for x in train[0]]   
doc = proces_tokens(tokens)
word2features(doc,0)

{'+1:lemma': 'be',
 '+1:ngram': 'where_be',
 '+1:postag': 'VERB',
 '+1:tag': 'VBP',
 '+1:word': 'are',
 '+1:word.isdigit()': False,
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+2:lemma': '-PRON-',
 '+2:ngram': '-PRON-_be_where',
 '+2:postag': 'PRON',
 '+2:tag': 'PRP',
 '+2:word': 'you',
 '+2:word.isdigit()': False,
 '+2:word.istitle()': False,
 '+2:word.isupper()': False,
 'BOS': True,
 'lemma': 'where',
 'postag': 'ADV',
 'tag': 'WRB',
 'word': 'Where',
 'word.isdigit()': False,
 'word.istitle()': True,
 'word.isupper()': False,
 'word[-2:]': 're',
 'word[-3:]': 'ere'}

In [116]:
print(doc)

Where are you going you 'll freeze out there you do n't even have a coat . 


In [118]:
def sent2features(sent):    
    tokens = [x[0] for x in sent]   
    doc = proces_tokens(tokens)
    return [word2features(doc,i) for i in range(len(doc))]

In [122]:
def sent2labels(sent):
    return [x[1] for x in sent]

In [132]:
def data2features(data):
    results = []
    for s in data:
        results += (sent2features(s))
    return results

def data2labels(data):
    results = []
    for s in data:
        results+=(sent2labels(s))
    return results

In [126]:
from sklearn import linear_model
from sklearn.feature_extraction import DictVectorizer

In [136]:
vec = DictVectorizer()
X_train = vec.fit_transform(data2features(train))
y_train = data2labels(train)

X_test = vec.transform(data2features(test))
y_test = data2labels(test)

In [137]:
logistic = linear_model.LogisticRegression()
logistic.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [141]:
predicted_train = logistic.predict(X_train)
predicted_test = logistic.predict(X_test)

In [143]:
print (classification_report(y_train, predicted_train))

             precision    recall  f1-score   support

      False       1.00      1.00      1.00     26848
       True       1.00      0.92      0.96       644

avg / total       1.00      1.00      1.00     27492



In [144]:
print (classification_report(y_test, predicted_test))

             precision    recall  f1-score   support

      False       0.98      1.00      0.99      4542
       True       0.80      0.38      0.52       155

avg / total       0.97      0.98      0.97      4697

