In [92]:
# Read data
# Data: newsela.com geography texts, Robert Heinlein "Stranger in the strange land"
with open('newsela_short_short', 'rb') as file:
    lines = file.readlines()
    

In [11]:
import spacy
nlp = spacy.load('en')

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn import linear_model
from sklearn.metrics import classification_report

In [127]:
# Generate train and test data
# Read sequence of sentences, convert it to list of tokens.
# When new sentence starts randomly decide if merge it with previous one.
# "Merge" = Remove last token of previous sentence (.!?) and mark its 
# last word as end of sentence (1 in train_classes vector) 

import random

train_tokens = []
train_classes = []
i = 0
sent_count = 0
for line in lines:
    doc = nlp(line.decode('utf-8'))
    sent_count += len(list(doc.sents))
    
    for sent in doc.sents:
        r = random.random()
        sent_end_removed = False
        if (r < 0.6) and (i > 1):
            i -= 1
            train_classes[i - 1] = 1
            sent_end_removed = True
            
        for word in doc:
            if (word.pos_ == 'SPACE') or (word.pos_ == 'X'):
                continue
            if (sent_end_removed == True):
                word_text = word.text
                # there was an attempt to lowercase first word of some sentences. But it was useless.
                train_tokens[i] = word_text
                train_classes[i] = 0
                sent_end_removed = False
            else:
                train_tokens.append(word.text)
                train_classes.append(0)
            i += 1


In [128]:
print 'Number of sentences: ', sent_count
print 'Number of broken sentences: ', sum(train_classes)
print 'Number of tokens: ', len(train_tokens)
print 'Number of tokens marked as sentence end: ', sum(train_classes)

Number of sentences:  7514
Number of broken sentences:  4473
Number of tokens:  582820
Number of tokens marked as sentence end:  4473


In [129]:
# Define feature set

def extractFeatures(tokens, i):
    features = dict()
    
    # Basic feature set
    
    features["word"] = tokens[i].lower()
    features["word-1"] = tokens[i-1].lower() if i > 1 else "NONE"
    features["word+1"] = tokens[i+1].lower() if i < (len(tokens) - 1) else "NONE" 
    
    features["left-bigram"] = tokens[i-2].lower() + "_" + tokens[i-1].lower() \
        if i > 1 else "NONE"
    features["right-bigram"] = tokens[i+1].lower() + "_" + tokens[i+2].lower() \
        if i < (len(tokens) - 2) else "NONE"
    
    features["is_capitalized"] = tokens[i].istitle() 
    features["is_capitalized-1"] = tokens[i-1].istitle() if i > 0 else "NONE"
    features["is_capitalized-2"] = tokens[i-2].istitle() if i > 1 else "NONE"
    features["is_capitalized+1"] = tokens[i+1].istitle() if i < (len(tokens) - 1) else "NONE"
    features["is_capitalized+2"] = tokens[i+2].istitle() if i < (len(tokens) - 2) else "NONE"
    
    # Improvements
    
    ind = i
    while (tokens[ind] not in [u'!', u'.', u'?']) and (ind >= 0):
        ind -= 1
    features["dist_to_last_punct"] = i - ind   

# No improvement, skip this feature  
#     ind = i
#     while (tokens[ind] not in [u'!', u'.', u'?']) and (ind < len(tokens)):
#         ind += 1
#     features["dist_to_next_punct"] = ind - i      

    features["left-3gram"] = tokens[i-3].lower() + "_" + tokens[i-2].lower() + "_" + tokens[i-1].lower() \
        if i > 2 else "NONE"
    features["right-3gram"] = tokens[i+1].lower() + "_" + tokens[i+2].lower() + "_" + tokens[i+3].lower() \
        if i < (len(tokens) - 3) else "NONE"
    features["is_capitalized+3"] = tokens[i+3].istitle() if i < (len(tokens) - 3) else "NONE" 
    features["is_capitalized-3"] = tokens[i-3].istitle() if i > 2 else "NONE"
    
    return features

In [130]:
# Extract features, split train and test set
# Downsample class of words that do not end sentence

train_data, train_labels, test_data, test_labels = [], [], [], []
for i in range(len(train_tokens)):
    r = random.random()
    if (train_classes[i] == 0) and (r > 0.07): #downsampling
        continue
        
    features = extractFeatures(train_tokens, i)

    r = random.random()
    if r < 0.8:
        train_data.append(features)
        train_labels.append(train_classes[i])
    else:
        test_data.append(features)
        test_labels.append(train_classes[i])            

In [131]:
print len(train_data), len(train_labels), sum(train_labels)
print len(test_data), len(test_labels), sum(test_labels)

36075 36075 3604
9021 9021 869


In [60]:
# Bsaseline: Each uppercased word starts new sentence
import numpy as np
predicted_labels_base = np.zeros(len(test_labels))
for i in range(len(test_labels) - 1):
    if test_data[i]["is_capitalized+1"] == True:
        predicted_labels_base[i] = 1

In [61]:
# Quality for baseline
target_names = [u'0', u'1']
print(classification_report(test_labels, predicted_labels_base, target_names=target_names))

             precision    recall  f1-score   support

          0       0.99      0.88      0.93      7933
          1       0.46      0.92      0.62       900

avg / total       0.94      0.88      0.90      8833



In [132]:
#Features to vectors
vec = DictVectorizer()
vec.fit(train_data)
x_train = vec.transform(train_data).toarray()
x_test = vec.transform(test_data).toarray()

In [133]:
# Fit model
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(x_train, train_labels)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [134]:
# Test model on generated test set
predicted_labels = logreg.predict(x_test)
target_names = [u'0', u'1']
print(classification_report(test_labels, predicted_labels, target_names=target_names))

             precision    recall  f1-score   support

          0       1.00      0.99      1.00      8152
          1       0.95      0.96      0.95       869

avg / total       0.99      0.99      0.99      9021



In [None]:
# # # Resuls for generated test set (results may be different because of random downsampling)
# I have more data to reduce these differences, but I get MemoryError :(

# #Basic feature set

#         precision    recall  f1-score   support

#           0       0.99      0.99      0.99      8154
#           1       0.94      0.96      0.95       923

# avg / total       0.99      0.99      0.99      9077

# # Add distance to last .!?
#              precision    recall  f1-score   support

#           0       1.00      0.99      1.00      7933
#           1       0.95      0.96      0.96       900

# avg / total       0.99      0.99      0.99      8833

# # Add 3-grams (no improvement here, but good results on given test set)
#              precision    recall  f1-score   support

#           0       0.99      0.99      0.99      8010
#           1       0.95      0.95      0.95       930

# avg / total       0.99      0.99      0.99      8940



In [24]:
# Read given test data

import json
json_file='../../../tasks/07-language-as-sequence/run-on-test.json'
json_data=open(json_file)
data = json.load(json_data)
json_data.close()

print len(data)

test_tokens_given = []
test_labels_given = []
for sentence in data:
    for token in sentence:
        test_tokens_given.append(token[0])
        if token[1] == True:
            test_labels_given.append(1) 
        else:
            test_labels_given.append(0)
print len(test_tokens_given), len(test_labels_given), sum(test_labels_given)

200
4697 4697 155


In [136]:
# Extract features from given tet data

test_data_given = []
for i in range(len(test_tokens_given)):
    
    features = extractFeatures(test_tokens_given, i)
    test_data_given.append(features)

print len(test_data_given)
# vectorize given test data
x_test_given = vec.transform(test_data_given).toarray()

4697


In [137]:
# Test model on given test set
predicted_labels_given = logreg.predict(x_test_given)
target_names = [u'0', u'1']
print(classification_report(test_labels_given, predicted_labels_given, target_names=target_names))

             precision    recall  f1-score   support

          0       0.98      0.99      0.99      4542
          1       0.67      0.36      0.47       155

avg / total       0.97      0.97      0.97      4697



In [None]:
# # # Resuls for given test set (results may be different because of random downsampling)

# # Basic feature set
#              precision    recall  f1-score   support

#           0       0.98      0.99      0.98      4542
#           1       0.58      0.39      0.47       155

# avg / total       0.97      0.97      0.97      4697


# # Add distance to last .!?
#              precision    recall  f1-score   support

#           0       0.98      0.99      0.99      4542
#           1       0.59      0.38      0.46       155

# avg / total       0.97      0.97      0.97      4697

# # Add 3-grams
#              precision    recall  f1-score   support

#           0       0.98      0.99      0.99      4542
#           1       0.64      0.41      0.50       155

# avg / total       0.97      0.97      0.97      4697

