In [1]:
import gzip
import json
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle

In [2]:
with open('run-on-test.json') as fr:
    test = json.load(fr)

In [3]:
y_test = [w[1] for s in test for w in s]
x_test = [w[0] for s in test for w in s]

#### Baseline classifier - bigrams only :)

In [4]:
bigrams = Counter()
with gzip.open('bigrams_v2.txt.gz') as fr:
    for line in fr.readlines():
        first, second, count = line.decode().strip().split('\t')
        bigrams[(first, second)] = int(count)

In [5]:
y_predicted = []
for s in test:
    for i in range(len(s)-1):
        first = s[i][0].lower()
        second = s[i+1][0].lower()
        if first.isalpha() \
        and second.isalpha():
            y_predicted.append( not bigrams[(first, second)] )
        else:
            y_predicted.append(False)
    y_predicted.append(False)

In [6]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

       False       0.97      0.94      0.96      4542
        True       0.12      0.25      0.17       155

   micro avg       0.92      0.92      0.92      4697
   macro avg       0.55      0.60      0.56      4697
weighted avg       0.95      0.92      0.93      4697



#### Training data

In [7]:
with gzip.open('run-on-train.json.gz', 'rt') as fr:
    train = json.load(fr)

In [8]:
x_train = [w[0] for s in train for w in s]
y_train = [w[1] for s in train for w in s]

In [9]:
def get_features(tokens):
    features_list = []
    for i, token in enumerate(tokens):
        features = {}
        features['1l'] = tokens[i].lower()
        features['1t'] = tokens[i].istitle()
        features['1a'] = tokens[i].isalpha()
        if i < len(tokens)-1:
            features['2l'] = tokens[i+1]
            features['2t'] = tokens[i+1].istitle()
            features['2a'] = tokens[i+1].isalpha()
        if i > 0:
            features['0l'] = tokens[i-1]
            features['0t'] = tokens[i-1].istitle()
            features['0a'] = tokens[i-1].isalpha()
        if features['1a'] and features['2a']:
            features['bg'] = bigrams[(features['1l'], features['2l'])] > 0
        
        features_list.append(features)
        
    return features_list

In [10]:
v = DictVectorizer()
train_features = get_features(x_train)
v_train = v.fit_transform(train_features)

In [15]:
clf = LogisticRegression(random_state=0, max_iter=100).fit(v_train, y_train)



#### Test

In [None]:
test_features = get_features(x_test)
v_test = v.transform(test_features)
y_predicted = clf.predict(v_test)

In [20]:
print (classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

       False       0.98      0.99      0.99      4542
        True       0.72      0.50      0.59       155

   micro avg       0.98      0.98      0.98      4697
   macro avg       0.85      0.75      0.79      4697
weighted avg       0.97      0.98      0.97      4697

