In [1]:
import gzip
import json
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle

In [2]:
# Load the held-out evaluation set. Later cells read it as a list of sentences,
# each a list of entries whose [0] is the token and [1] is the label.
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII tokens.
with open('run-on-test.json', encoding='utf-8') as fr:
    test = json.load(fr)

In [3]:
# Flatten the nested test sentences into two parallel, token-level lists:
# x_test holds entry[0] (the token), y_test holds entry[1] (its label).
x_test = []
y_test = []
for sentence in test:
    for entry in sentence:
        x_test.append(entry[0])
        y_test.append(entry[1])

#### Baseline classifier (bigram lookup only, no training)

In [4]:
# Bigram frequency table keyed by (first_word, second_word).
# A Counter returns 0 for pairs that were never stored, so lookups of unseen
# bigrams are safe and do not insert new keys.
bigrams = Counter()
with gzip.open('bigrams_v2.txt.gz', 'rb') as fr:
    # Stream the archive line by line; readlines() would first build a list
    # holding every line of the decompressed file.
    for line in fr:
        # Each record is "<first>\t<second>\t<count>".
        first, second, count = line.decode('utf-8').strip().split('\t')
        bigrams[(first, second)] = int(count)

In [5]:
# Baseline: predict the positive label for a token when it and the following
# token are both alphabetic and their lower-cased pair has no count in `bigrams`.
# Exactly one prediction is emitted per token so the result lines up with y_test.
y_predicted = []
for s in test:
    if not s:
        # An empty sentence contributes no labels to y_test, so it must not
        # contribute a prediction either (the unconditional trailing False
        # would otherwise shift every later prediction by one).
        continue
    words = [w[0].lower() for w in s]
    for first, second in zip(words, words[1:]):
        # Non-alphabetic neighbours (punctuation, numbers, ...) are never flagged.
        y_predicted.append(
            first.isalpha() and second.isalpha() and not bigrams[(first, second)]
        )
    # The last token has no successor to form a bigram with.
    y_predicted.append(False)

In [6]:
# Per-class precision / recall / F1 of the bigram baseline on the test tokens.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_predicted,
    )
)

              precision    recall  f1-score   support

       False       0.97      0.94      0.96      4542
        True       0.12      0.25      0.17       155

   micro avg       0.92      0.92      0.92      4697
   macro avg       0.55      0.60      0.56      4697
weighted avg       0.95      0.92      0.93      4697



#### Training

In [7]:
# Load the gzip-compressed training set; later cells read it with the same
# sentence -> [token, label] layout as the test set.
# Decode explicitly as UTF-8: in text mode gzip.open() otherwise uses the
# platform's locale encoding, which can mis-decode non-ASCII tokens.
with gzip.open('run-on-train-sm.json.gz', 'rt', encoding='utf-8') as fr:
    train = json.load(fr)

In [8]:
# Flatten the nested training sentences into two parallel, token-level lists:
# x_train holds entry[0] (the token), y_train holds entry[1] (its label).
x_train = []
y_train = []
for sentence in train:
    for entry in sentence:
        x_train.append(entry[0])
        y_train.append(entry[1])

In [14]:
def get_features(tokens, window=4, bigram_counts=None):
    """Build one context-window feature dict per token.

    For the token at position ``i``, every in-range neighbour at offset ``j``
    (``-window <= j <= window``, including the token itself at ``j == 0``)
    contributes two features:

    * ``'<j>w'`` -- the neighbour, lower-cased
    * ``'<j>t'`` -- ``str.istitle()`` of the neighbour

    Every token except the last additionally gets ``'bg'``: ``True`` when the
    lower-cased (current, next) pair has a positive count in the bigram table.

    Args:
        tokens: Sequence of token strings.
        window: Number of neighbours inspected on each side. Defaults to 4,
            the value that was previously hard-coded.
        bigram_counts: Mapping from ``(first, second)`` lower-cased word pairs
            to counts. Defaults to the notebook-level ``bigrams`` Counter, so
            existing ``get_features(tokens)`` calls behave as before.

    Returns:
        A list with one feature dict per token, in input order.
    """
    n = len(tokens)
    if bigram_counts is None and n > 1:
        # Resolve the global lazily: the table is only consulted when at
        # least one token has a successor.
        bigram_counts = bigrams

    # Normalise each token once instead of once per window it appears in.
    lowered = [token.lower() for token in tokens]
    titled = [token.istitle() for token in tokens]

    features_list = []
    for i in range(n):
        features = {}
        # Clamp the window to the sequence bounds rather than testing every offset.
        for k in range(max(0, i - window), min(n, i + window + 1)):
            offset = str(k - i)
            features[offset + 'w'] = lowered[k]
            features[offset + 't'] = titled[k]
        if i + 1 < n:
            # .get() keeps this working for plain dicts as well as Counters.
            features['bg'] = bigram_counts.get((lowered[i], lowered[i + 1]), 0) > 0
        features_list.append(features)

    return features_list

In [15]:
# Turn every training token into a feature dict, then let the DictVectorizer
# learn the feature vocabulary and encode the dicts in a single step.
train_features = get_features(tokens=x_train)
v = DictVectorizer()
v_train = v.fit_transform(X=train_features)

In [16]:
# Logistic regression over the vectorised context features; the seed is fixed
# so repeated runs give the same model.
clf = LogisticRegression(
    solver='liblinear',
    max_iter=100,
    random_state=0,
)
# fit() returns the estimator itself, so `clf` ends up as the trained model.
clf = clf.fit(v_train, y_train)

#### Test

In [17]:
# Featurise the flattened test tokens the same way as the training tokens.
test_features = get_features(tokens=x_test)
# transform(), not fit_transform(): reuse the vocabulary learned from the training data.
v_test = v.transform(X=test_features)
# NOTE: this rebinds y_predicted, replacing the baseline predictions computed earlier.
y_predicted = clf.predict(X=v_test)

In [18]:
# Per-class precision / recall / F1 of the logistic-regression predictions on the test tokens.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_predicted,
    )
)

              precision    recall  f1-score   support

       False       0.99      0.99      0.99      4542
        True       0.74      0.59      0.65       155

   micro avg       0.98      0.98      0.98      4697
   macro avg       0.86      0.79      0.82      4697
weighted avg       0.98      0.98      0.98      4697

