In [1]:
import json
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from prepare_data import prepare_data

In [2]:
# Read the blog corpus. Only lines whose first character is alphabetic are
# kept (stripped of surrounding whitespace); of those, the first 100000 that
# end with a period become the training sentences.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm the corpus encoding matches.
with open('../../../tasks/02-structural-linguistics/blog2008.txt', 'r') as contents:
    lines = []
    for raw_line in contents:
        if raw_line[0].isalpha():
            lines.append(raw_line.strip())

train_sentences = []
for candidate in lines:
    if candidate.endswith('.'):
        train_sentences.append(candidate)
train_sentences = train_sentences[:100000]

In [None]:
# Load the run-on test data and flatten it into sentence strings
# (test_sentences) plus one flat list of token labels (test_labels).
with open('../../../tasks/07-language-as-sequence/run-on-test.json') as json_file:
    # NOTE(review): the [18:19] slice keeps only the entry at index 18 — this
    # looks like a leftover debugging slice; confirm before trusting results.
    data = json.load(json_file)[18:19]

test_sentences = []
test_labels = []
final_sentence_index = len(data) - 1
for sentence_index, tokens in enumerate(data):
    final_token_index = len(tokens) - 1
    # Each token is indexed as token[0] (text) and token[1] (label).
    words = [token[0] for token in tokens]
    # The very last token of the very last sentence contributes no label.
    token_labels = [
        token[1]
        for token_index, token in enumerate(tokens)
        if token_index != final_token_index or sentence_index != final_sentence_index
    ]
    test_sentences.append(' '.join(words))
    test_labels.extend(token_labels)

In [3]:
def extract(data, batch_size=5000):
    """Run ``prepare_data`` over ``data`` in batches and merge the results.

    Every run of up to ``batch_size`` consecutive items is joined with single
    spaces into one string and handed to ``prepare_data``; the labels and
    features returned for each batch are concatenated in batch order. Items
    on either side of a batch boundary are never joined into the same string.

    Args:
        data: Sequence of strings; must support ``len()`` and slicing.
        batch_size: Maximum number of items joined per ``prepare_data`` call.
            Defaults to 5000, the value that used to be hard-coded.

    Returns:
        A ``(labels, features)`` tuple of lists.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative step would
            otherwise silently produce empty results).
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    labels, features = [], []
    for start in range(0, len(data), batch_size):
        batch_text = ' '.join(data[start:start + batch_size])
        batch_labels, batch_features = prepare_data(batch_text)
        labels.extend(batch_labels)
        features.extend(batch_features)
    return labels, features

In [4]:
# Turn the training sentences into labelled feature dicts, then hold out 30%
# of the examples for evaluation (fixed seed so the split is repeatable).
train_labels, train_features = extract(train_sentences)
split_parts = train_test_split(
    train_features,
    train_labels,
    test_size=0.3,
    random_state=1,
)
x_train, x_test, y_train, y_test = split_parts

In [5]:
# Classification pipeline: feature dicts -> vectorized matrix -> scaling ->
# logistic regression.
clf = Pipeline([
    ('vectorizer', DictVectorizer()),
    # with_mean=False scales without centering, which is required for the
    # sparse matrices DictVectorizer produces by default.
    ('scaler', StandardScaler(with_mean=False)),
    # The 'sag' solver shuffles the data randomly; seeding it (same seed as
    # the train/test split) makes the fitted model and the reported metrics
    # reproducible across runs.
    ('logistregress', LogisticRegression(solver='sag', random_state=1))
])

In [7]:
# Fit the pipeline on the training portion, predict the held-out portion and
# print per-class precision / recall / F1.
predicted = clf.fit(x_train, y_train).predict(x_test)
report = classification_report(y_test, predicted)
print(report)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00    717237
        True       0.97      0.91      0.94     34245

   micro avg       0.99      0.99      0.99    751482
   macro avg       0.98      0.95      0.97    751482
weighted avg       0.99      0.99      0.99    751482

