# Building a TensorFlow Version of the Readability Measure

## Import Data and take `READABILITY_SCORES` and `POS_DENSITY` features

In [1]:
import sys
# Make the project's data package importable. The path is relative, so it
# resolves against the notebook's current working directory.
sys.path.append('../../data')
from corpus import load_corpus
# Load the 'weebit' corpus bundle; it is indexed by the string keys used below.
data = load_corpus('weebit')

# Train/test splits. The *_onehot variants are presumably one-hot encodings of
# the class labels (used later as Keras targets) — TODO confirm in `corpus`.
X_train = data['X_train']
y_train = data['y_train']
y_train_onehot = data['y_train_onehot']
X_test = data['X_test']
y_test = data['y_test']
y_test_onehot = data['y_test_onehot']

# FEATURES_NAMES maps a feature-group name to a list of column names; the two
# groups used in this notebook are concatenated into one column selection.
FEATURES_NAMES = data['FEATURES_NAMES']
features = FEATURES_NAMES['READABILITY_SCORES'] + FEATURES_NAMES['POS_DENSITY']

# Restrict both splits to the selected feature columns.
X_train = X_train[features]
X_test = X_test[features]

Using TensorFlow backend.


## Evaluation Functions

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score

def threshold_socre(y_true, y_pred):
    """Return the fraction of predictions within one level of the true label.

    A prediction counts as a hit when ``|y_true - y_pred| <= 1``, i.e. it is
    either exact or off by a single adjacent class.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers, same length
        True and predicted class indices. Any array-like is accepted (lists,
        tuples, NumPy arrays, pandas Series); inputs are converted to NumPy
        arrays so that plain Python sequences work too.

    Returns
    -------
    numpy.float64
        Share of samples in ``[0, 1]`` whose prediction is at most one class
        away from the truth (``nan`` for empty input).
    """
    # Coerce first: subtracting two Python lists raises TypeError.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # The mean of the boolean mask is the hit ratio.
    return np.mean(np.abs(y_true - y_pred) <= 1)

def calc_scores(y_true, y_pred_probs):
    """Score class-probability predictions via their expected class index.

    Each row of ``y_pred_probs`` is collapsed to the probability-weighted
    average of the class indices ``0 .. n_classes - 1``; that average is
    rounded and clipped to a valid class index before scoring.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class indices.
    y_pred_probs : array-like of shape (n_samples, n_classes)
        Per-class probabilities. The number of classes is taken from the
        second dimension instead of being fixed at 5.

    Returns
    -------
    dict
        ``{'accuracy': ..., 'threshold': ...}`` where ``accuracy`` is the exact
        match rate and ``threshold`` is the share of predictions at most one
        class away from the truth.
    """
    y_pred_probs = np.asarray(y_pred_probs)
    n_classes = y_pred_probs.shape[1]
    # Expected class index under the predicted distribution.
    y_pred_avg = (y_pred_probs * np.arange(n_classes)).sum(axis=1)
    # Round to the nearest class and keep it inside the valid index range.
    y_pred_avg_classes = y_pred_avg.round().clip(0, n_classes - 1).astype(int)
    return {'accuracy': accuracy_score(y_true, y_pred_avg_classes),
            'threshold': threshold_socre(y_true, y_pred_avg_classes)}

# Wrap both metrics as scikit-learn scorer objects so they can be passed as
# `scoring=` to the cross-validation helpers.
accuracy_scorer = make_scorer(accuracy_score)
threshold_scorer = make_scorer(threshold_socre)

def calc_scores_with_cv(model, X, y, cv=5):
    """Cross-validate ``model`` and return the mean accuracy and threshold score.

    Both metrics are computed in a single cross-validation pass, so the model
    is fit once per fold (instead of once per fold *per metric*) and both
    numbers are measured on the same fitted estimators.

    Parameters
    ----------
    model : scikit-learn compatible estimator
    X, y : training features and labels
    cv : int or CV splitter, default 5
        Forwarded unchanged to scikit-learn.

    Returns
    -------
    dict
        ``{'accuracy': mean fold accuracy, 'threshold': mean fold threshold score}``.
    """
    # Imported locally so this function does not depend on another cell's imports.
    from sklearn.model_selection import cross_validate

    scoring = {'accuracy': accuracy_scorer, 'threshold': threshold_scorer}
    cv_results = cross_validate(model, X, y, scoring=scoring, cv=cv)
    # cross_validate reports each scorer under the key 'test_<name>'.
    return {'accuracy': np.mean(cv_results['test_accuracy']),
            'threshold': np.mean(cv_results['test_threshold'])}


## Build a Softmax Layer for the SVM Probabilities Calculation

In [16]:
from sklearn.svm import SVC

# RBF-kernel SVM. probability=True is required for predict_proba below;
# decision_function_shape='ovo' makes decision_function return one score per
# pair of classes (used later as input to the softmax layer).
model_ovo = SVC(kernel='rbf', C=1, probability=True, decision_function_shape='ovo')

# Cross-validated scores on the training split (the estimator is cloned and
# refit per fold by scikit-learn).
print('CV:', calc_scores_with_cv(model_ovo, X_train, y_train))

# Fit on the full training split for the held-out evaluation.
model_ovo.fit(X_train, y_train)

# NOTE(review): y_pred is not used by any cell visible here.
y_pred = model_ovo.predict(X_test)
y_pred_probs = model_ovo.predict_proba(X_test)
# calc_scores derives the class from the probability-weighted average class
# index, whereas model_ovo.score uses the classifier's own predict(); the two
# accuracies printed below therefore measure different decision rules.
print('Test:', calc_scores(y_test, y_pred_probs))
print('Test - as normal classifier accuracy:', model_ovo.score(X_test, y_test))

CV: {'accuracy': 0.6990836779451828, 'threshold': 0.9068099410863921}
Test: {'accuracy': 0.6181318681318682, 'threshold': 0.929945054945055}
Test - as normal classifier accuracy: 0.7060439560439561


In [4]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier


def create_prob_calculator_ovo(n_classes=5):
    """Build a single-layer softmax model over one-vs-one SVM decision values.

    The model maps the pairwise (one-vs-one) decision scores of an SVM to a
    probability distribution over the classes.

    Parameters
    ----------
    n_classes : int, default 5
        Number of target classes. The input width is the number of class
        pairs, ``n_classes * (n_classes - 1) // 2`` (10 for the default of 5),
        and the output layer has ``n_classes`` units.

    Returns
    -------
    keras.models.Sequential
        Compiled model (rmsprop, categorical cross-entropy, accuracy metric).
    """
    # A one-vs-one decision function yields one score per unordered class pair.
    n_pairs = n_classes * (n_classes - 1) // 2

    prob_calculator_ovo = Sequential()
    prob_calculator_ovo.add(Dense(n_classes, input_dim=n_pairs, activation='softmax'))

    prob_calculator_ovo.compile(optimizer='rmsprop',
                        loss='categorical_crossentropy',
                        metrics=['accuracy'])

    return prob_calculator_ovo


# Wrap the Keras builder as a scikit-learn estimator so it can be used with
# the cross-validation helpers; a fresh model is built on each fit.
prob_calculator_ovo = KerasClassifier(
    build_fn=create_prob_calculator_ovo,
    epochs=10,
    verbose=0,
)


# Pairwise (one-vs-one) SVM decision values become the input features of the
# softmax layer.
X_dist_ovo_train = model_ovo.decision_function(X_train)
X_dist_ovo_test = model_ovo.decision_function(X_test)

# NOTE(review): model_ovo was already fit on all of X_train, so every CV
# validation fold here contains samples the SVM has seen. The CV scores are
# therefore optimistic relative to the test scores (compare the printed output).
print('CV:', calc_scores_with_cv(prob_calculator_ovo, X_dist_ovo_train, y_train))


# The final fit uses the one-hot targets, while the CV call above used the
# integer labels in y_train.
prob_calculator_ovo.fit(X_dist_ovo_train, y_train_onehot)

# This rebinds y_pred_probs, replacing the SVM probabilities computed earlier.
y_pred_probs = prob_calculator_ovo.predict_proba(X_dist_ovo_test)
print('Test:', calc_scores(y_test, y_pred_probs))

CV: {'accuracy': 0.7819762111775527, 'threshold': 0.9387866635910356}
Test: {'accuracy': 0.6126373626373627, 'threshold': 0.9258241758241759}


## Train ??

In [4]:
from collections import defaultdict, Counter

from tqdm import tqdm

import nltk
nltk.download('brown')
from nltk.corpus import brown

# NOTE(review): this set is not used by any cell visible here; it looks like
# the start of an unfinished tag-replacement table — confirm intent.
TAG_REPLACER = {'NP'}


def replace_tag(tag):
    """Collapse selected Brown tag prefixes into coarser ones.

    Applies, in this order, the substring replacements ``NP -> NN``,
    ``WP -> PR`` and ``WD -> DT`` and returns the resulting tag string.

    The original cell ended in an unterminated list comprehension carrying
    this replace chain, which made the whole cell a SyntaxError; the chain is
    kept verbatim here as a callable so the cell runs and the mapping can be
    applied wherever tags are processed.
    """
    return tag.replace('NP', 'NN').replace("WP","PR").replace("WD","DT")

[nltk_data] Downloading package brown to
[nltk_data]     /home/users/shlohod/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [5]:
# For every token in the Brown corpus, count how often it appears with each
# POS tag: word_tags[token][tag] -> number of occurrences.
# Tokens are used as-is, so differently-cased forms are separate keys.
word_tags = defaultdict(Counter)
for word, pos in tqdm(brown.tagged_words()):
    word_tags[word][pos] +=1

100%|██████████| 1161192/1161192 [00:08<00:00, 131586.00it/s]


In [14]:
# Inspect the tag counts collected for the capitalised token 'The'.
word_tags['The']

Counter({'AT': 6725, 'AT-HL': 81, 'AT-TL': 452})