# End of sentence recognition

read prepared file

In [50]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv('lucene_corpus.csv')
df.replace(['False', 'True'], [0, 1], inplace=True)

In [51]:
counts = pd.read_csv('end_of_sents.tsv', header=None, sep='\t')
end_threegrams_count = counts[0].value_counts().to_dict()
end_threegrams_lem_count = counts[1].value_counts().to_dict()
threegrams_count = df['threegrams'].value_counts().to_dict()
threegrams_lem_count = df['threegrams_lemma'].value_counts().to_dict()

In [52]:
end_freq = {k: end_threegrams_count.get(k, 0.0)/v for k, v in threegrams_count.items()}
end_freq_lem = {k: end_threegrams_lem_count.get(k, 0.0)/v for k, v in threegrams_lem_count.items()}

In [53]:
df['end_freq'] = df['threegrams'].apply(lambda x: end_freq.get(x, 0.0))
df['end_freq_lem'] = df['threegrams_lemma'].apply(lambda x: end_freq_lem.get(x, 0.0))

In [54]:
pos_vect = CountVectorizer().fit(df['n_pos'])

In [55]:
pos_n_df = pd.DataFrame(pos_vect.transform(df['n_pos']).todense(),
                      columns=['n_{}'.format(x) for x in pos_vect.get_feature_names()], index=df.index)

In [56]:
pos_na1_df = pd.DataFrame(pos_vect.transform(df['na1_pos'].fillna('')).todense(),
                      columns=['na1_{}'.format(x) for x in pos_vect.get_feature_names()], index=df.index)

In [57]:
pos_nb1_df = pd.DataFrame(pos_vect.transform(df['nb1_n_lemma'].fillna('')).todense(),
                      columns=['nb1_{}'.format(x) for x in pos_vect.get_feature_names()], index=df.index)

In [58]:
model_columns = ['n_is_aplpha', 'n_is_digit', 'n_is_lower', 'n_is_punct', 'n_is_title',
                 'na1_is_aplpha', 'na1_is_digit', 'na1_is_lower', 'na1_is_punct', 'na1_is_title',
                 'na2_is_aplpha', 'na2_is_digit', 'na2_is_lower', 'na2_is_punct', 'na2_is_title',
                 'nb1_n_is_aplpha', 'nb1_n_is_digit', 'nb1_n_is_lower', 'nb1_n_is_punct', 'nb1_n_is_title',
                 'nb2_n_is_aplpha', 'nb2_n_is_digit', 'nb2_n_is_lower', 'nb2_n_is_punct', 'nb2_n_is_title',
                 'end_freq', 'end_freq_lem']

In [128]:
df.drop((df.loc[~df['nb1_n_is_aplpha'].fillna(False).isin({False, True})].index), inplace=True)
X_train = pd.concat([df[model_columns], pos_n_df, pos_na1_df, pos_nb1_df], axis=1, join="inner")

In [142]:
X_train.shape

(268135, 72)

In [130]:
for col in model_columns:
    if 'is_aplpha' in col:
        X_train[col].fillna(True, inplace=True)

X_train.fillna(False, inplace=True)

In [134]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score

clf = LogisticRegressionCV(scoring='roc_auc')
clf.fit(X_train, df['label'])

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring='roc_auc', solver='lbfgs', tol=0.0001,
           verbose=0)

In [135]:
clf.scores_

{True: array([[ 0.99917634,  0.9994749 ,  0.99967743,  0.99983718,  0.99986801,
          0.99985754,  0.9998751 ,  0.99988539,  0.99988651,  0.99988665],
        [ 0.99935213,  0.99950037,  0.99962607,  0.99977492,  0.99983657,
          0.99983034,  0.99980193,  0.999781  ,  0.99977516,  0.99977612],
        [ 0.99935521,  0.99949841,  0.99966205,  0.99984712,  0.99992987,
          0.99993416,  0.99992141,  0.99990828,  0.99990689,  0.99990672]])}

In [138]:
import numpy as np

def top_features(columns, clf, n):
    """Prints features with the highest coefficient values, per class"""
    feature_names = columns
    for i, class_label in enumerate(clf.classes_):
        top = np.argsort(clf.coef_[i])
        reversed_top = top[::-1]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in reversed_top[:n])))

In [139]:
top_features(X_train.columns, clf, 10)

False: end_freq end_freq_lem na1_is_title nb1_n_is_punct n_noun n_intj na1_cconj nb1_pron n_num na1_det


IndexError: index 1 is out of bounds for axis 0 with size 1

In [140]:
clf.coef_

array([[  3.38604596e-01,   3.24394267e-01,   1.32963724e-01,
          1.24591244e-01,  -6.17018383e-01,   4.27608916e-01,
          2.65479227e-01,  -1.76745991e+00,  -6.92800232e-01,
          1.86097201e+00,   2.25157860e-01,  -6.16602022e-02,
         -4.70088046e-01,  -2.07597681e-01,   7.50614425e-02,
         -3.36586872e-01,   6.93827485e-03,  -6.29054091e-01,
          1.14840442e+00,  -3.52797906e-01,   8.60275479e-02,
         -1.00310909e-01,   3.78065568e-01,   7.13395545e-02,
         -3.06617747e-01,   8.37726258e+00,   5.19636752e+00,
          1.18546941e-01,  -5.02568054e-01,   6.10242440e-01,
         -7.48646859e-01,  -1.56801697e-01,   9.90591048e-01,
          1.09153195e+00,   6.94066303e-01,  -7.44760723e-02,
          6.27280459e-02,   2.75666133e-01,   1.07329121e-01,
         -1.18037080e-03,  -4.10073914e-01,   1.72807148e-01,
         -3.01938507e-01,   2.34222210e-01,   4.61072119e-01,
          8.53603211e-01,   6.52293427e-01,   5.01215440e-01,
        