In [1]:
from collections import Counter
import pandas as pd
import re
import spacy, en_core_web_sm

In [1]:
# Location of the course data; machine-specific, adjust when running elsewhere.
# Must end with a path separator because file names are appended with `+`.
DATA_DIR = '/Users/admin/edu/NLP/practical_NLP_course/data/'
# Gzip-compressed CSV with the preprocessed blog texts.
BLOGS_PREP_EN_FILE = 'blog_authors_preped.csv.gzip'
# Emoticons counted by the "smiles" feature.
SMILES = {":‑)", ":)", ":-]", ":]", ":-3", ":3", ":->", ":>", "8-)", "8)", ":-}", ":}", ":o)", ":c)", ":^)", "=]",
          "=)", ":‑D", ":D", "8‑D", "8D", "x‑D", "xD", "X‑D", "XD", "=D", "=3", "B^D", ":-))", ":‑(", ":(", ":‑c",
          ":c", ":‑<", ":<", ":‑[", ":[", ":-||", ">:[", ":{", ":@", ">:(", ":'‑(", ":'(", ":'‑)", ":')", ":-*", ":*",
          ":×", ";‑)", ";)", "*-)", "*)", ";‑]", ";]", ";^)", ":‑,", ";D", ":‑/", ":/", ":‑.", ">:\\", ">:/", ":\\",
          "=/", "=\\", ":L", "=L", ":S", ":‑|", ":|", ":$", ":‑X", ":X", ":‑#", ":#", ":‑&", ":&", "%‑)", "%)"}
# Several entries above are spelled with the non-breaking hyphen (U+2011), so
# the same emoticon typed with a plain ASCII hyphen (e.g. ":-)") would not be
# found. Add the ASCII-hyphen spelling of every entry; original entries are kept.
SMILES |= {smile.replace('\u2011', '-') for smile in SMILES}

The `prepare_features` function below extracts the following count features from a text:
<ul>
<li>count of tokens</li>
<li>count of sentences</li>
<li>count of words</li>
<li>count of title-words</li>
<li>count of caps-words</li>
<li>count of punctuation</li>
<li>count of numbers</li>
<li>count of smiles</li>
<li>count of unique words</li>
<li>count of each POS (ADJ, ADV, NOUN etc.)</li>
    </ul>

In [2]:
def prepare_features(text):
    """Count surface and part-of-speech features of a single text.

    The text is whitespace/underscore-normalized, parsed with the module-level
    spaCy pipeline ``nlp`` and reduced to a ``Counter`` of features.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN produced by an empty CSV
        cell) is treated as an empty text.

    Returns
    -------
    collections.Counter
        Keys: ``token_cnt``, ``sent_cnt`` (always at least 1), ``word_cnt``,
        ``title_cnt``, ``caps_cnt``, ``number_cnt``, ``smiles_cnt``,
        ``punct_cnt``, ``uniq_cnt`` (distinct lemmas of alphabetic tokens) and
        one ``pos_<TAG>`` key per part-of-speech tag seen. Keys for features
        that never occurred may be absent (a Counter reads them as 0).
    """
    # Missing values would make re.sub raise; count them as an empty text.
    if not isinstance(text, str):
        text = ''
    # Collapse every run of whitespace/underscores into a single space.
    text = re.sub(r'[\s_]+', ' ', text).strip(' _')
    doc = nlp(text)
    # Start at one sentence so that per-sentence ratios never divide by zero.
    cnt = Counter({'sent_cnt': 1})
    unique_lemmas = set()
    for position, token in enumerate(doc):
        cnt['token_cnt'] += 1
        # The first token may itself be flagged as a sentence start; it is
        # already covered by the initial value of 1, so only later sentence
        # starts are added. `is_sent_start` may also be None (unknown).
        if position > 0 and token.is_sent_start:
            cnt['sent_cnt'] += 1
        cnt['word_cnt'] += token.is_alpha
        cnt['title_cnt'] += token.is_title
        cnt['caps_cnt'] += token.is_upper
        cnt['number_cnt'] += token.is_digit
        # Compare the bare token text (no trailing whitespace) and do the
        # lookup before the punctuation test, so that emoticons containing
        # letters or symbols (":D", "xD", "=)") can match as well.
        if token.text in SMILES:
            cnt['smiles_cnt'] += 1
        elif token.is_punct:
            cnt['punct_cnt'] += 1
        if token.is_alpha:
            unique_lemmas.add(token.lemma_)
        cnt['pos_{}'.format(token.pos_)] += 1

    cnt['uniq_cnt'] = len(unique_lemmas)
    return cnt

In [3]:
# Load the small English spaCy pipeline used by `prepare_features`.
# Named-entity recognition is switched off: the features only use tokens,
# POS tags, lemmas and sentence boundaries, so running NER on every text
# would only cost time.
nlp = en_core_web_sm.load(disable=['ner'])

In [13]:
# Read the gzip-compressed CSV of preprocessed blog posts into a DataFrame.
data = pd.read_csv(
    DATA_DIR + BLOGS_PREP_EN_FILE,
    compression='gzip',
    encoding='utf-8',
)

In [14]:
# Compute one Counter of features per text and assemble them into a frame.
# The frame is built once from the list of Counters instead of expanding
# every row with `.apply(pd.Series)`, which creates a Series per row and is
# very slow on a large corpus. Features missing for a text become 0.
# NOTE: column order follows first appearance of each key; access columns by name.
x_df = (
    pd.DataFrame(data.text.apply(prepare_features).tolist(), index=data.index)
    .fillna(0)
    .astype(int)
)

KeyboardInterrupt: 

Add derived features, computed as ratios of the count columns above:
<ul>
    <li>punctuation per sentence</li>
    <li>punctuation share of all tokens</li>
    <li>tokens per sentence</li>
    <li>words per token</li>
    <li>unique words rate</li>
    <li>title words rate</li>
    <li>caps words rate</li>
    <li>verb rate</li>
    <li>adjective rate</li>
    <li>pronoun rate</li>
</ul>

In [8]:
def _safe_rate(numerator, denominator):
    """Divide two count columns element-wise; rows with a zero denominator give 0.0.

    A plain division would produce NaN (0/0) or inf (n/0) for texts without
    tokens or without alphabetic words.
    """
    # Zero denominators are masked to NaN, so the quotient is NaN there and
    # is then replaced by 0.0.
    return numerator.div(denominator.where(denominator != 0)).fillna(0.0)


# (new column, numerator column, denominator column)
RATE_FEATURES = (
    ('punct_rate', 'punct_cnt', 'token_cnt'),
    ('punct_sent_rate', 'punct_cnt', 'sent_cnt'),
    ('token_per_sent', 'token_cnt', 'sent_cnt'),
    ('word_rate', 'word_cnt', 'token_cnt'),
    ('uniq_rate', 'uniq_cnt', 'word_cnt'),
    ('title_rate', 'title_cnt', 'word_cnt'),
    ('caps_rate', 'caps_cnt', 'word_cnt'),
    ('verb_rate', 'pos_VERB', 'token_cnt'),
    ('adj_rate', 'pos_ADJ', 'token_cnt'),
    ('pron_rate', 'pos_PRON', 'token_cnt'),
)

for rate_col, numerator_col, denominator_col in RATE_FEATURES:
    x_df[rate_col] = _safe_rate(x_df[numerator_col], x_df[denominator_col])

Save the file with features to the disk

In [None]:
# Persist the feature table next to the input data as a gzip-compressed CSV.
x_df.to_csv(
    DATA_DIR + 'blog_authors_features.csv.gzip',
    compression="gzip",
)