## Todo

* Find the most unusual features (those that appear at least twice in the text). Ignore features that are not unusual for texts on that *topic*.
* Feed the z-scores of known and unknown texts to a Siamese network.

In [28]:
# standard imports
import string
from collections import Counter

# third-party imports
import textacy
from spacy.en import English
from statistics import mean, stdev

def _normalize_counter(counter, c):
    """Strip the +1 padding from every count and scale it by ``c``, in place.

    Each value ``v`` is replaced with ``(v - 1) / c``. The Counter that was
    passed in is mutated and then returned.
    """
    # snapshot the items so the values can be reassigned while looping
    for key, padded_count in list(counter.items()):
        counter[key] = (padded_count - 1) / c
    return counter

class TextAnalyser:
    """Extract named stylometric features from text and score them as z-scores.

    Features are produced as ``(name, value)`` pairs sorted by name. Every
    character of ``self.alphabet`` and every tag in ``self.pos_keys`` is
    padded in, so each text yields the same feature names in the same order.
    """

    def __init__(self, nlp=None):
        """Store the pipeline and the fixed lists of feature names.

        Args:
            nlp: Callable pipeline used to process texts. When omitted, a new
                ``English()`` pipeline is created.
        """
        # compare against None so a supplied pipeline is never silently replaced
        self.nlp = nlp if nlp is not None else English()

        # alphabet for letter ratios
        self.alphabet = string.ascii_lowercase + "!?:;,.'- "

        # keys that we care about from textacy.stats
        self.basic_keys = [
            'n_long_words', 'n_monosyllable_words', 'n_polysyllable_words',
            'n_sents', 'n_syllables', 'n_unique_words', 'n_words',
        ]

        # keys that we care about for textacy readability stats
        self.readability_keys = [
            'automated_readability_index', 'coleman_liau_index',
            'flesch_kincaid_grade_level', 'flesch_readability_ease',
            'gulpease_index', 'gunning_fog_index', 'lix',
            'wiener_sachtextformel',
        ]

        # parts of speech that we care about from spacy (pos_ not tag_)
        self.pos_keys = [
            'ADJ', 'ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
            'PART', 'PRON', 'PROPN', 'PUNCT', 'SPACE', 'SYM', 'VERB', 'X',
        ]
        self.pos_keys_set = set(self.pos_keys)

    def get_named_features(self, text):
        """Return the features of ``text`` as ``(name, value)`` pairs sorted by name.

        The features are the textacy basic counts and readability scores
        listed in ``self.basic_keys`` / ``self.readability_keys`` (each divided
        by ``len(text)``), the per-character ratios for ``self.alphabet``
        (divided by ``len(text)``), and the per-tag ratios for
        ``self.pos_keys`` (divided by ``len(processed)``).

        Args:
            text: The raw string to analyse.

        Returns:
            A list of ``(feature_name, value)`` tuples ordered by feature name.

        Raises:
            ZeroDivisionError: If ``text`` is empty, because every ratio is
                divided by ``len(text)`` (assuming the pipeline and textacy
                accept the empty string in the first place).
        """
        # TODO: Add bigrams, trigrams?
        processed = self.nlp(text, entity=False, tag=True, parse=True)
        stats = textacy.text_stats.TextStats(processed)
        basic_stats = stats.basic_counts
        readability_stats = stats.readability_stats
        text_len = len(text)

        # NOTE(review): readability scores are also divided by the text length,
        # exactly like the raw counts -- confirm this is intended.
        features = {key: basic_stats[key] / text_len for key in self.basic_keys}
        features.update(
            {key: readability_stats[key] / text_len for key in self.readability_keys}
        )

        # get only the characters we care about
        # append alphabet so that each character artificially appears once;
        # _normalize_counter subtracts that padding again
        allowed_chars = set(self.alphabet)
        char_ratios = Counter(
            ch for ch in text.lower() + self.alphabet if ch in allowed_chars
        )
        char_ratios = _normalize_counter(char_ratios, text_len)

        # calculate pos ratios, padded with one artificial occurrence per tag
        tags = [word.pos_ for word in processed if word.pos_ in self.pos_keys_set]
        pos_ratios = Counter(tags + self.pos_keys)
        # normalize by len(processed) (presumably the token count of the doc)
        pos_ratios = _normalize_counter(pos_ratios, len(processed))

        features.update(char_ratios)
        features.update(pos_ratios)
        # keys are unique, so sorting the items orders them purely by name
        return sorted(features.items())

    def calculate_mean_and_std(self, extracted_texts):
        """Compute the per-feature mean and sample standard deviation.

        Args:
            extracted_texts: A list with one feature list per text, each a
                list of ``(name, value)`` pairs in the same order (as returned
                by ``get_named_features``).

        Returns:
            A ``(means, stds)`` tuple. Both are lists of ``(name, value)``
            pairs, in the feature order of the first text.

        Raises:
            IndexError: If ``extracted_texts`` is empty.
            statistics.StatisticsError: If fewer than two texts are given,
                since ``stdev`` needs at least two data points.
        """
        means = []
        stds = []
        sample = extracted_texts[0]  # get one text for feature size and names
        # fi = feature index
        for fi, feature in enumerate(sample):
            name = feature[0]
            # gather this feature's value across all texts once
            values = [features[fi][1] for features in extracted_texts]
            means.append((name, mean(values)))
            stds.append((name, stdev(values)))
        return means, stds

    def calculate_z_scores(self, extracted_text, means, stds):
        """Calculate the z-score of every feature of a single text.

        Args:
            extracted_text: List of ``(name, value)`` pairs for one text.
            means: List of ``(name, mean)`` pairs, index-aligned with
                ``extracted_text``.
            stds: List of ``(name, std)`` pairs, index-aligned with
                ``extracted_text``.

        Returns:
            A list of ``(zscore, name)`` tuples -- score first, so sorting the
            list orders the features by z-score. A feature whose standard
            deviation is zero gets a z-score of ``0``.
        """
        # z = (X - μ) / σ
        zscores = []
        for fi, feature in enumerate(extracted_text):
            name, value = feature[0], feature[1]
            try:
                zscore = (value - means[fi][1]) / stds[fi][1]
            except ZeroDivisionError:
                # no variation across the reference texts: treat as not unusual
                zscore = 0
            # take the name from the text being scored, not from a global
            zscores.append((zscore, name))
        return zscores
        
def vectorize(str_text):
    """Return an array holding the second element of each item from ``_vectorize``.

    ``_vectorize`` and ``np`` are not defined in this part of the notebook;
    both must already be in scope when this is called.
    """
    return np.array([pair[1] for pair in _vectorize(str_text)])
    
    

In [3]:
# Build the spaCy English pipeline in its own cell; the demo cell below passes
# this shared instance to TextAnalyser.
nlp = English()

In [27]:
# Demo: score the last sample text against the statistics of all samples.
# NOTE: pprint is imported here but not used in this cell.
from pprint import pprint
ta = TextAnalyser(nlp)
texts = ["this is some boring text", "this is also some boring text", "more boring text", "and and and and and interesting"]
# one sorted (name, value) feature list per text
stats = [ta.get_named_features(text) for text in texts]
# the reference mean/std are computed over every text, including the one scored below
means, stds = ta.calculate_mean_and_std(stats)
zscores = ta.calculate_z_scores(stats[-1], means, stds)
# entries are (zscore, name) tuples, so this prints them in ascending z-score order
print(sorted(zscores))






[(-1.4856533708302617, 'n_unique_words'), (-1.4255352930537208, 'o'), (-1.4016054795770003, 't'), (-1.3339880709789862, 'b'), (-1.3339880709789862, 'm'), (-1.3339880709789862, 'x'), (-1.2764444150951448, 'NOUN'), (-0.9919397397762781, 'flesch_readability_ease'), (-0.8589556903873334, 'DET'), (-0.8589556903873334, 'VERB'), (-0.8584192016190515, 'h'), (-0.7961318807627012, 'i'), (-0.7833494518006403, 'ADV'), (-0.7597124593094543, 'e'), (-0.7597124593094543, 'g'), (-0.7597124593094543, 'n_sents'), (-0.7080152423090047, 'gulpease_index'), (-0.6301260378126047, 'ADJ'), (-0.6101146782287797, 's'), (-0.5848909506882148, 'r'), (-0.5419025503979429, 'n_words'), (-0.5, 'l'), (0, '!'), (0, "'"), (0, ','), (0, '-'), (0, '.'), (0, ':'), (0, ';'), (0, '?'), (0, 'ADP'), (0, 'INTJ'), (0, 'NUM'), (0, 'PART'), (0, 'PRON'), (0, 'PROPN'), (0, 'PUNCT'), (0, 'SPACE'), (0, 'SYM'), (0, 'X'), (0, 'c'), (0, 'f'), (0, 'j'), (0, 'k'), (0, 'p'), (0, 'q'), (0, 'u'), (0, 'v'), (0, 'w'), (0, 'y'), (0, 'z'), (0.224713