In [255]:
import pandas
import numpy
import itertools
import random
import pickle

from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from multiprocessing import Pool, cpu_count

import pysymspell.symspell as symspell
from typos_functions import *

## Baseline model

In [252]:
MAX_DISTANCE = 2  # largest edit distance allowed between a typo and a candidate


class Baseline:
    """
    Typos correction model, based on the SymSpell lookup algorithm

    https://github.com/wolfgarbe/SymSpell

    and a simple Random Forest classifier, based on token frequencies
    and edit distance between typo and candidate.

    Requires file containing tokens frequencies in a format 'token, frequency'.

    Training data: dataframe indexed by 'id' and containing columns 'identifier', 'typo'.
    Testing data: dataframe indexed by 'id' and containing column 'typo'.

    All data/candidates files are read with ``pandas.read_pickle`` and the
    fitted model is written with ``pickle``; unpickling can execute arbitrary
    code, so only pass files that come from a trusted source.
    """

    # Columns of the candidates table, in the order rows are built.
    _CANDIDATE_COLUMNS = ['id', 'typo', 'candidate', 'typo_freq', 'cand_freq', 'distance']
    # Columns of the candidates table that are fed to the classifier.
    _FEATURE_COLUMNS = ['typo_freq', 'cand_freq', 'distance']

    def __init__(self, freq_file):
        """
        Load the frequency dictionary into SymSpell and into ``self.frequencies``.

        :param freq_file: path of the token frequencies file.
        """
        self.checker = symspell.SymSpell(max_dictionary_edit_distance=MAX_DISTANCE)
        self.checker.load_dictionary(freq_file)
        self.frequencies = read_frequencies(freq_file)

    def fit(self, train_file, cand_file=None, dump_file=None, random_state=None):
        """
        Train the Random Forest on (typo, candidate) pairs.

        :param train_file: pickled training dataframe.
        :param cand_file: pickled candidates table to reuse; when None the
                          candidates are generated and saved next to
                          `train_file` with a 'cand_' file name prefix.
        :param dump_file: when given, the fitted object is pickled to this path.
        :param random_state: forwarded to RandomForestClassifier; pass an int
                             for reproducible training (None keeps the
                             classifier's default, non-deterministic seeding).
        """
        train_df = pandas.read_pickle(train_file)
        self.identifiers = train_df.identifier.copy()

        if cand_file is None:
            self.candidates = self._create_candidates(
                train_df, self._default_cand_file(train_file))
        else:
            self.candidates = pandas.read_pickle(cand_file)

        self.train_matrix = self._create_matrix(self.candidates)
        self.train_labels = self._create_labels()
        self.model = RandomForestClassifier(random_state=random_state)
        self.model.fit(self.train_matrix, self.train_labels)

        if dump_file is not None:
            with open(dump_file, 'wb') as f:
                pickle.dump(self, f)

    def suggest(self, test_file, cand_file=None):
        """
        Score correction candidates for every typo of the test data.

        :param test_file: pickled testing dataframe.
        :param cand_file: pickled candidates table to reuse; when None the
                          candidates are generated and saved next to
                          `test_file` with a 'cand_' file name prefix.
        :return: result of `suggest_corrections` applied to the candidates
                 table and the predicted probability of each candidate.
        """
        test_df = pandas.read_pickle(test_file)
        if cand_file is None:
            test_candidates = self._create_candidates(
                test_df, self._default_cand_file(test_file))
        else:
            test_candidates = pandas.read_pickle(cand_file)

        test_matrix = self._create_matrix(test_candidates)
        test_proba = self.model.predict_proba(test_matrix)
        # Labels are 0/1 (see _create_labels), so column 1 is the probability
        # of "candidate is correct" provided both labels occurred in training.
        return suggest_corrections(test_candidates, test_proba[:, 1])

    def correct(self, test_file, cand_file=None):
        """
        Run `suggest` and pass its result to the module-level `correct` helper.

        Inside a method body the bare name `correct` resolves to the
        module-level function, not to this method.
        """
        return correct(self.suggest(test_file, cand_file))

    @staticmethod
    def _default_cand_file(data_file):
        """
        Return the default candidates path for `data_file`: same directory,
        file name prefixed with 'cand_'.
        """
        import os  # local import: `os` is not among the notebook's top-level imports

        directory, name = os.path.split(os.fspath(data_file))
        return os.path.join(directory, 'cand_' + name)

    def _freq(self, token):
        """Return the frequency of `token`, or 0 when it is not in the dictionary."""
        try:
            return self.frequencies[token]
        except KeyError:
            return 0

    def _candidate_rows(self, typo_id, typo):
        """
        Build the candidates-table rows for a single typo.

        Only the first suggestion seen at each distinct edit distance is kept
        (presumably the best one at that distance; this relies on the order
        in which pysymspell returns suggestions, which should be confirmed).
        When the lookup yields nothing, the typo itself is emitted as its own
        candidate with distance 0.

        :return: list of tuples ordered as `_CANDIDATE_COLUMNS`.
        """
        typo_freq = self._freq(typo)
        rows = []
        last_dist = -1
        # NOTE(review): the literal 2 is presumably the SymSpell verbosity
        # level; confirm against the pysymspell `lookup` signature.
        for suggestion in self.checker.lookup(typo, 2, MAX_DISTANCE):
            if suggestion.distance != last_dist:
                rows.append((typo_id, typo, suggestion.term, typo_freq,
                             self._freq(suggestion.term), suggestion.distance))
                last_dist = suggestion.distance

        if not rows:
            rows.append((typo_id, typo, typo, typo_freq, typo_freq, 0))
        return rows

    def _create_candidates(self, data, cand_file):
        """
        Generate the candidates table for every typo in `data` and pickle it.

        :param data: dataframe with a 'typo' column; its index values become
                     the 'id' column of the result.
        :param cand_file: path the resulting table is pickled to.
        :return: dataframe with columns `_CANDIDATE_COLUMNS`.
        """
        # Collect plain rows and build the frame once: appending to a
        # DataFrame inside the loop is quadratic, and DataFrame.append no
        # longer exists in pandas >= 2.0.
        rows = []
        for typo_id, row in tqdm(data.iterrows(), total=len(data)):
            rows.extend(self._candidate_rows(typo_id, row.typo))

        candidates = pandas.DataFrame(rows, columns=self._CANDIDATE_COLUMNS)
        candidates.to_pickle(cand_file)
        return candidates

    def _create_labels(self):
        """
        Return a 0/1 array with one entry per row of `self.candidates`:
        1 when the candidate equals the true identifier of the row's typo.
        """
        labels = [
            int(candidate == self.identifiers.loc[typo_id])
            for typo_id, candidate in zip(self.candidates['id'],
                                          self.candidates['candidate'])
        ]
        # Explicit dtype keeps the array integer even when there are no rows.
        return numpy.array(labels, dtype='int')

    def _create_matrix(self, candidates):
        """Return the integer feature matrix (`_FEATURE_COLUMNS`) of `candidates`."""
        return numpy.array(candidates.loc[:, self._FEATURE_COLUMNS], dtype='int')

In [260]:
# Train the baseline from the cached training candidates and pickle the fitted object.
baseline = Baseline('frequencies.csv')
baseline.fit(
    train_file='train_c_15k_data.pkl',
    cand_file='cand_train_c_15k_data.pkl',
    dump_file='baseline_11k.pkl',
)

# Score the cached test candidates with the trained model.
baseline_suggestions = baseline.suggest(
    test_file='test_c_15k_data.pkl',
    cand_file='cand_test_c_15k_data.pkl',
)

# Print the detection / top-k suggestion scores against the test data.
print_suggestion_results(
    pandas.read_pickle('test_c_15k_data.pkl'),
    baseline_suggestions,
)

DETECTION SCORE

{'tn': 2303, 'fn': 543, 'tp': 1734, 'fp': 94}
Accuracy: 0.863714163457424
Precision: 0.9485776805251641
Recall: 0.761528326745718
F1: 0.8448233861144945

FIRST SUGGESTION SCORE

{'tn': 2303, 'fn': 944, 'tp': 1333, 'fp': 94}
Accuracy: 0.7779204107830552
Precision: 0.9341275402943238
Recall: 0.585419411506368
F1: 0.7197624190064795

FIRST TWO SUGGESTIONS SCORE

{'tn': 2396, 'fn': 763, 'tp': 1514, 'fp': 1}
Accuracy: 0.8365425759520753
Precision: 0.9993399339933994
Recall: 0.6649099692577953
F1: 0.7985232067510549

FIRST THREE SUGGESTIONS SCORE

{'tn': 2397, 'fn': 760, 'tp': 1517, 'fp': 0}
Accuracy: 0.8373983739837398
Precision: 1.0
Recall: 0.6662274923144489
F1: 0.7996837111228255
