In [199]:
import pandas
import numpy
import itertools
import random
from tqdm import tqdm
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt

import pysymspell.symspell as symspell
from sklearn.ensemble import RandomForestClassifier
from time import gmtime, strftime

import multiprocessing
from multiprocessing import Pool
multiprocessing.cpu_count()

%matplotlib inline

## Common functions

In [207]:
def read_frequencies(file):
    """
    Reads a whitespace-separated "token count" file into a dictionary.

    file - path to a text file; on every non-blank line the first field is
    used as the key and the second field is parsed as an integer count

    Returns {token : count}. If a token occurs on several lines, the last
    occurrence wins. Blank lines are skipped; a line with a single field or a
    non-integer second field still raises (IndexError / ValueError).
    """
    common_frequencies = {}
    with open(file, 'r') as f:
        for line in f:
            split = line.split()
            if not split:
                # Blank (or whitespace-only) line, e.g. a trailing newline.
                continue
            common_frequencies[split[0]] = int(split[1])
    # The mapping must be returned explicitly, otherwise callers get None.
    return common_frequencies

def suggest_corrections(candidates, pred_proba):
    """
    Groups scored candidates by typo id and ranks them inside each group.

    candidates - DataFrame with columns 'id' and 'candidate'; rows are looked
    up with .loc using the labels 0 .. len(pred_proba) - 1
    (NOTE(review): this presumes a default integer index - confirm at the
    call site)

    pred_proba - indexable sequence of numeric scores, pred_proba[i] belongs
    to row i of candidates

    Returns {id : [(candidate, score)]}, every list sorted by score in a
    descending order (ties keep their original row order). An empty
    pred_proba gives an empty dictionary.
    """
    suggestions = {}
    for index in range(len(pred_proba)):
        typo_id = candidates.loc[index, 'id']
        # Collect per id instead of tracking the "current" id with a sentinel,
        # so no real id value can be mistaken for "nothing seen yet".
        suggestions.setdefault(typo_id, []).append(
            (candidates.loc[index, 'candidate'], pred_proba[index])
        )

    for corrections in suggestions.values():
        # list.sort is stable, so equal scores stay in row order.
        corrections.sort(key=lambda pair: -pair[1])
    return suggestions

def correct(suggestions):
    """
    Picks the top-ranked candidate for every id.

    suggestions - {id : [(candidate, correct_prob)]}, each list non-empty and
    already ordered so that the preferred candidate comes first

    Returns {id : candidate}.
    """
    return {key: ranked[0][0] for key, ranked in suggestions.items()}

## Score functions

In [209]:
def detection_score(typos, suggestions):
    """
    Confusion counts for the typo detection task.

    typos - DataFrame indexed by 'id' with columns 'typo' and 'corrupted'

    suggestions - {id : [(candidate, correct_prob)]}, candidates sorted by
    correct_prob in a descending order

    A token counts as "flagged" when its first suggestion differs from the
    token itself. Returns a dict with the keys 'tp', 'fp', 'tn', 'fn'.
    """
    counts = dict.fromkeys(('tp', 'fp', 'tn', 'fn'), 0)
    for row_id in typos.index:
        is_corrupted = bool(typos.loc[row_id, 'corrupted'])
        top_candidate = suggestions[row_id][0][0]
        flagged = top_candidate != typos.loc[row_id, 'typo']
        if is_corrupted:
            outcome = 'tp' if flagged else 'fn'
        else:
            outcome = 'fp' if flagged else 'tn'
        counts[outcome] += 1
    return counts

def first_k_set(corrections, k):
    """
    Returns the set of candidates found in the first k (candidate, prob)
    pairs of corrections.
    """
    return {candidate for candidate, _ in corrections[:k]}

def score_at_k(typos, suggestions, k):
    """
    Confusion counts for the typo correction task, judged on the top k
    suggestions.

    typos - DataFrame indexed by 'id'; the columns read here are 'corrupted'
    and 'identifier'

    suggestions - {id : [(candidate, correct_prob)]}, candidates sorted by
    correct_prob in a descending order

    k - how many leading suggestions are taken into account

    A row is a hit when its 'identifier' value is among the first k
    suggestions. Corrupted rows give 'tp' (hit) or 'fn' (miss); the other
    rows give 'tn' (hit) or 'fp' (miss).
    """
    counts = dict.fromkeys(('tp', 'fp', 'tn', 'fn'), 0)
    for row_id in typos.index:
        if typos.loc[row_id, 'corrupted']:
            on_hit, on_miss = 'tp', 'fn'
        else:
            on_hit, on_miss = 'tn', 'fp'
        top_k = first_k_set(suggestions[row_id], k)
        hit = typos.loc[row_id, 'identifier'] in top_k
        counts[on_hit if hit else on_miss] += 1
    return counts

def best_correction_score(typos, corrections):
    """
    Confusion counts for a single chosen correction per id.

    typos - DataFrame indexed by 'id'; the columns read here are 'corrupted'
    and 'identifier'

    corrections - mapping id -> chosen candidate; anything that supports
    len() and lookup by id works, e.g. the dict returned by correct() or a
    Series indexed by id

    Corrupted rows give 'tp' when the correction equals 'identifier' and 'fn'
    otherwise; the other rows give 'tn' on equality and 'fp' otherwise.
    Raises AssertionError when the two inputs differ in size.
    """
    # len() instead of .shape[0]: a plain dict has no .shape attribute, while
    # len() gives the same number for a DataFrame, a Series and a dict.
    assert len(typos) == len(corrections), \
        'typos and corrections must have the same number of entries'
    scores = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    for id in typos.index:
        matches = corrections[id] == typos.loc[id, 'identifier']
        if typos.loc[id, 'corrupted']:
            scores['tp' if matches else 'fn'] += 1
        else:
            scores['tn' if matches else 'fp'] += 1
    return scores
                
def accuracy(score):
    """
    Share of correct decisions: (tp + tn) divided by the sum of all counts.

    score - dict with at least the keys 'tp' and 'tn'; every value of the
    dict is included in the total

    Returns 0.0 when the total is zero instead of raising ZeroDivisionError.
    """
    total = sum(score.values())
    if total == 0:
        return 0.0
    return (score['tp'] + score['tn']) / total

def precision(score):
    """
    tp / (tp + fp) for a dict with the keys 'tp' and 'fp'.

    Returns 0.0 when there are no positive predictions (tp + fp == 0)
    instead of raising ZeroDivisionError.
    """
    predicted_positive = score['tp'] + score['fp']
    if predicted_positive == 0:
        return 0.0
    return score['tp'] / predicted_positive

def recall(score):
    """
    tp / (tp + fn) for a dict with the keys 'tp' and 'fn'.

    Returns 0.0 when there are no actual positives (tp + fn == 0) instead of
    raising ZeroDivisionError.
    """
    actual_positive = score['tp'] + score['fn']
    if actual_positive == 0:
        return 0.0
    return score['tp'] / actual_positive
            
def f1(score):
    """
    F1 measure for a dict with the keys 'tp', 'fp' and 'fn'.

    Computed straight from the counts as 2*tp / (2*tp + fp + fn), which is
    the harmonic mean of precision and recall written without the
    intermediate divisions. This avoids dividing by a zero precision or
    recall.

    Returns 0.0 when tp, fp and fn are all zero.
    """
    true_positive = score['tp']
    denominator = 2 * true_positive + score['fp'] + score['fn']
    if denominator == 0:
        return 0.0
    return 2 * true_positive / denominator

def print_score_metrics(score):
    """
    Prints the raw score dict, then one line each for accuracy, precision,
    recall and F1 computed from it.
    """
    print(score)
    report = (
        ('Accuracy:', accuracy),
        ('Precision:', precision),
        ('Recall:', recall),
        ('F1:', f1),
    )
    for label, metric in report:
        print(label, metric(score))
    
def print_suggestion_results(typos, suggestions):
    """
    Prints a full report for a set of suggestions: the detection metrics
    first, then the correction metrics for the top 1, 2 and 3 suggestions.

    typos - DataFrame passed on to detection_score and score_at_k

    suggestions - {id : [(candidate, correct_prob)]}
    """
    print('DETECTION SCORE', '\n')
    print_score_metrics(detection_score(typos, suggestions))

    headers_by_k = (
        (1, 'FIRST SUGGESTION SCORE'),
        (2, 'FIRST TWO SUGGESTIONS SCORE'),
        (3, 'FIRST THREE SUGGESTIONS SCORE'),
    )
    for k, header in headers_by_k:
        print(header, '\n')
        print_score_metrics(score_at_k(typos, suggestions, k))