#### SANDBOX for code development

In [3]:
#import packages
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import os, os.path
import re
import seaborn as sns

#import custom modules
import prep.prep_data as prep
import prep.prep_cv as cv
import model.fuzzy as fz

#magik
%matplotlib inline

In [4]:
# ---- notebook configuration -------------------------------------------------
# directories: the project root is the parent of the current working directory
CWD = os.getcwd()
HOME_DIR = os.path.abspath(os.path.join(CWD, os.pardir))
DATA_DIR = f"{HOME_DIR}/data"
DATA_FILENAME = "example_data.csv"
RESULTS_DIR = f"{HOME_DIR}/results"

# column families: free-text description, numeric code, and derived ordinal rank
STR_VARS = ['housing_roof', 'housing_wall', 'housing_floor']
NUM_VARS = [f"{name}_num" for name in STR_VARS]
RANK_VARS = [f"{name}_rank" for name in STR_VARS]

# variable to predict (currently: floor/wall/roof); the strings are always used
# to predict the matching *_rank column
DEP_VAR = "housing_roof"
PRED_VAR = f"{DEP_VAR}_rank"

# survey series to keep
SVY_FILTER = ['MACRO_DHS']

# cross-validation options
CV_SAMPLE_PCT = .2   # fraction of rows held out for testing
CV_SAMPLE_WT = "N"   # column used to weight the test sample (if any)
CV_FOLDS = 2         # number of cross-validation datasets to draw

# values treated as garbage and replaced with NaN
STR_GARBAGE = ['nan', 'other', 'not a dejure resident', 'not dejure resident']
RANK_GARBAGE = [str(code) for code in range(4, 10)] + ['n']

# map category names back to ranks
PRED_DICT = dict(natural='1', rudimentary='2', finished='3')

In [6]:
# Read the CSV, clean the free-text columns in STR_VARS, and keep only rows whose
# survey_series is listed in SVY_FILTER.
df = prep.read_then_clean(DATA_DIR + "/" + DATA_FILENAME, STR_VARS, SVY_FILTER)

~begin reading
data read!
~begin cleaning
data clean!
~applying filter


In [7]:
# 1) replace garbage strings with NaN, 2) derive the *_rank columns from the first
# character of each *_num code, 3) replace out-of-range ranks with NaN
df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE)
df_clean = prep.extract_ranking(df_clean, NUM_VARS)
df_clean = prep.remove_garbage_codes(df_clean, RANK_VARS, RANK_GARBAGE)
# build CV_FOLDS copies of the data, each with CV_SAMPLE_PCT of PRED_VAR censored to NaN
train_list = cv.cv_censor_col(df_clean, PRED_VAR, CV_SAMPLE_PCT, CV_SAMPLE_WT, CV_FOLDS)

{'nan': nan, 'other': nan, 'not a dejure resident': nan, 'not dejure resident': nan}
removing garbage from  housing_roof
removing garbage from  housing_wall
removing garbage from  housing_floor
defining ranking for  housing_roof_num
defining ranking for  housing_wall_num
defining ranking for  housing_floor_num
{'4': nan, '5': nan, '6': nan, '7': nan, '8': nan, '9': nan, 'n': nan}
removing garbage from  housing_roof_rank
removing garbage from  housing_wall_rank
removing garbage from  housing_floor_rank
sampling df, iteration # 0
sampling df, iteration # 1


In [10]:
#run your cross-validation analysis
# NOTE(review): fuzzy_cv is defined in a cell further down this notebook (execution
# count In [5]); on a fresh kernel that cell has to be run before this one.
cv_distrib, cv_preds, cv_results, cv_df = fuzzy_cv(train_list, DEP_VAR, PRED_DICT)

#output the results to csv
# NOTE(review): `out_dir` is not defined in any visible cell -- presumably RESULTS_DIR
# is intended; confirm before re-enabling these lines.
# save_results_df(cv_results, out_dir, "cv_results")
# save_results_df(cv_preds, out_dir, "cv_preds")
# save_results_df(cv_df, out_dir, "cv_df")
# save_results_df(cv_distrib, out_dir, "cv_distrib")

cv loop:   0%|          | 0/2 [00:00<?, ?it/s]

working on cv loop # 0
building corpus for rank # 1
building corpus for rank # 2
building corpus for rank # 3
extracting unknown strings
need to classify 285 unknown strings


HBox(children=(IntProgress(value=0, description='classifying unknown strings', max=285, style=ProgressStyle(de…

analyzing... cement bricks
~>corpus# 0
~>corpus# 1
~>corpus# 2
analyzing... sod mud with grass
~>corpus# 0
~>corpus# 1
~>corpus# 2
analyzing... wood planks
~>corpus# 0
~>corpus# 1
~>corpus# 2
analyzing... wood timber
~>corpus# 0





KeyboardInterrupt: 

In [None]:
#loop over all cross-validation results and plot them in chunks of 26 (4colsx6rows)
# NOTE(review): no loop is implemented here yet, and 4 cols x 6 rows is 24 panels
# rather than 26 -- confirm the intended chunk size.
# NOTE(review): cv_distrib is a list (one frame per CV dataset) as returned by
# fuzzy_cv; confirm that fuzzy_density accepts a list rather than a single frame.

#plot results
fz.fuzzy_density(cv_distrib, 'word', 
                 ['natural', 'rudimentary', 'finished'],
                 color_list={'natural':'r', 'rudimentary':'b', 'finished':'g'},
                 cutoff=75)

In [None]:
def viz_to_pdf(df, graph, graph_dir, graph_filename, graph_title):
    """Save a figure as a single-page PDF, titled with ``graph_title``.

    The previous version drew the title on a freshly created empty figure (which
    was then closed) while saving ``graph`` untouched, and stamped the file with
    the author/title/date of the matplotlib demo it was copied from. The title is
    now applied to the figure that is actually saved, and the PDF metadata
    describes this file.

    Args:
        df: Unused. Kept so existing positional calls keep working.
        graph: What to save -- a matplotlib ``Figure``, a figure number, or None
            for the current figure.
        graph_dir (str): Directory the PDF is written to.
        graph_filename (str): File name of the PDF (including extension).
        graph_title (str): Title drawn on the figure and stored as the PDF title.

    Returns:
        str: The path of the PDF that was written.
    """
    import datetime
    import os
    from matplotlib.backends.backend_pdf import PdfPages
    from matplotlib.figure import Figure
    import matplotlib.pyplot as plt

    #build filepath
    pdf_filepath = os.path.join(graph_dir, graph_filename)

    # resolve `graph` to the Figure object that will be saved
    if isinstance(graph, Figure):
        fig = graph
    elif graph is None:
        fig = plt.gcf()
    else:
        fig = plt.figure(graph)  # treat anything else as a figure number

    fig.suptitle(graph_title)

    # The with statement makes sure that the PdfPages object is closed properly at
    # the end of the block, even if an Exception occurs.
    with PdfPages(pdf_filepath) as pdf:
        pdf.savefig(fig)

        # file metadata
        now = datetime.datetime.today()
        info = pdf.infodict()
        info['Title'] = graph_title
        info['CreationDate'] = now
        info['ModDate'] = now

    return pdf_filepath

In [5]:
#%%file ./model/cv.py

def fuzzy_cv(cv_list, base_var, rank_dictionary, subset=None, threshold=75, jupyter=False):
    """Run the fuzzy classification pipeline over a list of cross-validation datasets.

    For each dataset the known strings are grouped into one corpus per rank, the
    strings without a known rank are scored against every corpus, a rank is
    predicted for each of them, and the predictions are merged onto the held-out
    rows (``train == 0``) and compared with the archived true rank.

    Args:
        cv_list (list of DataFrame): Datasets to evaluate. Each needs the columns
            ``base_var``, ``base_var + '_rank'``, ``base_var + '_rank_og'`` and
            ``train``.
        base_var (str): Name of the string column to classify.
        rank_dictionary (dict): Maps category name to rank value; the values are
            the ranks a corpus is built for, the keys are the score column names.
        subset: Optional index (slice, index list, mask) applied to the array of
            unknown strings to shorten a test run. Default None scores them all.
        threshold (numeric): Similarity cutoff passed to ``fuzzy_predict``.
        jupyter (bool): Use the notebook progress bar for the outer loop.

    Returns:
        tuple: ``(cv_distrib, cv_preds, cv_results, cv_df)`` -- four lists with one
        entry per dataset: score distributions, predictions, a crosstab of the
        success flag, and the merged held-out rows.
    """
    #import packages
    import sys
    import numpy as np
    import pandas as pd

    if jupyter:
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    #import custom modules (only extend sys.path once, not on every call)
    module_dir = '../hp_classify'
    if module_dir not in sys.path:
        sys.path.append(module_dir)
    import model.fuzzy as fz

    #setup objects
    rank_var = base_var + '_rank'
    og_var = rank_var + '_og'

    rank_values = list(rank_dictionary.values())
    rank_keys = list(rank_dictionary.keys())

    #create lists to store loop outputs
    cv_distrib = []
    cv_preds = []
    cv_results = []
    cv_df = []

    #loop over each cross validation:
    for i, cv_frame in enumerate(tqdm(cv_list, desc="cv loop")):

        print('working on cv loop #', i)
        df = cv_frame.copy() #work on a copy of the current df

        #build corpus of known and unknown strings
        str_list, idk_strings = fz.build_corpus(df, base_var, rank_var, rank_values)

        #subset the unknown strings to allow for faster testing
        #(identity test: `subset != None` is elementwise for arrays and cannot be used in `if`)
        if subset is not None:
            idk_strings = idk_strings[subset]

        #find distribution of scores for each string
        distrib = fz.fuzzy_scan(idk_strings, str_list)

        #TODO, output plots of distribution for analysis

        #predict class based on probability of exceeding similarity cutoff
        preds = fz.fuzzy_predict(distrib, rank_keys, 'word', threshold,
                                 rank_dictionary)

        #merge results back on the test data to validate
        held_out = df[df['train'] == 0]
        out = pd.merge(held_out,
                       preds,
                       left_on=base_var,
                       right_on='word',
                       how='left')

        #calculate success rate and tabulate (rows without a prediction are excluded)
        out['success'] = np.where(out[og_var] == out['pred'], 1, 0)
        success_rate = pd.crosstab(out[~pd.isnull(out['pred'])]['success'], columns='count')

        #append results to prep for next loop
        cv_distrib.append(distrib)
        cv_preds.append(preds)
        cv_results.append(success_rate)
        cv_df.append(out)

    return(cv_distrib, cv_preds, cv_results, cv_df)


def save_results_df(df, out_dir, out_name):
    """Concatenate a list of result frames and write them to ``<out_dir>/<out_name>.csv``.

    pandas is imported locally, like in the other functions of this project, so
    the function also works when it is imported from the written module rather
    than run in a notebook where ``pd`` happens to be defined.

    Args:
        df (list of DataFrame): Frames to concatenate row-wise.
        out_dir (str): Directory to write to.
        out_name (str): File name without the ``.csv`` extension.

    Returns:
        str: Path of the file that was written. The file is ';'-separated and is
        written without a header row.
    """
    import os
    import pandas as pd

    out_path = os.path.join(out_dir, f'{out_name}.csv')
    print('saving df to', out_path)

    combined = pd.concat(df)
    combined.to_csv(out_path, header=False, sep=';')

    return(out_path)

Overwriting ./model/cv.py


In [8]:
#%%file ./model/fuzzy.py

#define function to replace meaningless values with NaNs
# def extract_ranking(df, vars_to_clean):
#     """This helper function is used to 

#     Args:
#     df (pandas df): This is a pandas df that has 
#     dep_var (str): This is the name of a column

#     Returns:
#         df_out: 
        
#     TODO: ?

#     """
        
#     df_out = df.copy()

#     #output a clean dataset
#     return 

def build_corpus(df, str_var, rank_var, rank_list):
    """Split the strings of one column into per-rank corpora plus the unknown strings.

    Args:
        df (DataFrame): Data holding a string column and its rank column.
        str_var (str): Name of the column with the string descriptions.
        rank_var (str): Name of the column with the rank of each row.
        rank_list (list): Rank values to build a corpus for, in output order.

    Returns:
        tuple: ``(out, other)`` where ``out`` is a list with one array per entry of
        ``rank_list`` holding the ``str_var`` values of the rows with that rank
        (repeated values are kept), and ``other`` is an array of the unique,
        non-null ``str_var`` values of the rows whose rank is not in ``rank_list``.
    """
    #import necessary modules
    import pandas as pd

    out = []

    for x in rank_list:
        print("building corpus for rank #", x)
        out.append(df[df[rank_var]==x][str_var].values)

    print("extracting unknown strings")
    other = df[~df[rank_var].isin(rank_list)][str_var].unique()
    other = other[~pd.isnull(other)] #cant classify NaN
    print("need to classify", len(other), "unknown strings")

    return(out, other)

def fuzzy_scan(unknown_list, corpus_list):
    """Score every unknown string against every known string of three corpora.

    Each unknown string is compared with ``fuzz.WRatio`` to each string of the
    three corpora, giving one distribution of similarity scores per corpus.

    Args:
        unknown_list (sequence of str): Strings whose rank is unknown.
        corpus_list (list of sequences of str): Exactly three corpora of known
            strings, in the order natural, rudimentary, finished.

    Returns:
        DataFrame: Columns ``word``, ``natural``, ``rudimentary``, ``finished``.
        Each unknown string contributes as many rows as the longest corpus has
        entries; the score columns of shorter corpora are padded with NaN. When
        ``unknown_list`` is empty an empty frame with these columns is returned
        (previously ``pd.concat`` raised on the empty list).
    """
    #import necessary modules
    import pandas as pd

    #nothing to classify: return an empty frame instead of failing in pd.concat
    if len(unknown_list) == 0:
        return pd.DataFrame(columns=['word', 'natural', 'rudimentary', 'finished'])

    from fuzzywuzzy import fuzz
    from tqdm import tqdm_notebook

    distrib = []

    #loop over each unknown string
    for unknown_str in tqdm_notebook(unknown_list, desc="classifying unknown strings", leave=False):
        print('analyzing...', unknown_str)

        out = []
        #loop over each corpus to compute similarity scores for all words in a given housing quality score
        for y, corpus in enumerate(corpus_list):
            print('~>corpus#', y)

            #similarity score of the unknown string against each word of this corpus
            out.append([fuzz.WRatio(unknown_str, known_str) for known_str in corpus])

        #append distributions of scores
        distrib.append(pd.DataFrame({'word': unknown_str,
                                     'natural': pd.Series(out[0]),
                                     'rudimentary': pd.Series(out[1]),
                                     'finished': pd.Series(out[2]) #note series method used to overcome differing lengths
                                    }))

    return(pd.concat(distrib))

def fuzzy_predict(df, var_list, grouping, cutoff, dictionary):
    """Predict a rank for each unknown string from its similarity-score distributions.

    For every group (unknown string) and every category column, the share of
    scores strictly above ``cutoff`` is computed; the category with the largest
    share is the prediction.

    The share is taken over the scores that exist for that category
    (``count()``), not over the number of rows in the group. The score columns
    are NaN-padded to the length of the largest corpus, so dividing by the row
    count deflated the shares of the smaller corpora.

    Args:
        df (DataFrame): Score distributions, one column per category plus the
            ``grouping`` column.
        var_list (list of str): Category (score) column names.
        grouping: Column(s) to group the scores on.
        cutoff (numeric): Scores must exceed this value to count.
        dictionary (dict): Maps category name to the rank value to report.

    Returns:
        DataFrame: Indexed by group, with one share column per category and a
        ``pred`` column holding the mapped rank of the category with the highest
        share. Ties (including all-zero rows) resolve to the first category in
        ``var_list``; a category without any scores gets share 0.
    """

    #calculate the probability that a classification score exceeds cutoff
    out = df.groupby(grouping)[var_list].apply(lambda c: (c > cutoff).sum() / c.count())
    out = out.fillna(0) #0/0 for a category that has no scores at all

    #return column w/ max value and map to rank with dictionary
    out['pred'] = out[var_list].idxmax(axis=1).map(dictionary)

    return(out)

def fuzzy_transform(df, var_list, grouping, fx, stub):
    """Add, for each column, its value divided by a group-level aggregate of that column.

    Args:
        df (DataFrame): Input data; it is not modified.
        var_list (list of str): Columns to transform.
        grouping: Column(s) to group on when computing the aggregate.
        fx: Aggregation passed to ``GroupBy.transform`` (e.g. ``'sum'``).
        stub (str): Suffix appended to each column name to name the new column.

    Returns:
        DataFrame: A copy of ``df`` with one extra ``<var><stub>`` column per
        entry of ``var_list``.
    """
    for var in var_list:

        print('calculating prob for...', var)

        group_total = df.groupby(grouping)[var].transform(fx)
        df = df.assign(**{var + stub: df[var] / group_total})

    return(df)

Overwriting ./model/fuzzy.py


In [None]:
#%%file ./prep/prep_cv.py

#define necessary helper functions
def cv_censor_col(df, colname, pct=.2, weight_var=None, reps=5, random_state=None):
    """Create cross-validation datasets in which a sample of one column is censored.

    Each dataset is a copy of ``df`` with two extra columns: ``<colname>_og``
    (the untouched original values) and ``train`` (1 = training row, 0 = held-out
    row). For the held-out rows ``colname`` is set to NaN so it can be predicted.

    The held-out rows are marked directly from the sampled index. The previous
    sentinel-string / ``DataFrame.update`` / chained in-place ``replace`` sequence
    does not modify the frame when pandas Copy-on-Write is active.

    Args:
        df (pandas df): Input data; it is not modified.
        colname (str): Name of the column to censor and later predict.
        pct (float): Fraction (0-1) of rows to hold out. Default = 20%
        weight_var (str): Column name used to weight the sample. Default = no weight.
        reps (int): Number of datasets to create. Default = 5
        random_state (int or numpy RandomState): Optional seed/generator making
            the samples reproducible. One generator is shared by all reps, so the
            reps still differ from each other. Default = unseeded.

    Returns:
        list: ``reps`` pandas dfs, each with ``pct`` of ``colname`` censored.
    """

    #import packages
    import numpy as np

    #build one generator up front so that every rep draws a different sample
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    out = []

    for x in range(reps):

        print("sampling df, iteration #", x)

        #first archive your old column in order to test later
        new_df = df.copy()
        new_df[colname + '_og'] = new_df[colname]

        #draw the held-out rows (weights=None is an unweighted sample)
        held_out = new_df.sample(frac=pct, weights=weight_var, random_state=rng).index
        is_test = new_df.index.isin(held_out)

        #flag training vs test rows and censor the column on the test rows only
        new_df['train'] = np.where(is_test, 0, 1)
        new_df[colname] = new_df[colname].mask(is_test)

        #store the result (df with columns censored)
        out.append(new_df)

    #return the list of sampled dfs
    return(out)

In [None]:
#%%file ./prep/prep_data.py
#define necessary helper functions
def clean_text(text):
    """Normalise one text value for matching.

    The value is converted to a lowercase string, known encoding artefacts,
    numbered-list prefixes, backslashes and quotes are deleted, punctuation is
    turned into spaces, remaining digits are dropped and whitespace is collapsed.

    Args:
        text (str): The value to clean; non-strings are converted with ``str``.

    Returns:
        text: The cleaned version of the input text.

    TODO: Add functionality to impute a selected value for NaN or missing values?

    """
    import re

    # force to string and remove uppercase
    cleaned = str(text).lower()

    # patterns deleted outright; they are applied one after another in this order,
    # so a removal can expose a match for a later pattern
    deletions = (
        r"\[.]",      # bracketed single character
        r"\<ff>",
        r"\<fb>",
        r"\<a\d>",
        r"\<c\d>",
        r"\<d\d>",
        r"\<e\d>",
        r"\<f\d>",
        r"\d+\.",     # digits followed by a period
        r"\\",        # the characters [\], ['] and ["]
        r"\'",
        r"\"",
    )
    for pattern in deletions:
        cleaned = re.sub(pattern, "", cleaned)

    # replace punctuation characters with spaces
    filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    cleaned = cleaned.translate(str.maketrans(filters, " " * len(filters)))

    # remove any remaining digit codes
    cleaned = re.sub(r"\d+", "", cleaned)

    # remove any leading/trailing/duplicate whitespace
    return re.sub(' +', ' ', cleaned.strip())
    
#define master function
def read_then_clean(file_path, vars_to_clean, filter_series=None):
    """Read a .csv file, clean the requested text columns and optionally filter rows.

    This is the master function for this module: every column named in
    ``vars_to_clean`` is passed value by value through ``clean_text``, and the
    result can be restricted to selected survey series.

    Args:
        file_path (str): This is a string indicating which file that you want to read in.
        vars_to_clean (list): This is a list of strings that indicate which columns you want to clean.
        filter_series (list): Survey series to keep, matched against the
            ``survey_series`` column. Any list-like (list, array, Series) works;
            None (default) keeps every row.

    Returns:
        df_clean: This is a pandas df that has columns of text values that have been cleaned using the helper function.

    Raises:
        RowCountException: If the cleaned frame has fewer rows than the file that was read.

    TODO: Is it better to return an obj called df_clean to be more explicit to user?

    """
    #import necessary modules
    import pandas as pd

    class RowCountException(Exception):
        """Custom exception class.

        This exception is raised when the minimum row is unmet.

        """
        pass

    #read in your data
    print("~begin reading")
    df_raw = pd.read_csv(file_path, low_memory=False)
    min_nrow = len(df_raw) #save the row count to test after cleaning and verify that rows are not being dropped
    print("data read!")

    #cleanup
    print("~begin cleaning")
    df_clean = df_raw.copy()
    for var in vars_to_clean:
        df_clean[var] = df_clean[var].apply(clean_text)
    print("data clean!")

    # Verify that the minimum rowcount continues to be met
    if len(df_clean) < min_nrow:
        raise RowCountException("Minimum number of rows were not returned after cleaning. Data is being lost!")

    # Filter data if filter arguments are provided by user
    # (identity test: `!= None` is elementwise for arrays/Series and cannot be used in `if`)
    if filter_series is not None:
        print("~applying filter")
        df_clean = df_clean[df_clean['survey_series'].isin(filter_series)]

    #output a clean dataset
    return df_clean

#define function to replace meaningless values with NaNs
def remove_garbage_codes(df, vars_to_clean, garbage_list):
    """This helper function is used to remove garbage values from a pandas df, replacing them with NaN.

    The replaced column is assigned back to the frame. The previous
    ``df_clean[var].replace(..., inplace=True)`` acted on a column selection,
    which leaves the frame unchanged when pandas Copy-on-Write is active.

    Args:
        df (pandas df): This is a pandas df that has columns with garbage values to be removed. It is not modified.
        vars_to_clean (list): This is a list of strings that indicate which columns you want to clean.
        garbage_list (list): This is a list of strings that indicate which garbage values to replace with NaN

    Returns:
        df_clean: This function returns a pandas df where the garbage codes have been replaced with NaN.

    TODO: set up an inverse argument so you can have opt to pass acceptable codes and NaN all others

    """

    #import necessary modules
    import numpy as np

    df_clean = df.copy()

    # build dictionary to map all garbage values to NaN
    garb_dict = {string: np.nan for string in garbage_list}

    print(garb_dict)

    for var in vars_to_clean:
        print("removing garbage from ", var)
        df_clean[var] = df_clean[var].replace(garb_dict)

    #output a clean dataset
    return df_clean

#define function to replace meaningless values with NaNs
def extract_ranking(df, vars_to_clean):
    """Derive ordinal rank columns from numerical code columns.

    For each listed column a new column is added whose name has ``_num`` replaced
    by ``_rank`` and whose value is the first character of the code's string form.

    Args:
        df (pandas df): Data holding the numerical code columns. It is not modified.
        vars_to_clean (list): Names of the columns to extract ranks from.

    Returns:
        df_out: A copy of the input with the rank columns added.

    """
    ranked = df.copy()

    for code_col in vars_to_clean:
        print("defining ranking for ", code_col)
        rank_col = code_col.replace("_num", "_rank")
        code_as_text = ranked[code_col].astype(str)
        ranked[rank_col] = code_as_text.str.get(0)

    #output the frame with the rank columns added
    return ranked

In [110]:
%%file ./tests/test_prep.py
#write tests
"""This is a module used to test a module: "prep.py" and its relevant functions read_then_clean and clean_text

read_then_clean is a function that takes a csv with messy string values and 
creates then cleans a pandas df
using clean_text

This module tests that function by ensuring that it returns expected exceptions and
does not contain unexpected values.

This module also uses the opportunity of having the df loaded to tests the 
functions later in the data cleaning pipeline, including 
remove_garbage_codes, which removes unacceptable values and replaces them with NaN
and extract_ranking, which generates the ordinal ranking variable from an input numerical code
"""
# import packages
import pytest
import pandas as pd
import re

#import custom modules fpr testing
import sys 
sys.path.append('.')
import prep.prep_data as prep

#set globals for tests
FILEPATH = '../data/housing_data.csv'
CLEAN_COLS = ['housing_roof', 'housing_wall', 'housing_floor']

#strings that deliberately contain what clean_text is expected to strip
DIGITS = str([str(x) for x in range(100 + 1)])
PUNCT = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
SPACE = '     '

# if you compile the regex string first, it's even faster
# (raw strings: '\d' and '\W' are invalid escape sequences in a plain string literal)
re_dig = re.compile(r'\d')
re_punct = re.compile(r'\W+')
re_white = re.compile(' +')

STR_VARS = ['housing_roof', 'housing_wall', 'housing_floor']
NUM_VARS = [s + '_num' for s in STR_VARS]
RANK_VARS = [s + '_rank' for s in STR_VARS]

STR_GARBAGE = ['nan', 'other', 'not a dejure resident', 'not dejure resident']
RANK_GARBAGE = ['4', '5', '6', '7', '8', '9', 'n']

#read in the df using our function in order to pass to later tests
#read in df using your function and then using pandas regular csv read, then compare the resulting dfs
df = prep.read_then_clean(FILEPATH, CLEAN_COLS)
raw_csv = pd.read_csv(FILEPATH)

#also pass it through the rest of the cleaning pipeline in order to compare df to df_clean
df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE)
df_clean = prep.extract_ranking(df_clean, NUM_VARS)
df_clean = prep.remove_garbage_codes(df_clean, RANK_VARS, RANK_GARBAGE)

def test_globals():
    """Check that each test global really contains what clean_text should strip."""
    #the digits fixture must contain digits
    assert re_dig.search(DIGITS) is not None, "global doesn't contain digits!"
    #the punctuation fixture must contain punctuation
    assert re_punct.search(PUNCT) is not None, "global doesn't contain punctuation!"
    #the whitespace fixture must contain spaces
    assert re_white.search(SPACE) is not None, "global doesn't contain whitespace!"
    

def test_clean_text():
    """Check that clean_text strips digits, punctuation and whitespace from the fixtures."""
    #no digit may survive cleaning
    cleaned_digits = prep.clean_text(DIGITS)
    assert re_dig.search(cleaned_digits) is None, "clean_text did not remove the digits from test global."
    #no punctuation may survive cleaning
    cleaned_punct = prep.clean_text(PUNCT)
    assert re_punct.search(cleaned_punct) is None, "clean_text did not remove the punctuation from test global."
    #no spaces may survive cleaning of an all-space string
    cleaned_space = prep.clean_text(SPACE)
    assert re_white.search(cleaned_space) is None, "clean_text did not remove the whitespace from test global."

def test_read_then_clean():
    """Check that read_then_clean keeps the frame's shape and changes the string columns."""
    #the cleaned frame must have as many rows and columns as the raw csv
    assert len(raw_csv) == len(df), "read_then_clean function is modifying the original csv's length"
    assert len(df.columns) == len(raw_csv.columns), "read_then_clean function is modifying the original csv's width"

    #the set of values in every cleaned column must differ from the raw one
    #TODO: this test will fail if the columns were entirely clean to begin with (is this possible?)
    for col in CLEAN_COLS:
        cleaned_values = set(df[col].unique())
        raw_values = set(raw_csv[col].unique())
        assert cleaned_values != raw_values, "string columns are unmodified"

def test_cleaning_pipeline():
    """Check that the pipeline adds the rank columns and removes the garbage values."""
    #rank columns must be created by the pipeline, not come from the raw data
    for rank_col in RANK_VARS:
        assert rank_col not in df, "rank column present in raw data"
        assert rank_col in df_clean, "rank column was not added by extract_ranking fx"

    #no garbage value may remain in any cleaned string column
    for str_col in STR_VARS:
        for garbage in STR_GARBAGE:
            print(str_col, garbage)
            remaining = df_clean[str_col].unique()
            assert garbage not in remaining, "garbage values not removed from clean dataframe"

Overwriting ./tests/test_prep.py


In [217]:
%%file ./tests/test_model.py
#write tests
"""This is a module used to test a module: "model.py" and its main functions, including build_corpus, fuzzy_scan,
and fuzzy_predict.

build_corpus is used to define corpora of words associated with a given ranking and also a corpus of unknown words.
Here, this functionality is tested to ensure that the corpora returned contain only words that are actually in the
specified rows and columns of the pandas df they were pulled from.

fuzzy_scan is a function that takes an unknown word and scans it for similarity against a list of known words that
subdivided by class. A distribution of values is returned that can be used to predict which class is most probable
for the unknown word. Here, several aspects of this functionality are tested, including xxx

fuzzy_predict is a function that takes a distribution of values for each class and predicts which class an unknown
word is most likely to be based on a given similarity threshold. Here, this function is tested by yyy


"""
# import packages
import pytest
import pandas as pd
import re
import numpy as np

#import custom modules fpr testing
import sys 
sys.path.append('.')
import prep.prep_data as prep
import model.fuzzy as fz

#set globals for tests
FILEPATH = '../data/example_data.csv'
CLEAN_COLS = ['housing_roof', 'housing_wall', 'housing_floor']

# NOTE(review): DIGITS, PUNCT and SPACE are not referenced by the tests visible in
# this module -- presumably copied from the prep tests; confirm before removing.
DIGITS = str([str(x) for x in range(100 + 1)])
PUNCT = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
SPACE = '     '

# column families: free-text description, numeric code, derived ordinal rank
STR_VARS = ['housing_roof', 'housing_wall', 'housing_floor']
NUM_VARS = [s + '_num' for s in STR_VARS]
RANK_VARS = [s + '_rank' for s in STR_VARS]

# values replaced with NaN by remove_garbage_codes
STR_GARBAGE = ['nan', 'other', 'not a dejure resident', 'not dejure resident']
RANK_GARBAGE = ['4', '5', '6', '7', '8', '9', 'n']

# rank values a corpus is built for (strings, matching the extracted rank columns)
RANK_LIST = ['1', '2', '3']

#read in example data using your function and then pass it through the cleaning pipeline
# (runs once at import time, so every test in this module shares df and df_clean)
df = prep.read_then_clean(FILEPATH, CLEAN_COLS)
df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE)
df_clean = prep.extract_ranking(df_clean, NUM_VARS)
df_clean = prep.remove_garbage_codes(df_clean, RANK_VARS, RANK_GARBAGE)

def test_build_corpus():
    """Check that build_corpus only returns strings taken from the right rows and column.

    For every string column the unknown strings must occur in that column of the
    cleaned frame, and a random draw of five strings from each rank's corpus must
    occur in the rows of the frame that carry that rank.
    """
    import numpy as np

    for str_col in STR_VARS:

        rank_var = str_col + "_rank"
        str_list, idk_strings = fz.build_corpus(df_clean, str_col, rank_var, RANK_LIST)

        #every unknown string must exist in the input column
        for unknown in idk_strings:
            assert unknown in df_clean[str_col].unique()

        #every sampled known string must exist in the rows of its own rank
        for rank_num, rank in enumerate(RANK_LIST):
            #only pull 5 random strings and test for speed purposes
            for known in np.random.choice(str_list[rank_num], 5):
                rows_of_rank = df_clean[df_clean[rank_var] == rank]
                assert known in rows_of_rank[str_col].unique()
    
def test_fuzzy():
    """Run the fuzzy ranking pipeline end to end on a simulated dataframe.

    This function tests the series of functions that are used to predict the
    unknown ranking of string values using a training dataset in which the
    rankings are known for other string values. Corpora for each ranking are
    compiled and then the unknown values are compared against these in order to
    predict the most likely ranking.

    The simulated dataframe holds six labelled words (two per rank 1-3) and one
    unlabelled word, 'brickz', whose true rank (3) is kept in 'piggy_rank_og'.
    The dataframe is followed through fz.build_corpus, fz.fuzzy_scan and
    fz.fuzzy_predict; the shape of each intermediate result is asserted and the
    final prediction is expected to match the held-out rank exactly.
    """
    df_sim = pd.DataFrame({ 'piggy' : pd.Series(['straw', 'straws', 'stick', 'sticks', 'brick', 'bricks', 'brickz']),
                            'piggy_rank' : [1, 1, 2, 2, 3, 3, np.nan],
                            'piggy_rank_og' : [1, 1, 2, 2, 3, 3, 3],
                            'train' : [1, 1, 1, 1, 1, 1, 0]})
    sim_rank_list = [1,2,3] #save a list with the expected rank levels in your simulated df
    rank_dictionary = {'natural':1, 'rudimentary':2, 'finished':3}
    rank_keys = list(rank_dictionary.keys())

    #build a corpus based on the simulated dataset
    str_list, idk_strings = fz.build_corpus(df_sim, 'piggy', 'piggy_rank', sim_rank_list)

    #only the row with a missing rank should be reported as unknown
    assert len(idk_strings) == 1, "expected exactly one unknown string in the simulated data"

    #find distribution of scores for each string
    distrib = fz.fuzzy_scan(idk_strings, str_list)

    #the length of the output df should be equal to the length of the longest corpora
    assert len(distrib) == len(max(str_list, key=len)), "the output distribution df is not the correct length"

    #the output df should have a # of columns that equals # of input rank categories + 1
    assert len(distrib.columns) == len(sim_rank_list)+1, "the output distribution df is not the correct width"

    #the output df should have a column called word that contains only the values in idk_strings
    #(compare as sets: testing an array with `in` against a list is only well defined for one element)
    assert set(distrib.word.unique()) <= set(idk_strings), "the word column contains values that are not unknown strings"

    #predict class based on probability of exceeding similarity cutoff of 75
    preds = fz.fuzzy_predict(distrib, rank_keys, 'word', 75, rank_dictionary)

    #the length of the prediction df should be equal to the length of the unknown words corpus
    assert len(preds) == len(idk_strings), "the output prediction df is not the correct length"

    #the prediction df should have # of columns that equals # of input rank categories + 1
    assert len(preds.columns) == len(sim_rank_list)+1, "the output prediction df is not the correct width"

    #the prediction df should contain a column called "pred"
    assert ("pred" in preds.columns), "prediction column not being generated"

    #merge results back on the test data to validate
    out = df_sim[df_sim['train']==0]
    out = pd.merge(out,
                   preds,
                   left_on='piggy',
                   right_on='word',
                   how='left')

    #assert that the prediction was accurate, as expected
    assert np.allclose(out['piggy_rank_og'], out['pred'])

Overwriting ./tests/test_model.py


In [216]:
#sandbox walk-through of the fuzzy pipeline on a simulated dataframe:
#six labelled words (two per rank) plus one unlabelled word whose true rank is kept in piggy_rank_og
df_sim = pd.DataFrame({ 'piggy' : pd.Series(['straw', 'straws', 'stick', 'sticks', 'brick', 'bricks', 'brickz']),
                        'piggy_rank' : [1, 1, 2, 2, 3, 3, np.nan],
                        'piggy_rank_og' : [1, 1, 2, 2, 3, 3, 3],
                        'train' : [1, 1, 1, 1, 1, 1, 0]})
sim_rank_list = [1,2,3] #save a list with the expected rank levels in your simulated df
rank_dictionary = {'natural':1, 'rudimentary':2, 'finished':3}
rank_values = list(rank_dictionary.values())
rank_keys = list(rank_dictionary.keys())

#build a corpus based on the simulated dataset
str_list, idk_strings = fz.build_corpus(df_sim, 'piggy', 'piggy_rank', sim_rank_list)

assert len(idk_strings) == 1

#find distribution of scores for each string
distrib = fz.fuzzy_scan(idk_strings, str_list)

#the length of the output df should be equal to the length of the longest corpora
assert len(distrib) == len(max(str_list, key=len)), "the output distribution df is not the correct length"

#the output df should have a # of columns that equals # of input rank categories + 1
#(use the list defined in this cell so the cell also runs on a fresh kernel)
assert len(distrib.columns) == len(sim_rank_list)+1, "the output distribution df is not the correct width"

#the output df should have a column called word that contains only the values in idk_strings
#(compare as sets: testing an array with `in` against a list is only well defined for one element)
assert set(distrib.word.unique()) <= set(idk_strings)

#predict class based on probability of exceeding similarity cutoff of 75
preds = fz.fuzzy_predict(distrib, rank_keys, 'word', 75, rank_dictionary)

#the length of the prediction df should be equal to the length of the unknown words corpus
assert len(preds) == len(idk_strings), "the output prediction df is not the correct length"

#the prediction df should have # of columns that equals # of input rank categories + 1
assert len(preds.columns) == len(sim_rank_list)+1, "the output prediction df is not the correct width"

#the prediction df should contain a column called "pred"
assert ("pred" in preds.columns), "prediction column not being generated"

#merge results back on the test data to validate
out = df_sim[df_sim['train']==0]
out = pd.merge(out,
               preds,
               left_on='piggy',
               right_on='word',
               how='left')

#assert that the prediction was accurate, as expected
assert np.allclose(out['piggy_rank_og'], out['pred'])

                                                                  

building corpus for rank # 1
building corpus for rank # 2
building corpus for rank # 3
extracting unknown strings
need to classify 1 unknown strings
analyzing... brickz
~>corpus# 0
~>corpus# 1
~>corpus# 2




In [215]:
#inspect the column labels of the prediction frame returned by fz.fuzzy_predict
preds.columns

Index(['natural', 'rudimentary', 'finished', 'pred'], dtype='object')