## Huff, Puff & Classify
### Example Notebook: Fuzzy String Classification

#### Package imports and setup

In [None]:
#import packages
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import os, os.path
import re
import seaborn as sns
from tqdm import tqdm_notebook

#import custom modules
import prep.prep_data as prep
import prep.prep_cv as cv
import model.fuzzy as fz

#IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline

#### User Globals
#TODO: Add instructions for user

In [None]:
#setup globals
#setup directories
#HOME_DIR is the parent of the current working directory; the data and results
#directories are built directly beneath it with os.path.join so that the
#separator is never hard-coded
CWD = os.getcwd()
HOME_DIR = os.path.abspath(os.path.join(CWD, os.pardir))
DATA_DIR = os.path.join(HOME_DIR, "data")
DATA_FILENAME = "housing_data.csv"
RESULTS_DIR = os.path.join(HOME_DIR, "results")

#setup lists of vars to work with
#NUM_VARS and RANK_VARS are the STR_VARS names with a '_num' / '_rank' suffix
STR_VARS = ['housing_roof', 'housing_wall', 'housing_floor']
NUM_VARS = [s + '_num' for s in STR_VARS]
RANK_VARS = [s + '_rank' for s in STR_VARS]

#which variable do you want to predict (currently: floor/wall/roof)
PRED_VAR = "housing_roof"
PRED_VAR = PRED_VAR + "_rank" #will always be using the strings to predict ranking

#setup a filter to select which surveys you want to work with
SVY_FILTER = ['MACRO_DHS']

#analytical options
CV_SAMPLE_PCT = .2 #hold out x% for testing
CV_SAMPLE_WT = "N" #which variable(if any) shall weight your test sample
CV_FOLDS = 3 #use a x-fold cross-validation env

#garbage lists
#values listed here are passed to prep.remove_garbage_codes for the string and rank variables
STR_GARBAGE = ['nan', 'other', 'not a dejure resident', 'not dejure resident']
RANK_GARBAGE = ['4', '5', '6', '7', '8', '9', 'n']

#dictionaries
PRED_DICT = {'natural':'1', 'rudimentary':'2', 'finished':'3'} #map categories back to ranks

#### Read in and prep the data

In [None]:
#load the data file and clean its text, subsetting to our gold standard MACRO DHS data
df = prep.read_then_clean("/".join([DATA_DIR, DATA_FILENAME]), STR_VARS, SVY_FILTER)

#drop the garbage codes listed in STR_GARBAGE from the string variables
df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE)

#pull out the ranking values for roof/wall/floor and strip the non-informative ones
#only ranks 1-3 are kept, since those correspond to our final output ranks
df_clean = prep.remove_garbage_codes(
    prep.extract_ranking(df_clean, NUM_VARS),
    RANK_VARS,
    RANK_GARBAGE,
)

#### Run a k-fold cross validation exercise for "housing roof" (k is set by CV_FOLDS, currently 3)
##### Since this is an example, we will only work on 25 unknown words from each cross-validation

In [None]:
#setup training/test data for a CV_FOLDS-fold cross validation, using the "housing roof" variable to predict
train_list = cv.cv_censor_col(df_clean, PRED_VAR, CV_SAMPLE_PCT, CV_SAMPLE_WT, CV_FOLDS)

#run your cross-validation analysis for "housing roof"
#NOTE(review): fuzzy_cv is assumed to live in model.fuzzy (imported as fz), next to fuzzy_density - confirm
#a bare `[0:25]` is only valid inside an indexing expression, so the range is passed as a slice object
#NOTE(review): assumes fuzzy_cv uses `subset` as an index (e.g. words[subset]) - confirm
cv_distrib, cv_preds, cv_results, cv_df = fz.fuzzy_cv(train_list, PRED_VAR, PRED_DICT,
                                                      subset=slice(0, 25)) #only run on 25 words from each CV for speed

#output the results to csv, under the RESULTS_DIR set in the user globals
#NOTE(review): save_results_df is not imported in this notebook - confirm which module provides it
save_results_df(cv_results, RESULTS_DIR, "cv_results")
save_results_df(cv_preds, RESULTS_DIR, "cv_preds")
save_results_df(cv_df, RESULTS_DIR, "cv_df")
save_results_df(cv_distrib, RESULTS_DIR, "cv_distrib")

#### Results

In [None]:
#draw the fuzzy density plot from the cross-validation distributions
fz.fuzzy_density(
    pd.concat(cv_distrib), #stack the cv_distrib pieces into one frame
    'word',
    ['natural', 'rudimentary', 'finished'],
    color_list={'natural':'r', 'rudimentary':'b', 'finished':'g'}, #one colour per category
    cutoff=75
)