## Huff, Puff & Classify
### Example Notebook

#### Package imports and setup

In [None]:
#import packages
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
from tqdm import tqdm_notebook

#import custom modules
import prep.prep_data as prep
import prep.prep_cv as cv
import model.fuzzy as fz

#IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline

#### Read in and prep the data

In [None]:
#column groups shared by the prep steps below
material_cols = ['housing_roof', 'housing_wall', 'housing_floor']
material_num_cols = ['housing_roof_num', 'housing_wall_num', 'housing_floor_num']
material_rank_cols = ['housing_roof_rank', 'housing_wall_rank', 'housing_floor_rank']

#values treated as garbage: uninformative text responses, and rank codes outside of 1-3
garbage_text = ['nan', 'other', 'not a dejure resident', 'not dejure resident']
garbage_ranks = ['4', '5', '6', '7', '8', '9', 'n']

#read in the housing data and clean the text, then subset to our gold standard MACRO DHS data
#(each call below receives its own copy of the column list, just like passing a fresh literal)
df = prep.read_then_clean('../data/housing_data.csv', list(material_cols), ['MACRO_DHS'])

#strip the garbage codes out of our string variables
df_clean = prep.remove_garbage_codes(df, list(material_cols), garbage_text)

#pull out the ranking values for roof/wall/floor, then discard the non-informative ones;
#only ranks 1-3 are kept because these correspond to our final output ranks
df_clean = prep.extract_ranking(df_clean, material_num_cols)
df_clean = prep.remove_garbage_codes(df_clean, material_rank_cols, garbage_ranks)

#### Run a bifold cross validation exercise for "housing roof"
##### Since this is an example, we will only work on 25 unknown words from each cross-validation fold

In [None]:
#setup training/test data for a bifold cross validation, using the "housing roof" variable to predict
#NOTE(review): .2, 'N' and 2 are passed through unchanged; presumably the censored share, the
#placeholder value and the number of folds -- confirm against prep.prep_cv.cv_censor_col
train_list = cv.cv_censor_col(df_clean, 'housing_roof_rank', .2, 'N', 2)

#run bifold cross validation for "housing roof"
#fuzzy_cv is reached through the `fz` alias, since model.fuzzy is only imported under that name
#subset is passed as a slice object: a bare [0:25] is a syntax error outside of a subscript
#NOTE(review): assumes fuzzy_cv applies `subset` as an index into each fold's word list -- confirm
cv_distrib, cv_preds, cv_results, cv_df = fz.fuzzy_cv(train_list, 'housing_roof',
                                                     {'natural':'1', 'rudimentary':'2', 'finished':'3'},
                                                     subset=slice(0, 25)) #only run on 25 words from each CV for speed

#### Results

In [None]:
#combine the cross-validation distributions into a single object before plotting
cv_distrib_all = pd.concat(cv_distrib)

#one matplotlib colour code per ranking category
rank_colors = {'natural':'r', 'rudimentary':'b', 'finished':'g'}

#plot results
fz.fuzzy_density(cv_distrib_all, 'word',
                 ['natural', 'rudimentary', 'finished'],
                 color_list=rank_colors,
                 cutoff=75)