# Poverty Estimation

## Setup

In [56]:
# Survey processed by this notebook. The cells below branch on the values
# "OPM" (year filter, target variable, estimation types) and "DHS" (target variable).
SURVEY_NAME = "OPM"

In [57]:
## Libraries
import os, datetime
import numpy as np
import pandas as pd
#import geopandas as gpd
import json
#import rasterio
#from rasterio.plot import show
import matplotlib.pyplot as plt

from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, normalize
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import Ridge

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier,
                              AdaBoostRegressor,
                              GradientBoostingClassifier, RandomForestClassifier,
                              RandomForestRegressor,
                             BaggingRegressor, GradientBoostingRegressor)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, classification_report,
                            r2_score, mean_absolute_error, mean_squared_error)

from joblib import dump, load

import logging, os 

import grid_params as grids
import config as cf

In [58]:
# Directory that receives every results / predictions CSV written by this notebook.
OUT_DIR = os.path.join(cf.DROPBOX_DIRECTORY, 'data', SURVEY_NAME, 'FinalData', 'pov_estimation_results')

## Load Data

In [59]:
# Load the cleaned, merged survey dataset for the selected survey.
df = pd.read_csv(os.path.join(cf.DROPBOX_DIRECTORY, 'data', SURVEY_NAME, "FinalData", "Merged Datasets", "survey_alldata_clean.csv"))

In [60]:
# For the OPM survey, keep only the rows whose year is 2014.
if SURVEY_NAME == "OPM":
    df = df.loc[df["year"] == 2014]

In [61]:
# Preview the first rows of the loaded data; the commented-out lines below are
# disabled country filters.
df.head()
#df = df[df.country_code != 'IA']
#df = df[df.country_code != 'TL']

Unnamed: 0,uid,urban_rural,year,survey_round,GID_3,GID_2,GID_1,NAME_3,NAME_2,NAME_1,...,fb_wp_estimate_mau_24,fb_wp_estimate_mau_25,fb_wp_estimate_mau_26,fb_wp_estimate_mau_27,fb_wp_estimate_mau_28,fb_wp_estimate_mau_29,fb_wp_estimate_mau_30,fb_wp_estimate_mau_31,fb_wp_estimate_mau_32,fb_wp_estimate_mau_33
3,1_PAK.7.7.4_1,U,2014,3,PAK.7.7.4_1,PAK.7.7_1,PAK.7_1,Rawalpindi,Rawalpindi,Punjab,...,,,,,,,,,,
7,10_PAK.7.7.4_1,R,2014,3,PAK.7.7.4_1,PAK.7.7_1,PAK.7_1,Rawalpindi,Rawalpindi,Punjab,...,,,,,,,,,,
11,100_PAK.7.3.2_1,R,2014,3,PAK.7.3.2_1,PAK.7.3_1,PAK.7_1,Jhang,Faisalabad,Punjab,...,,,,,,,,,,
14,101_PAK.7.3.2_1,R,2014,3,PAK.7.3.2_1,PAK.7.3_1,PAK.7_1,Jhang,Faisalabad,Punjab,...,,,,,,,,,,
18,102_PAK.7.3.2_1,R,2014,3,PAK.7.3.2_1,PAK.7.3_1,PAK.7_1,Jhang,Faisalabad,Punjab,...,,,,,,,,,,


In [62]:
# Dimensions of df as (rows, columns).
df.shape

(602, 434)

## Functions

In [63]:
# Regex selecting the feature columns for each supported ``feature_type``.
# "gc_" is the spelling historically accepted by trainmodel_valresult; "gc" is
# accepted as an alias so the key follows the same convention as the others.
_FEATURE_REGEX = {
    'fb': '^fb_',
    'osm': '^osm_',
    'gc_': '^gc_',
    'gc': '^gc_',
    'l8': '^l8_',
    'l8_viirs': '^l8_|^viirs_',
    'all': '^fb_|^l8_|^viirs_|^osm_|^worldpop_|^gc_',
}

# ``est_type`` values that train on one continent (plus the target country's
# own rows, which are later held out) mapped to the ``continent_adj`` value.
_CONTINENT_BY_EST_TYPE = {
    'continent_africa_country_pred': 'Africa',
    'continent_americas_country_pred': 'Americas',
    'continent_eurasia_country_pred': 'Eurasia',
}


def _build_traintest(df, country, est_type):
    """Return a copy of the rows used for ``est_type`` with a ``fold`` column set."""
    if est_type == 'within_country_cv':
        # Cross-validate inside one country using its precomputed folds.
        subset = df[df['country_code'] == country].copy()
        subset['fold'] = subset['within_country_fold']
        return subset

    if est_type == 'global_country_pred':
        # All countries; a single pseudo-fold because the split is by country.
        return df.assign(fold='fold_1')

    if est_type in _CONTINENT_BY_EST_TYPE:
        continent = _CONTINENT_BY_EST_TYPE[est_type]
        keep = (df['country_code'] == country) | (df['continent_adj'] == continent)
        return df[keep].assign(fold='fold_1')

    if est_type == 'continent':
        # One fold per continent: train on a continent, test on all the others.
        return df.assign(fold=df['continent_adj'])

    raise ValueError(
        "Unknown est_type %r; expected 'within_country_cv', "
        "'global_country_pred', 'continent' or one of %s"
        % (est_type, sorted(_CONTINENT_BY_EST_TYPE))
    )


def trainmodel_valresult(df, country, est_type, target, parameters, feature_type):
    """Fit every regressor/parameter combination and score held-out predictions.

    For each regressor name in ``parameters['regressors']`` and each keyword
    dict in ``parameters[name]``, the model is trained and evaluated on every
    train/test split implied by ``est_type``. After each model the cumulative
    results and predictions are written to
    ``OUT_DIR/individual_files/withincv_results_*.csv`` and
    ``OUT_DIR/individual_files/withincv_predicted_values_*.csv``.

    Args:
        df: DataFrame with ``uid``, ``country_code``, the ``target`` column,
            the feature columns, and, depending on ``est_type``,
            ``within_country_fold`` or ``continent_adj``. The caller's frame
            is not modified.
        country: Value of ``country_code`` to predict for (unused for the
            split when ``est_type == 'continent'``, but still part of the
            output file names).
        est_type: One of ``'within_country_cv'``, ``'global_country_pred'``,
            ``'continent_africa_country_pred'``,
            ``'continent_americas_country_pred'``,
            ``'continent_eurasia_country_pred'`` or ``'continent'``.
        target: Name of the dependent-variable column; rows where it is
            missing are dropped.
        parameters: Mapping with a ``'regressors'`` list of estimator names
            and, per name, a list of keyword-argument dicts.
        feature_type: Key of the feature set to use: ``'fb'``, ``'osm'``,
            ``'gc_'`` (alias ``'gc'``), ``'l8'``, ``'l8_viirs'`` or ``'all'``.

    Returns:
        Tuple ``(results_df, y_df)``: ``results_df`` has one row per fitted
        model with an ``r2_score_<fold>`` column per split; ``y_df`` holds the
        observed target (``y``) plus one ``y_<model_i>`` prediction column per
        model.

    Raises:
        ValueError: If ``est_type`` or ``feature_type`` is not recognised, or
            if no rows remain for the requested country / target.
    """
    if feature_type not in _FEATURE_REGEX:
        raise ValueError(
            "Unknown feature_type %r; expected one of %s"
            % (feature_type, sorted(_FEATURE_REGEX))
        )
    feature_regex = _FEATURE_REGEX[feature_type]

    # Rows without the target can be neither trained on nor scored.
    df = df.dropna(subset=[target])
    df_traintest = _build_traintest(df, country, est_type)
    if df_traintest.empty:
        raise ValueError(
            "No rows with a non-missing %r for country %r and est_type %r"
            % (target, country, est_type)
        )

    # Output locations are the same for every model, so build them once.
    out_dir = os.path.join(OUT_DIR, 'individual_files')
    os.makedirs(out_dir, exist_ok=True)
    file_suffix = '_'.join([country, est_type, feature_type, target]) + '.csv'
    pred_path = os.path.join(out_dir, 'withincv_predicted_values_' + file_suffix)
    results_path = os.path.join(out_dir, 'withincv_results_' + file_suffix)

    # Observed values and run metadata; prediction columns are merged in below.
    y_df = df_traintest[['uid', 'country_code']].copy()
    y_df['y'] = df_traintest[target]
    y_df['target'] = target
    y_df['feature_type'] = feature_type
    y_df['est_type'] = est_type

    fold_ids = df_traintest['fold'].unique()
    holds_out_country = 'country_pred' in est_type

    results_rows = []
    results_df = pd.DataFrame()  # one row per fitted model
    model_i = 0
    for regressor_name in parameters['regressors']:
        for params in parameters[regressor_name]:

            pred_dict = {
                'regressor': regressor_name,
                'params': params,
                'country': country,
                'est_type': est_type,
                'target': target,
                'model_i': model_i,
                'feature_type': feature_type
            }
            pred_col = 'y_' + str(model_i)

            fold_frames = []
            for split_id in fold_ids:

                #### Separate into train and test
                if holds_out_country:
                    # Train on every other country, test on ``country``.
                    is_test = df_traintest['country_code'] == country
                elif est_type == 'continent':
                    # Train on one continent, test on all the others.
                    is_test = df_traintest['fold'] != split_id
                else:
                    # Standard CV: the current fold is the test set.
                    is_test = df_traintest['fold'] == split_id
                df_train = df_traintest[~is_test]
                df_test = df_traintest[is_test]

                #### Select features
                x_train = df_train.filter(regex=feature_regex, axis=1)
                x_test = df_test.filter(regex=feature_regex, axis=1)

                #### Prep Y Data
                y_train = df_train[target]
                y_test = df_test[target]

                #### Prep X Data: scale using statistics of the training set only
                x_scaler = StandardScaler().fit(x_train)
                x_train = x_scaler.transform(x_train)
                x_test = x_scaler.transform(x_test)

                ### Initialize regressor and fit
                # NOTE(review): eval() resolves the estimator class from its
                # name in the parameter grid; only use grids from trusted code.
                regressor = eval(regressor_name)(**params)
                y_pred = regressor.fit(x_train, y_train).predict(x_test)

                ### Results
                pred_dict['r2_score_' + str(split_id)] = r2_score(y_test, y_pred)
                fold_frames.append(pd.DataFrame({
                    'uid': df_test['uid'].to_numpy(),
                    pred_col: y_pred,
                }))

            y_df_parami = pd.concat(fold_frames, ignore_index=True)

            # NOTE(review): for est_type == 'continent' a uid is predicted once
            # per training continent, so this right-merge repeats rows and the
            # repetition compounds across models - confirm this is intended.
            y_df = y_df.merge(y_df_parami, on='uid', how='right')
            y_df.to_csv(pred_path)

            # Rewrite the results file after every model so partial progress
            # survives an interrupted run.
            results_rows.append(pred_dict)
            results_df = pd.DataFrame(results_rows)
            results_df.to_csv(results_path)
            model_i += 1

    return results_df, y_df

In [64]:
# Regressor grid passed to trainmodel_valresult, which reads
# parameters['regressors'] for the estimator names and parameters[name] for
# each name's list of keyword-argument dicts.
parameters = grids.GRID_REGRESS

In [65]:
# Dependent variable(s) to predict for the selected survey.
if SURVEY_NAME == 'DHS':
    target_vars = ['pca_allvars']
elif SURVEY_NAME == 'OPM':
    target_vars = ['pscores']
else:
    # Fail here rather than with a NameError on target_vars in the run loop.
    raise ValueError("No target variables defined for SURVEY_NAME %r" % (SURVEY_NAME,))

In [66]:
# Run every (feature set, estimation type, country, target) combination and
# accumulate the per-run results and predictions.
results_all_df = pd.DataFrame()
y_all_df = pd.DataFrame()
results_frames = []
y_frames = []

# Estimation types that only apply to countries located in a given continent.
required_continent_by_est_type = {
    'continent_africa_country_pred': 'Africa',
    'continent_americas_country_pred': 'Americas',
    'continent_eurasia_country_pred': 'Eurasia',
}

# 'fb', 'osm', 'l8', 'l8_viirs', 'all'
for feature_type_i in ['osm', 'fb', 'all']:
    for est_type_i in ['continent', 'continent_americas_country_pred', 'within_country_cv']:
        for cc_i in df.country_code.unique():
            for target in target_vars: # 'wealth_index_score'

                # For OPM survey, only do within_country_cv
                if SURVEY_NAME == "OPM" and est_type_i != "within_country_cv":
                    continue

                # For predicting country i using continent x, only use
                # countries that are themselves in continent x.
                required_continent = required_continent_by_est_type.get(est_type_i)
                if required_continent is not None:
                    continent_i = df.continent_adj[df.country_code == cc_i].tolist()[0]
                    if continent_i != required_continent:
                        continue

                # Only need to run 'continent' once (the split ignores the
                # country), so run it for a single country code only.
                if est_type_i == 'continent' and cc_i != 'AL':
                    continue

                # Print where at
                print(feature_type_i + ' // ' + est_type_i + ' // ' + cc_i + ' // ' + target)

                # Run ML Model
                results_df_i, y_df_i = trainmodel_valresult(df, cc_i, est_type_i, target, parameters, feature_type_i)

                # Grab results (pd.concat: DataFrame.append no longer exists
                # in pandas >= 2.0)
                results_frames.append(results_df_i)
                y_frames.append(y_df_i)
                results_all_df = pd.concat(results_frames, ignore_index=True)
                y_all_df = pd.concat(y_frames, ignore_index=True)

                # Export CSVs; replace as process more models
                results_all_df.to_csv(os.path.join(OUT_DIR, 'results_fbonly_withincv.csv'))
                y_all_df.to_csv(os.path.join(OUT_DIR, 'ypred_fbonly_withincv.csv'))

osm // within_country_cv // PK // pscores


In [11]:
# Final export of the accumulated results and predictions, to the same two
# files the run loop overwrites after every completed run.
results_all_df.to_csv(os.path.join(OUT_DIR, 'results_fbonly_withincv.csv'))
y_all_df.to_csv(os.path.join(OUT_DIR, 'ypred_fbonly_withincv.csv'))

In [51]:
# Scratch value for trying out the substring test that trainmodel_valresult
# applies to est_type ('country_pred' in est_type).
est_type = 'hello_country_pred'

In [53]:
# Scratch check: the substring test is true for any est_type containing 'country_pred'.
'country_pred' in est_type

True