## Poverty Estimation with Facebook Data

In [8]:
import os, datetime
import numpy as np
import pandas as pd
#import geopandas as gpd
import json
#import rasterio
#from rasterio.plot import show
import matplotlib.pyplot as plt

from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, normalize
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import Ridge

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier,
                              AdaBoostRegressor,
                              GradientBoostingClassifier, RandomForestClassifier,
                              RandomForestRegressor,
                             BaggingRegressor, GradientBoostingRegressor)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, classification_report,
                            r2_score, mean_absolute_error, mean_squared_error)

from joblib import dump, load

import logging, os 

import grid_params as grids

In [9]:
# Root folder of the project data. The default is the original author's local
# Dropbox path; set the POVERTY_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
data_dir = os.environ.get(
    "POVERTY_DATA_DIR",
    "/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data",
)

# Survey whose sub-folder (under data_dir) is read from and written to.
SURVEY_NAME = "DHS"

# All results (per-parameter CSVs, models, summary tables) go under here.
OUT_DIR = os.path.join(data_dir, SURVEY_NAME, 'FinalData', 'results')

### Load/Prep Data

In [10]:
# Both input tables live in the survey's "Individual Datasets" folder.
indiv_dir = os.path.join(data_dir, SURVEY_NAME, 'FinalData', 'Individual Datasets')

# Facebook marketing features and survey socioeconomic outcomes; the two
# tables are joined on 'uid' in the next cell.
# NOTE(review): the captured cell output looks like the tail of a pandas
# DtypeWarning (mixed-type columns) -- confirm which file triggers it and
# consider passing an explicit dtype.
fb_df = pd.read_csv(os.path.join(indiv_dir, 'facebook_marketing_dau_mau_prop.csv'))
survey_df = pd.read_csv(os.path.join(indiv_dir, "survey_socioeconomic.csv"))

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
# Join survey outcomes with Facebook features (inner join on 'uid').
df = survey_df.merge(fb_df, on='uid')

# Shuffle rows once. Row order matters downstream: trainmodel_valresult takes
# the first quarter of a country's rows as the validation set and uses an
# unshuffled KFold, so a fixed seed is needed for reproducible results.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

### Functions

In [12]:
def trainmodel_valresult(df, country, est_type, target, parameters, n_splits=5):
    """Grid-search regressors with K-fold CV, then score the best on a held-out set.

    The feature matrix is every column of ``df`` whose name starts with
    ``estimate_mau_``; features are standardized with a scaler fit on the
    training portion only.

    Args:
        df: DataFrame with a ``country_code`` column, the ``target`` column and
            the ``estimate_mau_*`` feature columns.
        country: Value of ``country_code`` identifying the country to validate
            on (iso2 per the original notes).
        est_type: How the train/test and validation sets are formed:
            ``'within_country'``  - first quarter of the country's rows is the
                                    validation set, the rest is train/test;
            ``'other_countries'`` - validate on ``country``, train on all
                                    other countries;
            ``'india'``           - validate on ``country``, train on rows with
                                    ``country_code == 'IA'``.
        target: Name of the column to predict.
        parameters: Dict with a ``'regressors'`` entry listing regressor names;
            for each name, ``parameters[name]`` is an iterable of keyword
            argument dicts to try for that regressor.
        n_splits: Number of cross-validation folds (default 5).

    Returns:
        Tuple ``(valid_df, y_df, trained_best)``:
        ``valid_df`` is a one-row DataFrame with the best regressor, its
        parameters, per-fold and average CV R2, validation R2 and set sizes;
        ``y_df`` holds the true (``y``) and predicted (``y_pred``) validation
        values; ``trained_best`` is the best regressor refit on the full
        train/test set.

    Raises:
        ValueError: If ``est_type`` is not one of the supported values, or the
            parameter grid yields no candidate to evaluate.

    Side effects:
        After every parameter set, rewrites the cumulative CV results to
        ``OUT_DIR/indiv_param_results/results_<country>_<est_type>_<target>_fbonly.csv``
        (acts as a progress checkpoint).
    """
    feature_regex = '^estimate_mau_'

    # Split into validation and train/test sets --------------------------
    # Positional (.iloc) indexing is used throughout, so the index of the
    # subsets does not need to be reset.
    if est_type == 'within_country':
        df_country = df[df.country_code == country]
        id_cut = round(df_country.shape[0] / 4)
        df_val = df_country.iloc[:id_cut]
        df_traintest = df_country.iloc[id_cut:]
    elif est_type == 'other_countries':
        df_val = df[df.country_code == country]
        df_traintest = df[df.country_code != country]
    elif est_type == 'india':
        df_val = df[df.country_code == country]
        df_traintest = df[df.country_code == 'IA']
    else:
        raise ValueError(
            "est_type must be 'within_country', 'other_countries' or 'india'; "
            "got %r" % (est_type,))

    # Initialize
    kf = KFold(n_splits=n_splits)
    fold_cols = ['r2_score_' + str(k) for k in range(n_splits)]

    checkpoint_path = os.path.join(OUT_DIR, 'indiv_param_results',
                                   'results_' +
                                   country + '_' +
                                   est_type + '_' +
                                   target +
                                   '_fbonly.csv')
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    x_traintest_raw = df_traintest.filter(regex=feature_regex, axis=1)
    y_traintest = df_traintest[target]

    # Cross-validate every (regressor, params) combination ----------------
    records = []  # one dict per parameter set
    for i in parameters['regressors']:
        for j in parameters[i]:

            pred_dict = {
                'regressor': i,
                'params': j,
                'country': country,
                'est_type': est_type,
                'target': target
            }

            for fold, (train_index, test_index) in enumerate(kf.split(df_traintest)):

                # Separate into train and test
                x_train = x_traintest_raw.iloc[train_index]
                x_test = x_traintest_raw.iloc[test_index]

                y_train = y_traintest.iloc[train_index]
                y_test = y_traintest.iloc[test_index]

                # Normalize (scaler is fit on the training fold only)
                x_scaler = StandardScaler().fit(x_train)
                x_train = x_scaler.transform(x_train)
                x_test = x_scaler.transform(x_test)

                # NOTE: eval() resolves the regressor name from the grid to a
                # class in scope; only use with a trusted parameter grid.
                regressor = eval(i)(**j)
                trained = regressor.fit(x_train, y_train)

                y_pred_fold = trained.predict(x_test)
                pred_dict['r2_score_' + str(fold)] = r2_score(y_test, y_pred_fold)

            records.append(pred_dict)

            # Checkpoint everything evaluated so far.
            pd.DataFrame(records).to_csv(checkpoint_path)

    if not records:
        raise ValueError("parameters contains no regressor/parameter combination to evaluate")

    results_df = pd.DataFrame(records)

    # Pick the best parameter set by average CV R2 ------------------------
    results_df['r2_score_avg'] = sum(results_df[c] for c in fold_cols) / n_splits
    results_df = results_df.sort_values(by=['r2_score_avg'], ascending=False)
    results_df_best = results_df.iloc[0]

    # Results on validation set -------------------------------------------
    x_val = df_val.filter(regex=feature_regex, axis=1)
    y_val = df_val[target]

    # Normalize
    x_scaler = StandardScaler().fit(x_traintest_raw)
    x_traintest = x_scaler.transform(x_traintest_raw)
    x_val = x_scaler.transform(x_val)

    # Grab regressor/parameters
    i_best = results_df_best['regressor']
    j_best = results_df_best['params']

    # Train model on the full train/test set
    regressor = eval(i_best)(**j_best)
    trained_best = regressor.fit(x_traintest, y_traintest)

    # Predict with the refit best model (not a leftover CV-fold model).
    y_pred = trained_best.predict(x_val)

    # Results
    valid_dict = {
        'country': country,
        'est_type': est_type,
        'target': target,
        'r2_score_traintest_avg': results_df_best['r2_score_avg'],
    }
    for k in range(n_splits):
        valid_dict['r2_score_traintest_' + str(k)] = results_df_best['r2_score_' + str(k)]
    valid_dict['regressor'] = i_best
    valid_dict['params'] = j_best
    valid_dict['r2_score_val'] = r2_score(y_val, y_pred)
    valid_dict['N_train'] = df_traintest.shape[0]
    valid_dict['N_val'] = df_val.shape[0]

    valid_df = pd.DataFrame([valid_dict])  # final results

    # Predicted and true values
    y_dict = {
        'country': country,
        'est_type': est_type,
        'target': target,
        'y': y_val,
        'y_pred': y_pred,
    }

    y_df = pd.DataFrame.from_dict(y_dict)

    return valid_df, y_df, trained_best

In [None]:
# Regressor / hyper-parameter grid passed to trainmodel_valresult.
parameters = grids.GRID_REGRESS

# Accumulators filled by the estimation loop: one for the validation
# summaries, one for the true/predicted values.
val_df_all, ypred_df_all = pd.DataFrame(), pd.DataFrame()

In [None]:
# Run every estimation type for every country, collecting the validation
# summaries and predictions and saving each fitted model to disk.
models_dir = os.path.join(OUT_DIR, 'models')
os.makedirs(models_dir, exist_ok=True)  # dump() does not create directories

for est_type_i in ['within_country', 'other_countries', 'india']:
    for cc_i in df.country_code.unique():
        print(est_type_i + ' ' + cc_i)

        # The 'india' setup trains on 'IA', so validating on 'IA' itself is skipped.
        if est_type_i == 'india' and cc_i == 'IA':
            print("skip!")
            continue

        val_df_i, y_df_i, model = trainmodel_valresult(df, cc_i, est_type_i, 'asset_pca_1', parameters)

        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        val_df_all = pd.concat([val_df_all, val_df_i], ignore_index=True)
        ypred_df_all = pd.concat([ypred_df_all, y_df_i], ignore_index=True)

        dump(model, os.path.join(models_dir, 'fbonly_model_' + est_type_i + '_' + cc_i + '_asset_pca_1' + '.joblib'))

within_country IA
within_country BD
within_country PK
within_country KH
within_country PH
within_country TL
within_country NP
within_country MM
within_country TJ
within_country KY
other_countries IA
other_countries BD
other_countries PK
other_countries KH
other_countries PH
other_countries TL
other_countries NP
other_countries MM
other_countries TJ
other_countries KY
india IA
skip!
india BD


In [None]:
# Write the accumulated validation summaries and predictions to OUT_DIR.
for frame, fname in ((val_df_all, 'results_fbonly.csv'),
                     (ypred_df_all, 'ypred_fbonly.csv')):
    frame.to_csv(os.path.join(OUT_DIR, fname))