# Title: Gates Investigation 1 QA
# Doc # : Gates-1000-
# Author: Charles Copley
# Date : 28 June 2017
# Revision: A

This includes an integration with the Rapid PRO API

In [1]:
#import libraries
import pandas as pd
import sqlalchemy 
import numpy as np
from bokeh.io import output_file, show
from bokeh.models import GeoJSONDataSource
from bokeh.plotting import figure
from bokeh.sampledata.sample_geojson import geojson
import psycopg2
import json
from bokeh.events import ButtonClick
from bokeh.models import Button
from random import random
from bokeh.layouts import column
from bokeh.models import Button
from bokeh.palettes import RdYlBu3
from bokeh.plotting import figure, curdoc
import os
from IPython.core.display import display, HTML
%matplotlib inline



In [91]:
#define the class for sampling
class SelectSample:
    #class to hold all the peripheral things required to sample the main DB
    #TBD 
    #--1. Unit Tests.
    #--2. Add capacity for joining new (as yet unseen) tables and information.
    #--3. Check the JSON file against the parent table name for column existence.
    #--4. Add message set query and message sequence number
    
    def __init__(self,config_json_file):
        #test configuration variables
        self.config_json_file = config_json_file
        self.read_json_config()
        
        self.master_db_name = self.data['master_db_name']#this will be added as part of the
        self.database_port = self.data['database_port']
        self.database_server = self.data['database_server']
        self.user = self.data['user']
        self.pwd = self.data['pwd']
        #configuration script
        self.id_column = self.data['id_column'] # column that is used for the unique identifier
        self.msisdn_column = self.data['msisdn_column'] # column that is used for the msisdn
        self.investigation_id = int(self.data['investigation_id']) # Investigation number tag
        self.wave_id = int(self.data['wave_id']) # Investigation number tag
        self.parent_table_name = self.data['parent_table_name']
        self.filtered_table_name = self.data['filtered_table_name']
        self.all_samples_table_name = self.data['all_samples_table_name']
        self.sampler_trigger_start_date = self.data['sampler_trigger_start_date']
        self.sampler_type = self.data['sampler_type']
        #variable to hold whether the parent table has duplication or not.
        self.duplicates = True
        
        print('master_db_name: ',self.master_db_name)
        print('database_port: ',self.database_port)
        print('database_server: ',self.database_server)
        print('database_user: ',self.user)
        print('database_pwd: ',self.pwd)
        print('unique_field: ',self.id_column)
        print('investigation_id: ',self.investigation_id)
        print('all_samples_table: ',self.all_samples_table_name)
        print('population_filter: ',self.data['filter'])
        print('field_to_split: ', self.data['field_to_randomize'])
        print('values_to_use: ', list(self.data['samples_from_field'].keys()))
        print('number_to_select: ', list(self.data['samples_from_field'].values()))
         #name of the table that keeps all the previous samples
        #definition of the filter to be applied to the entire population
        filterJsonVals = self.data['filter']
        nFilters = len(filterJsonVals)
        self.population_filter = filterJsonVals[0]
        for i in range(1,nFilters):
            print(filterJsonVals[i])
            self.population_filter = self.population_filter + """ and """ + filterJsonVals[i]
            
        self.group_filter = self.data['field_to_randomize']
        self.group_values = list(self.data['samples_from_field'].keys())
        self.group_sample_number = list(self.data['samples_from_field'].values())
        #self.group_values = ['afr_ZA','eng_ZA','xho_ZA']
        #self.group_sample_number = [1000,200,250]
        self.temp_full_sample_table_name = "temp_wave_%d"%(int(self.wave_id)) 
    
    def check_duplicates_msisdn(self):
        count_msisdn_column = pd.read_sql("""
            select count(%s) 
            from %s 
            """%(self.msisdn_column,self.parent_table_name),self.conn_db)
        count_distinct_msisdn_column = pd.read_sql("""
            select count(distinct(%s)) 
            from %s 
            """%(self.msisdn_column,self.parent_table_name),self.conn_db)
        if(count_msisdn_column['count'].item() !=count_distinct_msisdn_column['count'].item()):
            print('MSISDN Column "%s" is not unique. \n Count %d Count distinct %d'%(self.msisdn_column,count_msisdn_column['count'],count_distinct_msisdn_column['count']))
            self.duplicates = True
        elif(count_msisdn_column['count'].item() == count_distinct_msisdn_column['count'].item()):
            print('MSISDN Column "%s" is unique. Yay! \n Count %d Count distinct %d'%(self.msisdn_column,count_msisdn_column['count'],count_distinct_msisdn_column['count']))
            self.duplicates = False
        return(count_msisdn_column,count_distinct_msisdn_column)
    
    
    def check_duplicates(self):
        count_id_column = pd.read_sql("""
            select count(%s) 
            from %s 
            """%(self.id_column,self.parent_table_name),self.conn_db)
        count_distinct_id_column = pd.read_sql("""
            select count(distinct(%s)) 
            from %s 
            """%(self.id_column,self.parent_table_name),self.conn_db)
        if(count_id_column['count'].item() !=count_distinct_id_column['count'].item()):
            print('ID Column "%s" is not unique. \n Count %d Count distinct %d'%(self.id_column,count_id_column['count'],count_distinct_id_column['count']))
            self.duplicates = True
        elif(count_id_column['count'].item() == count_distinct_id_column['count'].item()):
            print('ID Column "%s" is unique. Yay! \n Count %d Count distinct %d'%(self.id_column,count_id_column['count'],count_distinct_id_column['count']))
            self.duplicates = False
        return(count_id_column,count_distinct_id_column)
    
    def read_json_config(self):
        with open(self.config_json_file) as data_file:
            self.data = json.load(data_file)
        return(self.data)
   
    def connect_to_db(self):
        self.conn_db=psycopg2.connect(dbname=self.master_db_name,user=self.user,password=self.pwd,
                port=self.database_port, host=self.database_server)
    
    def disconnect_from_db(self):
        self.conn_db.close()

    def read_test_data(self,file_to_read):
        test_data = pd.read_csv(file_to_read)
        return test_data
    
    def create_facility_augmented_database(self):
        self.parent_table_facility_code = "facility_code"
        self.clinic_table_facility_code = "facilitycode"
        self.clinic_facility_table = "clinic_facilities_with_gps"
        self.temp_augmented_table = "temp_table_for_augmented_facilities"
        conn = self.conn_db.cursor() #create cursor to execute the direct db commands over psycopg2

        sql_execution = """
                    CREATE TEMP TABLE %s as
                    (select * from %s as a
                    left join %s as b
                    on a.%s::integer = b.%s::integer);
                    """%(self.temp_augmented_table,self.parent_table_name,self.data["clinic_facility_table"], 
                        self.data["parent_table_facility_code"], self.data["clinic_table_facility_code"])
        conn.execute(sql_execution)
        conn.close()
    
    def create_filtered_database(self):
        conn = self.conn_db.cursor() #create cursor to execute the direct db commands over psycopg2
        sql_execution = """DROP TABLE IF EXISTS %s;"""%(self.filtered_table_name)
        conn.execute(sql_execution)
        sql_execution = """
        CREATE TEMP TABLE %s as (
                select *,
                '%s' as sampler_trigger_start_date,
                '%s' as sampler_type,
                %s as sampler_investigation_id, 
                %s as sampler_wave_id,
                '%s' as sampler_config_file,
                '%s' as sampler_master_db_name,
                '%s' as sampler_parent_table_name
                from %s 
                where %s 
                );
        """%(self.filtered_table_name,
             self.sampler_trigger_start_date,
             self.sampler_type,
             self.investigation_id,
             self.wave_id,
             self.config_json_file,
             self.master_db_name,
             self.parent_table_name,
             self.parent_table_name,
             self.population_filter,  
            )
        conn.execute(sql_execution)
        conn.close()
        
    def create_filtered_database_previous_samples(self):
        conn = self.conn_db.cursor() #create cursor to execute the direct db commands over psycopg2
        sql_execution = """DROP TABLE IF EXISTS %s;"""%(self.filtered_table_name)
        conn.execute(sql_execution)
        sql_execution = """
        CREATE TEMP TABLE %s as (
                select *,
                '%s' as sampler_trigger_start_date,
                '%s' as sampler_type,
                %s as sampler_investigation_id, 
                %s as sampler_wave_id,
                '%s' as sampler_config_file,
                '%s' as sampler_master_db_name,
                '%s' as sampler_parent_table_name
                from %s 
                where %s 
                and %s not in (select %s from %s)
                );
        """%(self.filtered_table_name,
             self.sampler_trigger_start_date,
             self.sampler_type,
             self.investigation_id,
             self.wave_id,
             self.config_json_file,
             self.master_db_name,
             self.parent_table_name,
             self.parent_table_name,
             self.population_filter,
             self.id_column,
             self.id_column,
             self.all_samples_table_name   
            )
        conn.execute(sql_execution)
        conn.close()
        
    def create_full_sample_temp_table(self):
        conn = self.conn_db.cursor() #create cursor to execute the direct db commands
        temp_table_name = "group_0"
        sql_execution = """DROP TABLE IF EXISTS %s;"""%(self.temp_full_sample_table_name)
        conn.execute(sql_execution)
        sql_execution = """CREATE TEMP TABLE  %s (LIKE %s);"""%(self.temp_full_sample_table_name,temp_table_name)
        conn.execute(sql_execution)
        conn.close()
    
    #check if the table is required for the gates samples   
    def check_exists_investigations_gates_table(self):
        conn = self.conn_db.cursor() #create cursor to execute the direct db commands
        sql_query = """SELECT EXISTS 
        (SELECT 1 FROM   pg_tables 
            WHERE  schemaname = 'public' AND tablename = '%s');"""%(self.all_samples_table_name)
        conn.execute(sql_query)
        self.exists = conn.fetchall()[0][0]
        conn.close()
        return self.exists

    def create_all_investigations_gates_table_if_not_exist(self):
        """create the table to hold all the gates investigations if required"""  
        #create cursor to execute the direct db commands
        if (not self.check_exists_investigations_gates_table()):
            conn = self.conn_db.cursor()
            sql_execution = """CREATE TABLE %s as (select * from %s where 1=2) ;"""%(self.all_samples_table_name, self.temp_full_sample_table_name)
            conn.execute(sql_execution)
            self.conn_db.commit()
            conn.close()

    def append_group_sample(self,group_id):
        conn = self.conn_db.cursor() #create cursor to execute the direct db commands
        temp_full_sample_table_name = "temp_wave_%d"%(self.wave_id)
        temp_table_name = "group_"+str(group_id)
        sql_execution = """insert into %s select * from %s;"""%(self.temp_full_sample_table_name,temp_table_name)
        result = conn.execute(sql_execution)
        self.conn_db.commit()
        conn.close()
        return(result)
    
    def append_sample_to_all_investigation(self):
        conn = self.conn_db.cursor() #create cursor to execute the direct db commands
        sql_execution = """insert into %s select * from %s;"""%(self.all_samples_table_name,self.temp_full_sample_table_name)
        result = conn.execute(sql_execution)
        self.conn_db.commit()
        conn.close()
        return(result)
    
    def sample_groups(self):
        for group_id in range(0,len(self.group_values)):
            print(group_id)
            (a,b,df) = self.get_group_sample(group_id,
                                             self.investigation_id,
                                             self.wave_id,
                                             self.filtered_table_name,
                                             self.population_filter,
                                             self.group_filter,
                                             self.group_values[group_id],
                                             int(self.group_sample_number[group_id]))
            print('group_id: ',group_id,self.group_values[group_id])
        return(a,b,df)
    
    def append_groups(self):
        for i in range(0,len(self.group_values)):
            self.append_group_sample(i)
    
    def get_group_sample(self,group_id,investigation_id, wave_id,parent_table_name,
                         population_filter,group_filter,group_value,samples):
        #function to get the random samples from a given group
        conn = self.conn_db.cursor() #create cursor to execute the direct db commands over psycopg2
        temp_table_name = "group_"+str(group_id)
        sql_execution = """DROP TABLE IF EXISTS %s;"""%(temp_table_name)
        conn.execute(sql_execution)
        sql_execution = """
        CREATE TEMP TABLE %s as (
                select *,
                    random() as random,
                    %d as group_id
                from %s 
                where %s 
                and %s in ('%s') 
                order by random asc 
                limit %d);
        """%(temp_table_name,group_id,
             parent_table_name,population_filter,
             group_filter,group_value,
             samples)
        conn.execute(sql_execution)
        sql_execution_count = """
                select count(*)
                from %s
                where %s 
                and %s in ('%s');
                """%(parent_table_name,population_filter,group_filter,group_value)
        count = conn.execute(sql_execution_count)
        sample_count = pd.read_sql("""
                select count(*) 
                from %s;
                """%(temp_table_name),self.conn_db)
        full_count = pd.read_sql("""
                select count(*) 
                from %s 
                where %s 
                and %s in ('%s')
                """%(parent_table_name,population_filter,group_filter,group_value),self.conn_db)
        sample = pd.read_sql("""
                select * from %s;
                """%(temp_table_name),self.conn_db)
        conn.close()
        if sample_count['count'][0] < samples:
            # Create a new instance of an exception
            print('error')
            sample_error = ValueError("The requested sample from %s is too large" % (group_value))
            raise sample_error
        return(sample_count, full_count, sample)

    def get_final_wave_sample(self):
        """Return the samples that will be added for the final wave"""
        full_sample = pd.read_sql("""
            select * 
            from %s 
            """%(self.temp_full_sample_table_name),self.conn_db)
        return full_sample
    
    def get_augmented_sample(self):
        """Return the samples that have had clinic locations augmented using the facility code"""
        full_sample = pd.read_sql("""
            select * 
            from temp_table_for_augmented_facilities
            """,self.conn_db)
        return full_sample
    
    def get_all_investigations_sample(self):
        """Return the samples already stored in the Gates investigation table"""
        all_investigations_sample = pd.read_sql("""
            select * 
            from %s 
            """%(self.all_samples_table_name),self.conn_db)
        return all_investigations_sample
    
    
    def json_config_fields(self):
        """Return the configuration fields used in the JSON configuration file"""
        return(self.group_filter,self.group_values)
    
    def plot_histogram(self):
        self.get_final_wave_sample().groupby(self.data['field_to_randomize']).count()[self.data['id_column']].plot('bar')
    
    def return_pandas_sql_query(self,sql):
        #function to return a pandas dataframe of the temporary tables created above:
        #Typically these will be:
        #1. parent_table_name
        #2. gates_investigation_%d_filtered
        #3. all_gates_investigation_samples
        #4. temp_wave_%d
        #5. group_%d
        return(pd.read_sql(sql,self.conn_db))
    
    def clinic_locations_of_participants(self):
#        sql_execution = """
#                select a.*,b.lat,b.lon from
#                temp_wave_1 as a
#                join
#                clinic_facilities_with_gps as b
#                on a.%s::integer = b.%s::integer
#                ;
#        """%(self.parent_table_facility_code,self.clinic_table_facility_code)
        sql_execution = """
                select * from
                temp_wave_1
                ;
        """
        return(pd.read_sql(sql_execution,self.conn_db))
    
    def generate_folium_map(self):
        import folium
        locations = self.clinic_locations_of_participants().dropna(subset=['lon','lat'])  
        m = folium.Map(location=[-30.92, 24.42],zoom_start=5,width=800, height=480)  
        marker_cluster = folium.MarkerCluster().add_to(m)
        for each in zip(locations.lat, locations.lon):
            folium.Marker(each).add_to(marker_cluster)
        m.save('base_map.html')
        return(m)
    
    def return_conn_db(self):
        return self.conn_db

Start by using the first wave definition- this defines the following:
- where is the population data located (tables, databases, user access etc.)?
- what filtering do we intend to apply to the population data?
- which fields need to be unique (this is simply checked- no action is taken)
- investigation id number
- which wave of the investigation should participants belong to?
- sampler_trigger_start_date when does the rpro trigger sending?
- sampler_type - what type of participant are we talking about?
- output table names


In [92]:

new_sample = SelectSample("Gates-1000-005- QA Testing/qa_definition_wave_1.json") #create a new object of type SelectSample
new_sample.connect_to_db()
new_sample.check_duplicates()
new_sample.check_duplicates_msisdn()
new_sample.create_filtered_database()
new_sample.sample_groups()  #sample the groups that are part of the SelectSample definition
new_sample.create_full_sample_temp_table() #Create a temp table to hold all of the new samples
new_sample.append_groups()
#only needed the very first time
new_sample.create_all_investigations_gates_table_if_not_exist()
new_sample.append_sample_to_all_investigation()
new_sample.disconnect_from_db()

master_db_name:  charlescopley
database_port:  6432
database_server:  localhost
database_user:  charlescopley
database_pwd:  parham
unique_field:  seed_uuid
investigation_id:  1
all_samples_table:  all_gates_qa_investigations_samples
population_filter:  ["ncmobi_active = 'false' and seed_message_set = 'nurseconnect.hw_full.1' and dhis2_province in ('wc Western Cape Province','gp Gauteng Province','fs Free State Province','kz KwaZulu-Natal Province','lp Limpopo Province')"]
field_to_split:  seed_language
values_to_use:  ['eng_ZA']
number_to_select:  ['2']
ID Column "seed_uuid" is unique. Yay! 
 Count 11 Count distinct 11
MSISDN Column "seed_msisdn_registration" is unique. Yay! 
 Count 11 Count distinct 11
0
group_id:  0 eng_ZA


In [93]:

new_sample = SelectSample("Gates-1000-005- QA Testing/qa_definition_wave_2.json") #create a new object of type SelectSample
new_sample.connect_to_db()
new_sample.check_duplicates()
new_sample.create_filtered_database_previous_samples() #must use this to filter against all previous samples
new_sample.sample_groups()  #sample the groups that are part of the SelectSample definition
new_sample.create_full_sample_temp_table() #Create a temp table to hold all of the new samples
new_sample.append_groups()
new_sample.append_sample_to_all_investigation()
new_sample.disconnect_from_db()

master_db_name:  charlescopley
database_port:  6432
database_server:  localhost
database_user:  charlescopley
database_pwd:  parham
unique_field:  seed_uuid
investigation_id:  1
all_samples_table:  all_gates_qa_investigations_samples
population_filter:  ["ncmobi_active = 'false' and seed_message_set = 'nurseconnect.hw_full.1' and dhis2_province in ('wc Western Cape Province','gp Gauteng Province','fs Free State Province','kz KwaZulu-Natal Province','lp Limpopo Province')"]
field_to_split:  seed_language
values_to_use:  ['eng_ZA']
number_to_select:  ['2']
ID Column "seed_uuid" is unique. Yay! 
 Count 11 Count distinct 11
0
group_id:  0 eng_ZA


In [94]:

new_sample = SelectSample("Gates-1000-005- QA Testing/qa_definition_wave_3.json") #create a new object of type SelectSample
new_sample.connect_to_db()
new_sample.check_duplicates()
new_sample.create_filtered_database_previous_samples()
new_sample.sample_groups()  #sample the groups that are part of the SelectSample definition
new_sample.create_full_sample_temp_table() #Create a temp table to hold all of the new samples
new_sample.append_groups()
new_sample.append_sample_to_all_investigation()
new_sample.disconnect_from_db()

master_db_name:  charlescopley
database_port:  6432
database_server:  localhost
database_user:  charlescopley
database_pwd:  parham
unique_field:  seed_uuid
investigation_id:  1
all_samples_table:  all_gates_qa_investigations_samples
population_filter:  ["ncmobi_active = 'false' and seed_message_set = 'nurseconnect.hw_full.1' and dhis2_province in ('wc Western Cape Province','gp Gauteng Province','fs Free State Province','kz KwaZulu-Natal Province','lp Limpopo Province')"]
field_to_split:  seed_language
values_to_use:  ['eng_ZA']
number_to_select:  ['4']
ID Column "seed_uuid" is unique. Yay! 
 Count 11 Count distinct 11
0
group_id:  0 eng_ZA


The cell below should error as there shouldn't be sufficient participants to sample further.

In [95]:

new_sample = SelectSample("Gates-1000-005- QA Testing/qa_definition_wave_4.json") #create a new object of type SelectSample
new_sample.connect_to_db()
new_sample.check_duplicates()
#new_sample.create_facility_augmented_database()
new_sample.create_filtered_database_previous_samples()
new_sample.sample_groups()  #sample the groups that are part of the SelectSample definition
new_sample.create_full_sample_temp_table() #Create a temp table to hold all of the new samples
new_sample.append_groups()

#a = new_sample.get_augmented_sample()

#only needed the very first time
#new_sample.create_all_investigations_gates_table_if_not_exist()
new_sample.append_sample_to_all_investigation()
new_sample.disconnect_from_db()

master_db_name:  charlescopley
database_port:  6432
database_server:  localhost
database_user:  charlescopley
database_pwd:  parham
unique_field:  seed_uuid
investigation_id:  1
all_samples_table:  all_gates_qa_investigations_samples
population_filter:  ["ncmobi_active = 'false' and seed_message_set = 'nurseconnect.hw_full.1' and dhis2_province in ('wc Western Cape Province','gp Gauteng Province','fs Free State Province','kz KwaZulu-Natal Province','lp Limpopo Province')"]
field_to_split:  seed_language
values_to_use:  ['eng_ZA']
number_to_select:  ['2']
ID Column "seed_uuid" is unique. Yay! 
 Count 11 Count distinct 11
0
error


ValueError: The requested sample from eng_ZA is too large

In [None]:
#produce a few plots of the data
new_sample = SelectSample("Gates-1000-005- QA Testing/qa_definition_wave_1.json") #create a new object of type SelectSample
new_sample.connect_to_db()
a = new_sample.json_config_fields()
final_wave = new_sample.plot_histogram()
new_sample.disconnect_from_db()

In [None]:
#final data set for eyeballing
new_sample = SelectSample("Gates-1000-005- QA Testing/qa_definition_wave_4.json") #create a new object of type SelectSample
new_sample.connect_to_db()
samples_to_upload = new_sample.get_final_wave_sample()
samples_to_upload.head(5)

In [103]:
new_sample = SelectSample("Gates-1000-005- QA Testing/qa_definition_wave_4.json") #create a new object of type SelectSample
new_sample.connect_to_db()
upload_samples = new_sample.get_all_investigations_sample()
new_sample.disconnect_from_db()

master_db_name:  charlescopley
database_port:  6432
database_server:  localhost
database_user:  charlescopley
database_pwd:  parham
unique_field:  seed_uuid
investigation_id:  1
all_samples_table:  all_gates_qa_investigations_samples
population_filter:  ["ncmobi_active = 'false' and seed_message_set = 'nurseconnect.hw_full.1' and dhis2_province in ('wc Western Cape Province','gp Gauteng Province','fs Free State Province','kz KwaZulu-Natal Province','lp Limpopo Province')"]
field_to_split:  seed_language
values_to_use:  ['eng_ZA']
number_to_select:  ['2']


In [107]:
upload_samples

Unnamed: 0,index,Name,seed_uuid,seed_msisdn_registration,seed_facility_code,seed_language,seed_dob,seed_registration_date,seed_message_set,seed_message_sequence,...,dhis2_province,sampler_trigger_start_date,sampler_type,sampler_investigation_id,sampler_wave_id,sampler_config_file,sampler_master_db_name,sampler_parent_table_name,random,group_id
0,4,Charles,9cadd74e-5a74-11e7-907b-a6006ad3dba0,27844301160,874006,eng_ZA,1984-12-01,2016-04-13,nurseconnect.hw_full.1,5,...,lp Limpopo Province,04-07-2017,nurse,1,1,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.113586,0
1,1,Matt,78708a98-5a74-11e7-907b-a6006ad3dba0,27826739844,293442,eng_ZA,1996-06-07,2016-11-25,nurseconnect.hw_full.1,44,...,gp Gauteng Province,04-07-2017,nurse,1,1,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.457787,0
2,10,Peter,0309f11a-5bdf-11e7-907b-a6006ad3dba0,27733682129,656442,eng_ZA,1992-11-16,2016-11-09,nurseconnect.hw_full.1,49,...,wc Western Cape Province,04-07-2017,nurse,1,2,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.154111,0
3,2,Cat,7dc1e2ee-5a74-11e7-907b-a6006ad3dba0,27747928763,333951,eng_ZA,1997-10-19,2016-12-17,nurseconnect.hw_full.1,9,...,fs Free State Province,04-07-2017,nurse,1,2,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.428192,0
4,9,Patricia,f8d730d6-5bde-11e7-907b-a6006ad3dba0,27848758702,207742,eng_ZA,1999-01-05,2016-08-03,nurseconnect.hw_full.1,35,...,kz KwaZulu-Natal Province,04-07-2017,nurse,1,3,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.107332,0
5,3,Lauren,97aafc68-5a74-11e7-907b-a6006ad3dba0,27767384103,207742,eng_ZA,1990-01-07,2016-12-24,nurseconnect.hw_full.1,9,...,kz KwaZulu-Natal Province,04-07-2017,nurse,1,3,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.328412,0
6,0,Jacqui,6cda1942-5a74-11e7-907b-a6006ad3dba0,27713519728,656442,eng_ZA,1984-09-20,2016-05-03,nurseconnect.hw_full.1,15,...,wc Western Cape Province,04-07-2017,nurse,1,3,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.765537,0
7,5,Kate,aac9476e-5a74-11e7-907b-a6006ad3dba0,27844715231,177751,eng_ZA,1991-10-09,2016-07-13,nurseconnect.hw_full.1,28,...,wc Western Cape Province,04-07-2017,nurse,1,3,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.859669,0


In [105]:
new_sample.disconnect_from_db()


In [106]:
a.count()

index                         8
Name                          8
seed_uuid                     8
seed_msisdn_registration      8
seed_facility_code            8
seed_language                 8
seed_dob                      8
seed_registration_date        8
seed_message_set              8
seed_message_sequence         8
ncmobi_active                 8
dhis2_province                8
sampler_trigger_start_date    8
sampler_type                  8
sampler_investigation_id      8
sampler_wave_id               8
sampler_config_file           8
sampler_master_db_name        8
sampler_parent_table_name     8
random                        8
group_id                      8
dtype: int64

In [219]:
#and the rapipro API
from temba_client.v2 import TembaClient
#read in all the data
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'limit_output': 1000})
api_key = '6dd5ab0dfe4836fe70429c1eb9a94e1513017fd8'
client = TembaClient('rapidpro.qa.mhealthengagementlab.org', api_key)


In [267]:
contacts  = client.get_contacts()
cc = contacts.first()


In [259]:
upload_samples['language'] = upload_samples.seed_language.str.replace('_ZA','').copy()
upload_samples['phone'] = upload_samples.seed_msisdn_registration.copy()
samples_to_upload = upload_samples

In [260]:
samples_to_upload

Unnamed: 0,index,Name,seed_uuid,seed_msisdn_registration,seed_facility_code,seed_language,seed_dob,seed_registration_date,seed_message_set,seed_message_sequence,...,sampler_investigation_id,sampler_wave_id,sampler_config_file,sampler_master_db_name,sampler_parent_table_name,random,group_id,language,phone,group_names
0,4,Charles,9cadd74e-5a74-11e7-907b-a6006ad3dba0,27844301160,874006,eng_ZA,1984-12-01,2016-04-13,nurseconnect.hw_full.1,5,...,1,1,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.113586,0,eng,27844301160,investigation_1_wave_1
1,1,Matt,78708a98-5a74-11e7-907b-a6006ad3dba0,27826739844,293442,eng_ZA,1996-06-07,2016-11-25,nurseconnect.hw_full.1,44,...,1,1,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.457787,0,eng,27826739844,investigation_1_wave_1
2,10,Peter,0309f11a-5bdf-11e7-907b-a6006ad3dba0,27733682129,656442,eng_ZA,1992-11-16,2016-11-09,nurseconnect.hw_full.1,49,...,1,2,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.154111,0,eng,27733682129,investigation_1_wave_2
3,2,Cat,7dc1e2ee-5a74-11e7-907b-a6006ad3dba0,27747928763,333951,eng_ZA,1997-10-19,2016-12-17,nurseconnect.hw_full.1,9,...,1,2,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.428192,0,eng,27747928763,investigation_1_wave_2
4,9,Patricia,f8d730d6-5bde-11e7-907b-a6006ad3dba0,27848758702,207742,eng_ZA,1999-01-05,2016-08-03,nurseconnect.hw_full.1,35,...,1,3,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.107332,0,eng,27848758702,investigation_1_wave_3
5,3,Lauren,97aafc68-5a74-11e7-907b-a6006ad3dba0,27767384103,207742,eng_ZA,1990-01-07,2016-12-24,nurseconnect.hw_full.1,9,...,1,3,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.328412,0,eng,27767384103,investigation_1_wave_3
6,0,Jacqui,6cda1942-5a74-11e7-907b-a6006ad3dba0,27713519728,656442,eng_ZA,1984-09-20,2016-05-03,nurseconnect.hw_full.1,15,...,1,3,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.765537,0,eng,27713519728,investigation_1_wave_3
7,5,Kate,aac9476e-5a74-11e7-907b-a6006ad3dba0,27844715231,177751,eng_ZA,1991-10-09,2016-07-13,nurseconnect.hw_full.1,28,...,1,3,Gates-1000-005- QA Testing/qa_definition_wave_...,charlescopley,gates_qa_investigation_1_parent_table,0.859669,0,eng,27844715231,investigation_1_wave_3


Here I create the group names using the data in the sample frame above

In [261]:
samples_to_upload['group_names'] = 'investigation_' + samples_to_upload['sampler_investigation_id'].astype(str) +'_wave_' + samples_to_upload['sampler_wave_id'].astype(str)

We obviously also only need the unique names ...

In [262]:
group_names = samples_to_upload['group_names'].unique()
group_names

array(['investigation_1_wave_1', 'investigation_1_wave_2',
       'investigation_1_wave_3'], dtype=object)

Here we create all the groups that will be required for the above. 
This could be better done by comparing against existing groups and then only making groups where necessary.


In [None]:
for group in iter(group_names):
    client.create_group(group) #create the groups
    print(group)

In [255]:
#client.create_field("sampler-trigger-start-date", "datetime")
#client.create_field("sampler-parent-table-name", "text")
#client.create_field("sampler-config-file", "text")
#client.create_field("sampler-wave-id", "text")
#client.create_field("sampler-type", "text")
#client.create_field("ncmobi-active", "text")
#client.create_field("seed-message-sequence", "text")
#client.create_field("seed-message-set", "text")
#client.create_field("seed-registration-date", "text")
#client.create_field("seed-dob", "text")
#client.create_field("ncmobi-active", "text")
#client.create_field("sampler-investigation-id", "text")
#client.create_field("dhis2-province", "text")
#client.create_field("seed-facility-code", "text")
#client.create_field("seed-msisdn-registration", "text")
#client.create_field("seed-uuid", "text")
#client.create_field("sampler-master-db-name","text")
#client.create_field("investigation-id", "numeric")
#client.create_field("wave-id", "numeric")

<temba_client.v2.types.Field at 0x116ec2048>

And finally we upload the data to rapid pro
We define the column names we will use from the python (pandas) dataframe we have constructed
We then do iterrows to go through each row of the pandas dataframe. 
The to_dict() method of the pandas row is super useful for the custom field uploading
I then use the create_contact() method

In [266]:
custom_field = ['seed_uuid',
                'seed_msisdn_registration',
                'seed_facility_code',
                'seed_language',
                'seed_dob',
                'seed_registration_date',
                'seed_message_set',
                'seed_message_sequence',
                'ncmobi_active',
                'dhis2_province',
                'sampler_trigger_start_date',
                'sampler_type',
                'sampler_investigation_id',
                'sampler_wave_id',
                'sampler_config_file',
                'sampler_master_db_name',
                'sampler_parent_table_name',
               ]

for index,row in samples_to_upload.iterrows():
    custom_field_dict = row[custom_field].to_dict()
    contact_urn = 'tel:+' + str(row['phone']) 
    contact_language = row['language']
    group_vals = row['group_names']
    client.create_contact(name = row['Name'],language=contact_language, urns=[contact_urn], groups = [group_vals], fields=custom_field_dict)
    print(custom_field_dict)

{'seed_uuid': '9cadd74e-5a74-11e7-907b-a6006ad3dba0', 'seed_msisdn_registration': 27844301160, 'seed_facility_code': 874006, 'seed_language': 'eng_ZA', 'seed_dob': '1984-12-01', 'seed_registration_date': '2016-04-13', 'seed_message_set': 'nurseconnect.hw_full.1', 'seed_message_sequence': 5, 'ncmobi_active': False, 'dhis2_province': 'lp Limpopo Province', 'sampler_trigger_start_date': '04-07-2017', 'sampler_type': 'nurse', 'sampler_investigation_id': 1, 'sampler_wave_id': 1, 'sampler_config_file': 'Gates-1000-005- QA Testing/qa_definition_wave_1.json', 'sampler_master_db_name': 'charlescopley', 'sampler_parent_table_name': 'gates_qa_investigation_1_parent_table'}
{'seed_uuid': '78708a98-5a74-11e7-907b-a6006ad3dba0', 'seed_msisdn_registration': 27826739844, 'seed_facility_code': 293442, 'seed_language': 'eng_ZA', 'seed_dob': '1996-06-07', 'seed_registration_date': '2016-11-25', 'seed_message_set': 'nurseconnect.hw_full.1', 'seed_message_sequence': 44, 'ncmobi_active': False, 'dhis2_provi