In [None]:
def merge_custom_sets(cmd_df, agg_sales_df, key="CUSTOMER_COD"):
  """ Function to merge the raw cmd data to sales aggregated to cust id

  Before merging, a 'CHANNEL_CUSTOM' column is derived from 'Détail Canal' by
  collapsing the raw channel labels into AG / SUP / HORECA / SNACK / OTHER.
  Labels that are not listed are carried over unchanged.
  NOTE: the 'CHANNEL_CUSTOM' column is written onto cmd_df itself (in place).

  Arguments:
    cmd_df {DataFrame} -- cmd data, must contain 'Détail Canal' and the key column
    agg_sales_df {DataFrame} -- aggregated sales data, must contain the key column
    key {str} -- name of field to merge on, defaults to "CUSTOMER_COD"

  Returns:
    output -- DataFrame for the customer master data (outer merge of both inputs)
              with helper columns 'cmd', 'sales' and 'data_source'; missing
              'inclusion_flag' values (if that column exists) are set to -1
  """
  # Raw channel labels grouped by the custom channel they collapse into.
  # 'Frui-CREMERIE' used to be listed under OTHER as well; HORECA is applied
  # first, so that second entry could never match and has been dropped.
  channel_groups = (
    ('AG', ['Frui-ALIMENTATION GE']),
    ('SUP', ['Frui-SUPERETTE ET LI']),
    ('HORECA', ['Frui-CREMERIE', 'Frui-RESTAURANT / RO', 'Frui-CAFE/CAFETERIA/',
                'Frui-NIGHT CLUB', 'Frui-HOTELS']),
    ('SNACK', ['Frui-FAST FOOD / PIZ', 'Frui-PIZZERIA']),
    ('OTHER', ['Frui-DOUCHE', 'Frui-BUREAUX DE TABA', 'Frui-MOUKASSIRAT',
               'Frui-PATISSERIES', 'Frui-FOYER', 'Frui-LOISIR',
               'Frui-SALLE DES FETES', 'Frui-MDN', 'Frui-loisir',
               'Frui-ADMINISTRATION', 'Frui-CYBER CAFE']),
  )

  channel = cmd_df['Détail Canal']
  for label, raw_values in channel_groups:
    channel = channel.replace(raw_values, label)
  cmd_df['CHANNEL_CUSTOM'] = channel

  # Marker columns record which side each row came from; after the outer
  # merge a missing side shows up as a missing value in its marker column.
  merged = cmd_df.assign(cmd="cmd").merge(
    agg_sales_df.assign(sales="sales"),
    on=key,  # honour the caller-supplied key instead of a hard-coded name
    how="outer"
  )
  merged = merged.assign(
    data_source=[str(c) + '_' + str(s) for c, s in zip(merged["cmd"], merged["sales"])]
  )

  output = merged.fillna({"inclusion_flag": -1})

  return output

In [None]:
def read_geojson(path: str, file_name: str):
  """ Function to read a geojson file

  Arguments:
    path {str} -- address to directory to read from
    file_name {str} -- name of file to read

  Returns:
    df -- GeoDataFrame as read from geojson file, with its CRS set to EPSG:4326
  """

  # Use a context manager so the file handle is closed as soon as geopandas
  # has parsed the content (the handle was previously left open).
  with open(os.path.join(path, file_name)) as file:
    df = gpd.read_file(file)

  # NOTE(review): set_crs only labels the data as EPSG:4326, it does not
  # reproject -- presumably the source files are already in WGS84; confirm.
  df = df.set_crs(epsg=4326)

  return df


In [9]:
def combine_tizi_ouzou(ta_og, ta_p, internal_combi, urba_gov, top_field='NSR',
                      combi_keepers=None):

  """ Function to combine internal (cmd + sales) and trade area data

  Arguments:
    ta_og {DataFrame} -- trade area dataset prior to aggregations (provides 'CUSTOMER_COD' and 'ADM2_EN')
    ta_p {DataFrame} -- prepped trade area dataset - post aggregations
    internal_combi {DataFrame} -- combined internal (cmd + sales) data, keyed by 'CUSTOMER_COD'
    urba_gov {DataFrame} -- prepped government territory urbanicity dataset (needs a 'Commune' column); not modified
    top_field {str} -- name of field to use for sorting, must be a column of the merged data (defaults to 'NSR')
    combi_keepers {list} -- names of columns to keep from internal_combi data
                            (defaults to id var + aggregated value cols)

  Returns:
    tizi_ouzou_combi -- DataFrame with combined data from the internal and trade area data,
                        sorted by top_field (descending) with a '<top_field>_rank' column
                        (1 = highest value) and the urbanicity columns prefixed 'urba_gov_'.
  """
  # A None default avoids sharing one mutable list between calls
  if combi_keepers is None:
    combi_keepers = ['CUSTOMER_COD', 'sum(NSR)', 'sum(GSR)', 'sum(UNIT_CASES)',
                     'sum(PHYSICAL_CASES)', 'sum(TRANSACTIONS)']

  # 1) Helper for defining top rank name
  top_field_short = top_field + '_rank'

  # 2) Add territory ids to aggregated trade area set (need the territory id from set pre-aggregation)
  tizi_ouzou_ta = pd.merge(
    ta_og[['CUSTOMER_COD', 'ADM2_EN']],
    ta_p,
    how='left',
    on='CUSTOMER_COD'
  )

  # 3) Add sales aggregates for each customer -> keep obs number as in trade area set (thus the left join to ta)
  tizi_ouzou_combi = pd.merge(
    tizi_ouzou_ta,
    internal_combi[list(combi_keepers)],
    how='left',
    on='CUSTOMER_COD'
  ).sort_values(by=top_field, ascending=False)

  # 4) Add the new top rank col to use for selection within groups
  #    (rows are already sorted, so the rank is simply the 1-based position)
  tizi_ouzou_combi[top_field_short] = list(range(1, len(tizi_ouzou_combi) + 1))

  # 5) Add urbanicity for government territories
  #    rename() returns a new frame, so the caller's urba_gov keeps its column
  #    names and repeated calls do not stack the 'urba_gov_' prefix.
  urba_prefixed = urba_gov.rename(
    columns={col: 'urba_gov_' + col for col in urba_gov.columns if col != 'Commune'}
  )
  tizi_ouzou_combi = pd.merge(
    tizi_ouzou_combi, urba_prefixed, how='left', left_on='ADM2_EN', right_on='Commune'
  )

  return tizi_ouzou_combi


In [None]:
def construct_selector(data, group_var='ADM2_EN', equal_brackets=0, gold_portion=0.1 ,bronze_portion=0.1, sample_size=200):
  """ Function to define the number of customer ids to be drawn from each administrative area bracket

    Arguments:
      data {DataFrame} -- input subset of data to work on
      group_var {str} -- name of variable to group by
      equal_brackets {int} -- indicates whether brackets should have the same size (1) or be proportionate to group size (0 -> default value)
      gold_portion {float} -- indicates the portion of gold tier customers to take, value between 0 and 1 (0.1 -> default value)
      bronze_portion {float} -- indicates the portion of bronze tier customers to take, value between 0 and 1 (0.1 -> default value)
      sample_size {int} -- indicates what sample size to run for (defaults to 200)

    Returns:
      select_number -- DataFrame indexed by group with the columns 'number_to_select',
                       'n_gold' (rounded), 'n_bronze' (not rounded) and 'n_silver' (the remainder).
    """

  # 1) Build a dictionary of admin area names and number of ids to draw
  # 1.1) keys (value_counts is computed once so keys and values stay aligned)
  group_sizes = data[group_var].value_counts()
  keys = list(group_sizes.index)

  # 1.2) values
  if equal_brackets==1:
    print("Equal number of items per group!")
    # Split the sample over the groups actually present; a fixed count of 50
    # silently dropped every group beyond the 50th.
    per_group = round(sample_size / len(keys)) if keys else 0
    values = [per_group] * len(keys)
  else:
    print("Number of items proportionate to group size!")
    values = [round(x) for x in group_sizes / len(data) * sample_size]

  # 2) Build selector frame
  select_number = pd.DataFrame({'number_to_select': dict(zip(keys, values))})
  select_number['n_gold'] = [round(n * gold_portion) for n in select_number['number_to_select']]
  select_number['n_bronze'] = select_number['number_to_select'] * bronze_portion
  # silver takes whatever is left once gold and bronze are accounted for
  select_number['n_silver'] = (
    select_number['number_to_select'] - select_number['n_bronze'] - select_number['n_gold']
  )

  return select_number


In [10]:
def _top_ranked_per_group(tier_rows, group_var, quota, rank_col):
  """ Helper: from every group keep the quota[group] rows with the smallest rank_col """
  picks = [
    # groups without a quota entry (or with a negative one) contribute no rows
    rows.nsmallest(max(int(quota.get(name, 0)), 0), columns=rank_col)
    for name, rows in tier_rows.groupby(group_var)
  ]
  if not picks:
    # no groups at all -> empty frame that still carries the input columns
    return tier_rows.iloc[0:0]
  return pd.concat(picks)


def form_subset(data, group_var='ADM2_EN', equal_brackets=0, gold_portion=0.15,bronze_portion=0.1, top_field='NSR', sample_size=200):
  """ Function to form a subset given a custom selector

    Arguments:
      data {DataFrame} -- input subset of data to work on; needs the columns group_var,
                          "Classification client" and '<top_field>_rank'
      group_var {str} -- name of variable to group by
      equal_brackets {int} -- indicates whether brackets should have the same size (1) or be proportionate to group size (0 -> default value)
      gold_portion {float} -- indicates the portion of gold tier customers to take, value between 0 and 1 (0.15 -> default value)
      bronze_portion {float} -- indicates the portion of bronze tier customers to take, value between 0 and 1 (0.1 -> default value)
      top_field {str} -- name of field whose '<top_field>_rank' column the smallest (top) values are drawn from (defaults to 'NSR')
      sample_size {int} -- indicates what sample size to run for (defaults to 200)

    Returns:
      output -- DataFrame subset derived via a pre-defined customer id selector:
                gold rows first, then silver, then bronze, with a fresh 0..n-1 index.
    """

  # 1) Helper for defining top rank name
  top_field_short = top_field + '_rank'

  # 2) Build selector
  #    Keyword arguments on purpose: passed positionally, sample_size landed
  #    in the bronze_portion slot of construct_selector.
  select_number = construct_selector(
    data,
    group_var=group_var,
    equal_brackets=equal_brackets,
    gold_portion=gold_portion,
    bronze_portion=bronze_portion,
    sample_size=sample_size
  )

  # 3) Form subset: one pass per tier, best ranked rows of each group
  TIER= "Classification client"
  tier_quota_cols = (('Gold', 'n_gold'), ('Silver', 'n_silver'), ('Bronze', 'n_bronze'))

  tier_parts = []
  for tier_label, quota_col in tier_quota_cols:
    # quotas may be fractional (n_bronze / n_silver), so round to whole rows
    quota = select_number[quota_col].round().astype(int)
    tier_rows = data.loc[data[TIER] == tier_label]
    tier_parts.append(_top_ranked_per_group(tier_rows, group_var, quota, top_field_short))

  # pd.concat replaces DataFrame.append (removed in pandas 2.0) and now also
  # includes the bronze rows, which used to be computed but never returned.
  output = pd.concat(tier_parts, ignore_index=True)

  return output


In [1]:
def prep_urba_gov(urba_gov):
  """ Function to prepare the raw data on urban/rural government defined provinces

  Arguments:
    urba_gov {DataFrame} -- raw urba_gov data, must contain a 'Commune' column

  Returns:
    urba_commune -- DataFrame with only the rows of urba_gov whose 'Commune'
                    is one of the communes listed below
  """

  # 1) Specify the communes to keep
  communes = [
      'Azeffoun', 'Iflissen', 'Mizrana', 'Tigzirt', 'Ait Chaffaa',
      'Aghrib', 'Zekri', 'Timizart', 'Akerrou', 'Makouda', 'Boudjima',
      'Freha', 'Sidi Naamane', 'Azazga', 'Ouaguenoun', 'Yakouren',
      'Ait Aissa Mimoun', 'Tadmait', 'Tizi Ouzou', 'Draa Ben Khedda',
      'Mekla', 'Souamaa', 'Ait Khelili', 'Idjeur', 'Tizi Rached',
      'Tirmitine', 'Irdjen', 'Ifigha', 'Beni Aissi', 'Ait Yahia Moussa',
      'Ait Oumalou', 'Larba Nait Irathen', 'Beni Zmenzer', 'Beni Douala',
      "M'Kira", 'Bouzguen', 'Ait Aggouacha', 'Maatka', 'Souk El Tenine',
      'Ait Yahia', 'Ait Mahmoud', 'Ain Zaouia', 'Iloula Oumalou',
      'Tizi Ghenif', 'Imsouhal', 'Draa El Mizan', 'Beni Yenni',
      'Ain El Hammam', 'Ouadhia', "Tizi N'Thleta", 'Beni Ziki',
      'Mechtrass', 'Iferhounene', 'Boghni', 'Ait Toudert', 'Abi Youcef',
      'Iboudraren', 'Ouacif', 'Yatafen', 'Ait Bouadou', 'Aghni Goughran',
      'Akbil', 'Illilten', 'Assi Youcef', 'Frikat', 'Bounouh',
      'Ait Boumahdi',
  ]

  # 2) Format dataset
  #    (the filter used to reference the undefined name `Communes`)
  urba_commune = urba_gov.loc[urba_gov.Commune.isin(communes)]

  return urba_commune