In [2]:
import numpy as np 
from datetime import datetime
import pandas as pd 
import math

def redistr_weights(series, lb, ub):
    """
    Parameters
    ----------
    series : pd.Series
        series of weights.
    lb : float
        lower cutoff == min weight per security.
    ub : float 
        upper cutoff == max weight per security. 
    Returns
    -------
    redistr_series : pd.Series
        pd.Series with weights redistributed, capped by lb and ub.
    """
    redistr_series = series.round(25).copy()
    if len(series) > 22:
       
        redistr_series = np.clip(redistr_series, a_min = lb, a_max = ub)
    
        gap = np.subtract(1, redistr_series.sum())
    
        while redistr_series.max() > ub and redistr_series.min() < lb or not math.isclose(gap, 0, rel_tol = 0):
        
            redistr_series = np.clip(redistr_series, a_min = lb, a_max = ub)
        
            idx_okay = redistr_series[(redistr_series <= ub) & (redistr_series >= lb)].index 

            gap = max(0, np.subtract(1, redistr_series.sum()))
        
            redistrib = np.multiply(gap, np.divide(redistr_series[idx_okay], redistr_series[idx_okay].sum()))
        
            redistr_series = np.add(redistr_series, redistrib.reindex(redistr_series.index, fill_value = 0))
    else:
        redistrib = np.array([1/len(series)]*len(series))
        redistr_series = pd.Series(redistrib, index=series.index)
        
    return redistr_series

def equal_weight_by_type(unique_date, final_df):
    weight_list = []
    for i in unique_date:
        df = final_df[final_df['Snapshot Date'] == i]
        pure_play_index = df.loc[df['Type'] == 'Pure-Play'].index.tolist()
        n1 = len(pure_play_index)
        div_index = df.loc[df['Type'] == 'Diversified'].index.tolist()
        n2 = len(div_index)
        df.loc[pure_play_index, 'weight_by_type'] = 0.8 / n1
        df.loc[div_index, 'weight_by_type'] = 0.2 / n2
        print(sum(df.loc[:, 'weight_by_type'].values.tolist()))
        weight_list += df.loc[:, 'weight_by_type'].values.tolist()
    return weight_list


def equal_weight(unique_date, final_df):
    
    weight_list = []
    for i in unique_date:
        print(i)
        df = final_df[final_df['Snapshot Date'] == i]
        n = len(df)
        print(n)
        weight_i = [1 / n] * n
        weight_list += weight_i
    return weight_list


def mcap_weight(unique_date, final_df, ub, lb): # no longer used
    """
    This method aims to output the MCAP and redistributed weight for each snapshot date
    and for each index.
    """
    #lb = 0.0, ub = 0.05
    final_df['mcap_wts'] = final_df.groupby('Snapshot Date')['MVC (Missing Filled with MV)'].apply(lambda x: x/x.sum())   #calculates market-cap weights
    final_df['mcap_wts_redistr'] = final_df.groupby('Snapshot Date')['mcap_wts'].apply(lambda x: redistr_weights(x, lb, ub))
    
    output = final_df['mcap_wts_redistr'].values
    return output


def screening(input_df, index_name, exp_cf, mcap_min, fmcap_min, adtv_min, float_val):
    # True if >= exp_cf
    # True if >= mcap_min & >= fmcap_min & >= adtv_min & >= float_val & price wieght >= 0
    df_boolean = pd.DataFrame(input_df.loc[:, index_name]).applymap(lambda x: True if x >= exp_cf else False)
    boolean2 = (input_df['MVC (Missing Filled with MV)'] >= mcap_min) & (input_df['Float MCAP_snap'] >= fmcap_min) & (input_df['E006'] >= adtv_min) & (input_df['NOSHFF'] >= float_val) & (input_df['Price on Snap'] >= 0)
    list_false = boolean2[boolean2 == False].index.tolist()
    df_boolean.iloc[list_false, :] = [False] * df_boolean.shape[1]
    return df_boolean


def get_exp_true(index_name, final_TF_table, input_table):
    i = final_TF_table[final_TF_table[index_name] == True].index.tolist()
    row = input_table.loc[i, ['Snapshot Date', index_name, 'Final ID', 'E006', 'MVC (Missing Filled with MV)', 'Float MCAP_snap',
                              index_name[:-13]+" (Primary Group)"]]
    row.insert(loc=0, column='Index', value=[index_name[:-13]] * len(row))
    return row


# index_name is the index name plus explosure, rank_method 1, 2, 3 or 4 stands for different methods, pure_play True/False
# group_rank True/False, diversified_group indiates the group set to Diversified
def constituent_cap_v1(tf_df, input_df, index_name, rank_method, pure_play, diversified_group, group_rank):
    '''change 50, 20 as varible exp_cutoff'''
    
    df = get_exp_true(index_name, tf_df, input_df)
    
    df.loc[:, 'Pure_play'] = df[index_name] >= 50
    df.loc[:, 'Diversified'] = (df[index_name] < 50) & (df[index_name] >= 20)
    
    df.loc[((df[index_name[:-13] + ' (Primary Group)'] == diversified_group) & (df[index_name] >= 20)) | (df[index_name] == 100000), 'Pure_play'] = False
    df.loc[((df[index_name[:-13] + ' (Primary Group)'] == diversified_group) & (df[index_name] >= 20)) | (df[index_name] == 100000), 'Diversified'] = True
      
    if pure_play: 
        df_sub = df[df['Pure_play'] == True]
        df_sub.loc[:, "Type"] = len(df_sub) * ['Pure-Play']
        print('The number of eligible pure-play stocks:', len(df_sub))
    
    else:  
        df_sub = df[df['Diversified'] == True]
        print('The number of eligible diversified stocks:', len(df_sub))
        df_sub.loc[:, "Type"] = len(df_sub) * ['Diversified']
        
    # obtain the ranks based on exposure, adtv, mvc
    df_sub.loc[:, "Exp_rank"] = df_sub.groupby("Snapshot Date")[index_name].rank("min", ascending=False).astype(int)
    df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
    df_sub.loc[:, "MVC_rank"] = df_sub.groupby("Snapshot Date")["MVC (Missing Filled with MV)"].rank("first", 
                                                                                              ascending=False).astype(int)

    # obtain the group ranks based on exposure, adtv, mvc
    df_sub.loc[:, "Exp_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                              ])[index_name].rank("min", ascending=False).astype(int)
    df_sub.loc[:, "ADTV_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                               ])["E006"].rank("first", ascending=False).astype(int)
    df_sub.loc[:, "MVC_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                              ])["MVC (Missing Filled with MV)"].rank("first", ascending=False).astype(int)

    # obtain average rank as the average of exp, adtv, mvc ranks
    if rank_method == 1:
        df_sub.loc[:, 'rank_avg'] = df_sub[["ADTV_rank", "MVC_rank"]].mean(axis=1)
    elif rank_method == 2:
        df_sub.loc[:, 'rank_avg'] = df_sub[["Exp_rank", "ADTV_rank", "MVC_rank"]].mean(axis=1)
    elif rank_method == 3:  # obtian the average rank by group
        df_sub.loc[:, 'rank_avg_group'] = df_sub[["ADTV_rank_group", "MVC_rank_group"]].mean(axis=1)
    elif rank_method == 4:
        df_sub.loc[:, 'rank_avg_group'] = df_sub[["Exp_rank_group", "ADTV_rank_group", "MVC_rank_group"]].mean(axis=1)
    else:
        df_sub.loc[:, 'rank_avg'] = 1
    
    if group_rank:
        df_sub.sort_values(['Snapshot Date', 'rank_avg_group', 'ADTV_rank_group'], ascending=[True, True, True], inplace=True)
    else:
        df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
    
    df_sub['rank_final'] = 1
    
    if group_rank:
        df_sub['rank_final'] = df_sub.groupby(['Snapshot Date', index_name[:-13] + " (Primary Group)"])['rank_final'].cumsum()
    else:
        df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()
    
    df_sub.reset_index(drop=True, inplace=True)
    df_sub.drop(['Pure_play', 'Diversified'], axis=1, inplace=True)

    return df_sub


def constituent_cap(tf_df, input_df, index_name, rank_method, pure_play, diversified_group, group_rank):
    '''change 50, 20 as varible exp_cutoff'''
    
    df = get_exp_true(index_name, tf_df, input_df)
    
    df['Type'] = np.where(df[index_name] >= 50, 'Pure-Play', 'Diversified')
    
    if diversified_group != 11111111:
        df['Type_Initial'] = np.where(df[index_name] >= 50, 'Pure-Play', 'Diversified')
        df.loc[((df[index_name[:-13] + ' (Primary Group)'] == diversified_group) & (df[index_name] >= 20)), 'Type'] = 'Diversified'
        
    if pure_play: 
        df_sub = df[df['Type'] == 'Pure-Play']
        print('The number of eligible pure-play stocks:', len(df_sub))
    
    else:  
        df_sub = df[df['Type'] == 'Diversified']
        print('The number of eligible diversified stocks:', len(df_sub))
        
    # obtain the ranks based on exposure, adtv, mvc
    
    df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
    
    #df_sub.loc[:, "Exp_rank"] = df_sub.groupby("Snapshot Date")[index_name].rank("min", ascending=False).astype(int)
    df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
    df_sub.loc[:, "MVC_rank"] = df_sub.groupby("Snapshot Date")["MVC (Missing Filled with MV)"].rank("first", 
                                                                                              ascending=False).astype(int)

    # obtain the group ranks based on exposure, adtv, mvc
    
    
    #df_sub.loc[:, "Exp_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                            #  ])[index_name].rank("min", ascending=False).astype(int)
        
    df_sub.loc[:, "Exp_rank_group"] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date', index_name[:-13] + " (Primary Group)"]).cumcount(ascending=False)+1)
    df_sub.loc[:, "ADTV_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                               ])["E006"].rank("first", ascending=False).astype(int)
    df_sub.loc[:, "MVC_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                              ])["MVC (Missing Filled with MV)"].rank("first", ascending=False).astype(int)

    # obtain average rank as the average of exp, adtv, mvc ranks
    if rank_method == 1:
        df_sub.loc[:, 'rank_avg'] = df_sub[["ADTV_rank", "MVC_rank"]].mean(axis=1)
    elif rank_method == 2:
        df_sub.loc[:, 'rank_avg'] = df_sub[["Exp_rank", "ADTV_rank", "MVC_rank"]].mean(axis=1)
    elif rank_method == 3:  # obtian the average rank by group
        df_sub.loc[:, 'rank_avg_group'] = df_sub[["ADTV_rank_group", "MVC_rank_group"]].mean(axis=1)
    elif rank_method == 4:
        df_sub.loc[:, 'rank_avg_group'] = df_sub[["Exp_rank_group", "ADTV_rank_group", "MVC_rank_group"]].mean(axis=1)
    else:
        df_sub.loc[:, 'rank_avg'] = 1
    
    if group_rank:
        df_sub.sort_values(['Snapshot Date', 'rank_avg_group', 'ADTV_rank_group'], ascending=[True, True, True], inplace=True)
    else:
        df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
    
    df_sub['rank_final'] = 1
    
    if group_rank:
        df_sub['rank_final'] = df_sub.groupby(['Snapshot Date', index_name[:-13] + " (Primary Group)"])['rank_final'].cumsum()
    else:
        df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()
    
    df_sub.reset_index(drop=True, inplace=True)

    return df_sub


def constituent_cap_v1(tf_df, input_df, index_name, rank_method, pure_play, diversified_group, group_rank):
    '''change 50, 20 as varible exp_cutoff'''
    
    df = get_exp_true(index_name, tf_df, input_df)
    
    df['Type'] = np.where(df[index_name] >= 50, 'Pure-Play', 'Diversified')
    
    if diversified_group != 11111111:
        df['Type_Initial'] = np.where(df[index_name] >= 50, 'Pure-Play', 'Diversified')
        df.loc[((df[index_name[:-13] + ' (Primary Group)'] == diversified_group) & (df[index_name] >= 20)), 'Type'] = 'Diversified'
        
    if pure_play: 
        df_sub = df[df['Type'] == 'Pure-Play']
        print('The number of eligible pure-play stocks:', len(df_sub))
    
    else:  
        df_sub = df[df['Type'] == 'Diversified']
        print('The number of eligible diversified stocks:', len(df_sub))
        
    # obtain the ranks based on exposure, adtv, mvc
    
    df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
    
    #df_sub.loc[:, "Exp_rank"] = df_sub.groupby("Snapshot Date")[index_name].rank("min", ascending=False).astype(int)
    df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
    df_sub.loc[:, "MVC_rank"] = df_sub.groupby("Snapshot Date")["MVC (Missing Filled with MV)"].rank("first", 
                                                                                              ascending=False).astype(int)

    # obtain the group ranks based on exposure, adtv, mvc
    df_sub.loc[:, "Exp_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                              ])[index_name].rank("min", ascending=False).astype(int)
    df_sub.loc[:, "ADTV_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                               ])["E006"].rank("first", ascending=False).astype(int)
    df_sub.loc[:, "MVC_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                              ])["MVC (Missing Filled with MV)"].rank("first", ascending=False).astype(int)

    # obtain average rank as the average of exp, adtv, mvc ranks
    if rank_method == 1:
        df_sub.loc[:, 'rank_avg'] = df_sub[["ADTV_rank", "MVC_rank"]].mean(axis=1)
    elif rank_method == 2:
        df_sub.loc[:, 'rank_avg'] = df_sub[["Exp_rank", "ADTV_rank", "MVC_rank"]].mean(axis=1)
    elif rank_method == 3:  # obtian the average rank by group
        df_sub.loc[:, 'rank_avg_group'] = df_sub[["ADTV_rank_group", "MVC_rank_group"]].mean(axis=1)
    elif rank_method == 4:
        df_sub.loc[:, 'rank_avg_group'] = df_sub[["Exp_rank_group", "ADTV_rank_group", "MVC_rank_group"]].mean(axis=1)
    else:
        df_sub.loc[:, 'rank_avg'] = 1
    
    if group_rank:
        df_sub.sort_values(['Snapshot Date', 'rank_avg_group', 'ADTV_rank_group'], ascending=[True, True, True], inplace=True)
    else:
        df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
    
    df_sub['rank_final'] = 1
    
    if group_rank:
        df_sub['rank_final'] = df_sub.groupby(['Snapshot Date', index_name[:-13] + " (Primary Group)"])['rank_final'].cumsum()
    else:
        df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()
    
    df_sub.reset_index(drop=True, inplace=True)

    return df_sub


##### if the number of pure play is less than 50 ######
def include_diversified(eligible, index_name, unique_date, target_count, rank_method, input_df, pure_play_df, diversified_group, group_rank):
    div_dict = dict()
    div_df_list = [pure_play_df]
    for i in unique_date:
        pure_play_num = len(pure_play_df[pure_play_df['Snapshot Date'] == i])
        if pure_play_num < target_count:
            diversified_num = target_count - pure_play_num
            div_dict[i] = diversified_num
    if div_dict != {}:
        diversified_df = constituent_cap(eligible, input_df, index_name, rank_method, False, diversified_group, group_rank)
        for date, num in div_dict.items():
            diversified_sub = diversified_df[diversified_df['Snapshot Date'] == date
                                            ].sort_values('rank_final',ascending = True).head(num)
            div_df_list.append(diversified_sub)
        return pd.concat(div_df_list, axis=0)
    
    else:
        return pure_play_df
    
    
# usually exp_cf1 is 50 and exp_cf2 is 20                                    
def final_select_method(target_count, index_name, input_df, exp_cf1, exp_cf2, mcap_min, fmcap_min, adtv_min, float_val, 
                 rank_method, pure_play, diversified_group, group_rank, weight_method, lb, ub, exclude_group):
    
    if exclude_group != 0:
        input_df.loc[input_df[index_name[:-13] + " (Primary Group)"] == exclude_group, index_name] = 0
    
    # Eligible for pure-play
    eligible = screening(input_df, index_name, exp_cf1, mcap_min, fmcap_min, adtv_min, float_val)
    pure_play_df = constituent_cap(eligible, input_df, index_name, rank_method, True, diversified_group, group_rank)

    if exp_cf2 < exp_cf1:# Eligible for diversified
        print('========================Eligible for diversified==========================')
        eligible_div = screening(input_df, index_name, exp_cf2,  mcap_min, fmcap_min, adtv_min, float_val)
        final_df = include_diversified(eligible_div, index_name, unique_date, target_ct, rank_method, input_df, pure_play_df, 
                                   diversified_group, group_rank)
        
        # Select top 5/10/50/100 stocks based on the target count
        final_select = final_df.sort_values('rank_final',ascending = True).groupby('Snapshot Date').head(target_ct)
    else:
        if rank_method > 2:
            final_select = pure_play_df.sort_values('rank_final', ascending = True).groupby(['Snapshot Date', 
                                                            index_name[:-13] + " (Primary Group)"]).head(target_ct)
        else:
            print("++++++++++Pure Play+++++++++")
            final_select = pure_play_df.sort_values('rank_final', ascending = True).groupby('Snapshot Date').head(target_ct)
    print(final_select.columns)
    
    ######### convert to the original type #######
    if diversified_group != 11111111:
        #print(diversified_group, diversified_group == 11111111, index_name, "=====================")
        final_select.drop('Type', axis=1, inplace=True)
        final_select.rename({'Type_Initial':'Type'}, axis=1, inplace=True)
    
    
    ### Weighting ###
    final_select.sort_values(by=['Snapshot Date', 'Type', 'rank_final'], ascending=[True, False, True], inplace = True)
    final_select.reset_index(drop=True, inplace=True)
    
    if weight_method == 'equal weight': # equal weight
        final_select['equal weight'] = equal_weight(unique_date, final_select)
        
    elif weight_method == 'equal weight by type': # equal weight with pure-play 80% and diversified 20%
        final_select['equal weight by type'] = equal_weight_by_type(unique_date, final_select)
        
    elif weight_method == 'mcap weight': # market captilization redistributed  weight_method == 'market cap'
        final_select['mcap_weight'] = final_select.groupby('Snapshot Date')['MVC (Missing Filled with MV)'].apply(lambda x: x/x.sum())   #calculates market-cap weights
        final_select['mcap_wts_redistr'] = final_select.groupby('Snapshot Date')['mcap_wts'].apply(lambda x: redistr_weights(x, lb, ub))
    
    elif weight_method == 'mcap weight by type':
        final_select['mcap_weight_type'] = final_select.groupby(['Snapshot Date', 'Type'])[['MVC (Missing Filled with MV)',
                                                        'Type']].apply(lambda x: mcap_by_type(x).to_frame('mcap_weight_type'))
        final_select['mcap_wts_type_redistr'] = final_select.groupby('Snapshot Date')['mcap_weight_type'].apply(lambda x: redistr_weights(x, lb, ub))
        
    elif weight_method == 'fmcap weight':
        final_select['fmcap_weight'] = final_select.groupby('Snapshot Date')['Float MCAP'].apply(lambda x: x/x.sum())
        final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, lb, ub))
    
    elif weight_method == 'exp weight':
        final_select['exp weight'] = final_select.groupby(['Snapshot Date'])[index_name].apply(lambda x: EW_exp(x))
        
    elif weight_method == 'exp weight by type':
        final_select['exp weight by type'] = final_select.groupby(['Snapshot Date', 'Type'])[[index_name, 'Type']].apply(
            lambda x: EW_exp_type(x).to_frame('exp_weight_type'))
        
    elif weight_method == 'exp mcap weight':
        final_select['exp_mcap_wt'] = final_select.groupby(['Snapshot Date'])[[index_name, 'MVC (Missing Filled with MV)']].apply(lambda x: EW_exp_mcap(x))
        final_select['exp_mcap_redistr'] = final_select.groupby('Snapshot Date')['exp_mcap_wt'].apply(lambda x: redistr_weights(x, lb, ub))
    
    elif weight_method == 'exp mcap weight by type':
        final_select['exp_mcap_type'] = final_select.groupby(['Snapshot Date', 'Type'])[[index_name, 'MVC (Missing Filled with MV)', 'Type']].apply(
            lambda x: EW_exp_mcap_type(x).to_frame('exp_mcap_type'))
        final_select['exp_mcap_type_redistr'] = final_select.groupby('Snapshot Date')['exp_mcap_type'].apply(lambda x: redistr_weights(x, lb, ub))
    
    else:
        print('not weighting')
    
    ### change the exposure type back to the original one
    #if diversified_group != 0:
    display(final_select.head())
    return final_select


def defensive_pg(df_sub, group_number, unique_date, index_name):
    df_38086 = df_sub.copy()
    df_38086.loc[df_38086['Group'] != group_number, 'Group'] = 0
    df_38086.loc[df_38086['Group'] != group_number, index_name] = 0

    print(len(df_38086[df_38086['Group'] != 0]), len(df_38086[df_38086[index_name] > 0]))

    df_clean = df_38086[df_38086['Group'] != 0]
    print(len(df_clean))

    unique_date = df_clean['Snapshot Date'].unique().tolist()
    
    a = equal_weight(unique_date, df_clean)

    print(len(a), len(unique_date))

    df_clean.loc[:, 'new_weight'] = a

    df_clean2 = df_clean[['Snapshot Date', 'Final ID', 'new_weight', 'Exposure', 'Group']]
    return df_clean2

    
def merge_all(df1, df2):
    return df1.merge(df2, how='left', on=['Snapshot Date','Final ID'])


def get_rbr_name(rbr_list, df3, index_name):
    level = df3.loc[df3['Taxonomy Name'] == index_name, 'Taxonomy Level Used'].values.tolist()
    a = get_rbr_id_map(df1, rbr_list, level)
    b = pd.DataFrame(a.items(), columns=['id', 'name'])
    b.to_excel('C:\\Users\\rzhou\\Downloads\\historical baskets\\rbr_name\\'+ index_name +'.xlsx')

    
def get_rbr_id_map(df_rbr, taxonomy_ls, level_ls):
    """Obtain the dictionary of key as rbr id and value as rbr name"""
    sub_df = df_rbr.loc[(df_rbr.rbr_id.isin(taxonomy_ls)) & df_rbr.level.isin(level_ls)]
    rbr_id_map = dict(zip(sub_df.rbr_id.apply(str), sub_df.name))
    return rbr_id_map


def avg_constituent(iteration, unique_date, final_select, file_name): 
    """Obtain average constituent count"""
    print("The unique group numbers are:", iteration)
    n = len(unique_date)
    composite = final_select.groupby('Snapshot Date')['Group'].size().values
    composite_avg = composite[1:].sum()/n
    print(composite_avg)
    data = [['Composite', composite_avg]]

    for i in iteration: # NEED TO CHANGE INDEX NAME
        sub = final_select[final_select['Group'] == i]
        x = sub.groupby('Snapshot Date')['Group'].count().values
        b = x[1:]
        data.append([i, b.sum()/n])
        print(len(sub), i, b.sum()/n)
        
    turnover = pd.DataFrame(data, columns=['Group', 'Average_Count'])
    #print(turnover)
    turnover.to_excel('C:\\Users\\rzhou\\Downloads\\historical baskets\\avg_constituent_cap\\turnover ' + file_name 
                      + '.xlsx')


def no_constituent_cap_EW(target_ct, index_name, input_df, exp1, exp2):
    index_exp = index_name + ' (Exposure %)'
    index_gp = index_name + ' (Primary Group)'
    a = final_select_method(target_ct, index_exp, input_df, 
             exp1, exp2, 0, 0, 0, 0, 0, True, 11111111, False, 'equal weight', 0, 0, 0)
    a2 = a[['Final ID', 'Snapshot Date', 'Type', index_gp, 'equal_weight']].rename({'Type':'Exposure', 
                            index_gp:'Group', 'equal_weight':index_name+' Index 1'}, axis=1, inplace=False)
    print(a2.shape)
    return a2


def no_constituent_No_weight(target_ct, index_name, input_df, exp1, exp2):
    index_exp = index_name + ' (Exposure %)'
    index_gp = index_name + ' (Primary Group)'
    if exp2 < exp2:
        a = final_select_method(target_ct, index_exp, input_df, 
             exp1, exp2, 0, 0, 0, 0, 0, False, 11111111, False, 'no weight', 0.0, 0.0, 0)
    else:
        a = final_select_method(target_ct, index_exp, input_df, 
             exp1, exp2, 0, 0, 0, 0, 0, True, 11111111, False, 'no weight', 0.0, 0.0, 0)
    a2 = a[['Final ID', 'Snapshot Date', 'Type', index_gp]].rename({'Type':'Exposure', 
                            index_gp:'Group'}, axis=1, inplace=False)
    #print(a2.groupby('Snapshot Date')[index_name+'_'+str(exp2)].sum())
    print(a2.shape)
    return a2


def EW_exp(series): # input: Group by snapshot date
    exp = series.values
    exp[exp == 100000] = 20
    output = exp / 100 
    output_series = pd.Series(output / output.sum(), index=series.index)
    return output_series

def EW_exp_type(series):
    exp_type = series['Type'].unique().tolist()[0]
    exp = series[index_name].values
    exp[exp == 100000] = 20
    exp_pct = exp / 100
    ######
    if exp_type == 'Pure-Play':
        output = 0.8 / exp_pct.sum() * exp_pct
    else:
        output = 0.2 / exp_pct.sum() * exp_pct
        
    #output_series = pd.Series(output).T.reset_index()
    #output_series2 = output_series.drop('index', axis=1)
    output_series2 = pd.Series(output, index=series.index)
    return output_series2

def EW_exp_mcap(series): # input: Group by snapshot date and type
    exp = series[index_name].values
    mcap = series['MVC (Missing Filled with MV)'].values
    exp[exp == 100000] = 20
    mcap_wt = mcap / mcap.sum()
    exp_wt = exp / exp.sum()
    
    output = mcap_wt * exp_wt
    
    output_series = pd.Series(output / output.sum()).T.reset_index()
    output_series2 = output_series.drop('index', axis=1)
    return output_series2

def EW_exp_mcap_type(series): # input: Group by snapshot date and type
    exp = series[index_name].values
    mcap = series['MVC (Missing Filled with MV)'].values
    exp[exp == 100000] = 20
    mcap_wt = mcap / mcap.sum()
    exp_wt = exp / exp.sum()
    exp_type = series['Type'].unique().tolist()[0]
    
    output = mcap_wt * exp_wt
    
    if exp_type == 'Pure-Play':
        output_series = pd.Series(0.8 * output / output.sum(), index=series.index)
    else:
        output_series = pd.Series(0.2 * output / output.sum(), index=series.index)
    
    #output_series2 = output_series.drop('index', axis=1)
    return output_series

def mcap_by_type(series): # input: Group by snapshot date and type
    exp_type = series['Type'].unique().tolist()[0]
    mcap = series['MVC (Missing Filled with MV)'].values
    
    if exp_type == 'Pure-Play':
        output = 0.8 / mcap.sum() * mcap 
    else:
        output = 0.2 / mcap.sum() * mcap 
        
    output_series = pd.Series(output, index=series.index)
    return output_series
    
    
##### INPUT FILE ######    
# input_df = pd.read_excel('C:\\Users\\rzhou\\Downloads\\March13_Thematic_Data\\exp_group_March13_final3.xlsx') # Universe Data
#input_df = pd.read_excel('C:\\Users\\rzhou\\Downloads\\2006_Defensive\\data_filled_exp_code.xlsx') # Universe Data
input_df = pd.read_excel('3000_check_june_23._rebalance_levels_with_bases.xlsx')

#df1 = pd.read_excel('C:\\Users\\rzhou\\Downloads\\Project_1\\rbr_id_map_2022-10-02_no_filter.xlsx') # Uncomment it for individual primary group
#df3 = pd.read_excel('C:\\Users\\rzhou\\Downloads\\Project_1\\taxonomy_name_level2.xlsx')
#input_df.loc[input_df['Digital Health (Primary Group)'] == 44257, 'Digital Health (Exposure %)'] = 0
#input_df.loc[input_df['Inflation (Primary Group)'] == 42903, 'Inflation (Exposure %)'] = 0

# Convert 100000 to 20

input_df.replace([100000], 20, inplace=True)

# Convert datetime to string
input_df['Snapshot Date'] = input_df['Snapshot Date'].astype(str) #####
input_df['Snapshot Date'] = pd.to_datetime(input_df['Snapshot Date']) #####
input_df['Snapshot Date'] = input_df['Snapshot Date'].dt.strftime('%Y-%m-%d') 

# Truncate exposure to 10 decimal points
#input_df.loc[:, 'Cybersecurity (Exposure %)':'Digital Health (Exposure %)'] = input_df.loc[:, 
    #                                                        'Cybersecurity (Exposure %)':'Digital Health (Exposure %)'].round(10)

input_df.loc[:, 'Defensive (Exposure %)'] = input_df.loc[:, 
                                                            'Defensive (Exposure %)'].round(10)#####

# Obtain a list of unique snapshot date (in ascending order)
unique_date = input_df['Snapshot Date'].unique().tolist()

df2 = input_df[['Snapshot Date', 'Rebal Date',
       'Ex Date', 'Weight Date', 'OPID', 'DSCD', 'ISIN',
       'Final ID']]

df2.loc[:, 'Final ID'] =df2.loc[:, 'Final ID'].astype(str)
df2.loc[:, 'Snapshot Date'] = df2.loc[:, 'Snapshot Date'].astype(str)

index_list = ['Cybersecurity', 'Cloud', 'IoT', 'Battery', 'Bio Revolution',  'SaaS', 
              'Defensive',  'Real Asset', 'Digital Health', 'E-Commerce', 'Clean Energy',
              'Infrastructure', 'Inflation']
#index_list = ['Defensive']

index_name = ['Cybersecurity (Primary Group)', 'Cloud (Primary Group)',
       'IoT (Primary Group)', 'E-Commerce (Primary Group)',
       'Battery (Primary Group)', 'Bio Revolution (Primary Group)',
       'Clean Energy (Primary Group)', 'SaaS (Primary Group)',
       'Defensive (Primary Group)', 'Infrastructure (Primary Group)',
       'Real Asset (Primary Group)', 'Inflation (Primary Group)',
       'Digital Health (Primary Group)']
#index_name=['Defensive (Primary Group)']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.loc[:, 'Final ID'] =df2.loc[:, 'Final ID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.loc[:, 'Snapshot Date'] = df2.loc[:, 'Snapshot Date'].astype(str)


In [28]:
### TEST CAN BE IGNORED WHEN RUNNING THE CODE###
eligible = screening(input_df, index_name, exp_cf1, mcap_min, fmcap_min, adtv_min, float_val)
eligible2 = screening(input_df, index_name, exp_cf2, mcap_min, fmcap_min, adtv_min, float_val)

xx = constituent_cap(eligible, input_df, index_name, rank_method,True, diversified_group, group_rank)
xx2 = constituent_cap(eligible2, input_df, index_name, rank_method,False, diversified_group, group_rank)

xx_0831 = xx[xx['Snapshot Date'] == '2022-11-30']
xx2_0831 = xx2[xx2['Snapshot Date'] == '2022-11-30']
print(len(xx_0831), len(xx2_0831))

"""
boolean2 = input_df.loc[(input_df['MVC (Missing Filled with MV)'] >= mcap_min) & (input_df['Float MCAP_snap'] >= fmcap_min) & (input_df['E006'] >= adtv_min) 
                        & (input_df['NOSHFF'] >= float_val) & (input_df['Price on Snap'] >= 0)&(input_df[index_name]>=20), ['Snapshot Date', 'ISIN', index_name, 'Float MCAP_snap', 'Float MCAP']]

print(len(boolean2[boolean2['Snapshot Date'] == '2022-08-31']))#boolean2.groupby('Snapshot Date')['ISIN'].nunique()
xx2_0831['Final ID'].astype(str).isin(['5082179135', '5081376992', '5067946063', '5076698482'])

test = input_df.loc[(input_df[index_name]>=20) &(input_df['Float MCAP']>500) &(input_df['Float MCAP_snap']<500), ['ISIN', 'Snapshot Date', index_name, 'Float MCAP_snap', 'Float MCAP']]
xx2 = boolean2[boolean2['Snapshot Date'] == '2022-08-31']
df = get_exp_true(index_name, eligible, input_df)
xx2[xx2['Real Asset (Primary Group)'] == 46210].sort_values('ADTV_rank_group')
test = input_df.loc[(input_df[index_name]>=20) &(input_df['Price on Weight']>0) &(input_df['Price on Snap']<=0), ['ISIN', 'Snapshot Date', index_name, 'Price on Snap', 'Price on Weight']]
"""

The number of eligible pure-play stocks: 264
The number of eligible diversified stocks: 1052


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

In [4]:
"""13 THEMATIC INDEDICES"""
rule = pd.read_excel('C:\\Users\\rzhou\\OneDrive\\Documents\\RULES.xlsx') # import the rules of indices

# Obtain the list of exposure cutoff, mcap cutoff, etc.
index_name_ls = rule.columns[1:].tolist()
target_ct_ls = rule.iloc[0].values[1:].tolist()
exp_cf1_ls = rule.iloc[1].values[1:].tolist()
exp_cf2_ls = rule.iloc[2].values[1:].tolist()
mcap_min_ls = rule.iloc[3].values[1:].tolist()
fmcap_min_ls = rule.iloc[4].values[1:].tolist()
adtv_min_ls = rule.iloc[5].values[1:].tolist()
float_val_ls = rule.iloc[6].values[1:].tolist()
rank_method_ls = rule.iloc[7].values[1:].tolist()
diversified_group_ls = rule.iloc[8].values[1:].tolist()
group_rank_ls = rule.iloc[10].values[1:].tolist()
weight_method_ls = rule.iloc[11].values[1:].tolist()

exclude_group, lb, ub = 0, 0, 0
final_ls = []
for i in range(13):######## iterate thru the 13 thematic indices
   
    index_name = index_name_ls[i]
    index_gp = index_name[:-13] + ' (Primary Group)' 
    target_ct = target_ct_ls[i]
    print(target_ct)
    print(index_name, "+++++++++++++++++++++++++++++++++++")
    exp_cf1 = exp_cf1_ls[i]
    exp_cf2 = exp_cf2_ls[i]
    mcap_min = mcap_min_ls[i]
    fmcap_min = fmcap_min_ls[i]
    adtv_min = adtv_min_ls[i]
    float_val = float_val_ls[i]
    rank_method = rank_method_ls[i]
    diversified_group = diversified_group_ls[i]
    group_rank = group_rank_ls[i]
    weight_method = weight_method_ls[i]
    # implement the final_select_method to obtain the final selected companies/securities
    final_select_df = final_select_method(target_ct, index_name, input_df, exp_cf1, exp_cf2, mcap_min, fmcap_min, adtv_min, float_val, 
                 rank_method, True, diversified_group, group_rank, weight_method, lb, ub, exclude_group)
    
   # if diversified_group != 11111111:
        #print(diversified_group, diversified_group == 11111111, index_name, "=====================")
    #    a2 = final_select_df[['Final ID', 'Snapshot Date', 'Type_Initial', index_gp, weight_method]].rename({'Type_Initial':'Exposure', 
          #                                                                                           index_gp:'Group', weight_method:index_name[:-13]}, axis=1, inplace=False)
    #else:
        #print(diversified_group, diversified_group == 11111111, index_name, "=====================")
    a2 = final_select_df[['Final ID', 'Snapshot Date', 'Type', index_gp, weight_method]].rename({'Type':'Exposure', 
                                                                                                     index_gp:'Group', weight_method:index_name[:-13]}, axis=1, inplace=False)
    
    print(a2.groupby('Snapshot Date')[index_name[:-13]].sum())
    print(a2.shape)
    final_ls.append(a2)

    
df2['Snapshot Date'] = df2['Snapshot Date'].astype(str)
final_ls[0]['Snapshot Date'] =  final_ls[0]['Snapshot Date'].astype(str)

df2['Final ID'] = df2['Final ID'].astype(str)
final_ls[0]['Final ID'] = final_ls[0]['Final ID'].astype(str)

# Merge the 13 tables of final selection
cur = df2.merge(final_ls[0], how='left', on=['Snapshot Date','Final ID'])
#print(cur.shape)

for i in final_ls[1:]:
    temp = cur.copy()
    i['Snapshot Date'] = i['Snapshot Date'].astype(str)
    i['Final ID'] = i['Final ID'].astype(str)
    cur = merge_all(temp, i)
    print(cur.shape)

# Rename the columns, and fill NA by zero
cur.rename({'Snapshot Date':'snapshot date', 'Rebal Date':'ex date', 'Weight Date':'weight date', 'DSCD':'ticker', 
           'Ex Date':'open date'}, axis=1, inplace=True)
cur.fillna(0, inplace=True)
cur.to_excel('C:\\Users\\rzhou\\Downloads\\historical basket_thematic.xlsx')


50
Battery (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 18


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

The number of eligible diversified stocks: 30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "MVC_rank"] = df_sub.groupby("Snapshot Date")["MVC (Missing Filled with MV)"].rank("first",
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "Exp_rank_group"] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date', index_name[:-13] + " (Primary Group)"]).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas

Index(['Index', 'Snapshot Date', 'Battery (Exposure %)', 'Final ID', 'E006',
       'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Battery (Primary Group)', 'Type', 'Type_Initial', 'Exp_rank',
       'ADTV_rank', 'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group',
       'MVC_rank_group', 'rank_avg', 'rank_final'],
      dtype='object')
2023-02-28
48


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()


Unnamed: 0,Index,Snapshot Date,Battery (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Battery (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight
0,Battery,2023-02-28,100.0,5037344406,481649.7,17896.21,10558.7639,41765,Pure-Play,1,2,3,1,1,1,2.0,1,0.020833
1,Battery,2023-02-28,97.819842,4297089638,27912750.0,650887.2,566271.864,41763,Pure-Play,8,1,2,8,1,2,3.666667,1,0.020833
2,Battery,2023-02-28,100.0,4295914572,245965.5,8823.77,7500.2045,41764,Pure-Play,2,4,4,1,1,1,3.333333,2,0.020833
3,Battery,2023-02-28,100.0,5076109427,307538.6,16712.01,6517.6839,41763,Pure-Play,1,7,10,1,6,8,6.0,2,0.020833
4,Battery,2023-02-28,100.0,5073625014,62178.48,6216.62,5159.7946,41761,Pure-Play,3,7,5,1,4,3,5.0,3,0.020833


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)


Snapshot Date
2023-02-28    1.0
Name: Battery, dtype: float64
(48, 5)
50
Bio Revolution (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "MVC_rank"] = df_sub.groupby("Snapshot Date")["MVC (Missing Filled with MV)"].rank("first",
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "Exp_rank_group"] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date', index_name[:-13] + " (Primary Group)"]).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas

The number of eligible diversified stocks: 42


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "Exp_rank_group"] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date', index_name[:-13] + " (Primary Group)"]).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas

Index(['Index', 'Snapshot Date', 'Bio Revolution (Exposure %)', 'Final ID',
       'E006', 'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Bio Revolution (Primary Group)', 'Type', 'Type_Initial', 'Exp_rank',
       'ADTV_rank', 'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group',
       'MVC_rank_group', 'rank_avg', 'rank_final'],
      dtype='object')
2023-02-28
50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()


Unnamed: 0,Index,Snapshot Date,Bio Revolution (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Bio Revolution (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight
0,Bio Revolution,2023-02-28,100.0,5037927972,45772.017,1143.45,1074.843,44213,Pure-Play,2,2,5,1,1,1,3.0,1,0.02
1,Bio Revolution,2023-02-28,83.289242,4295900447,261618.7954,31473.61,26437.8324,44215,Pure-Play,17,6,6,17,5,5,9.666667,1,0.02
2,Bio Revolution,2023-02-28,100.0,4295899267,27590.4149,2055.66,1788.4242,44210,Pure-Play,6,3,2,2,2,2,3.666667,2,0.02
3,Bio Revolution,2023-02-28,53.669754,5066581979,201901.6073,44388.14,42168.733,44210,Pure-Play,14,1,1,9,1,1,5.333333,3,0.02
4,Bio Revolution,2023-02-28,100.0,4297801682,12801.4084,438.76,403.6592,44210,Pure-Play,3,4,10,1,3,6,5.666667,4,0.02


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)


Snapshot Date
2023-02-28    1.0
Name: Bio Revolution, dtype: float64
(50, 5)
100
Clean Energy (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 61
The number of eligible diversified stocks: 34


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "MVC_rank"] = df_sub.groupby("Snapshot Date")["MVC (Missing Filled with MV)"].rank("first",
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Index(['Index', 'Snapshot Date', 'Clean Energy (Exposure %)', 'Final ID',
       'E006', 'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Clean Energy (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank',
       'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group',
       'rank_avg', 'rank_final'],
      dtype='object')
0.9999999999999982


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()


Unnamed: 0,Index,Snapshot Date,Clean Energy (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Clean Energy (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight by type
0,Clean Energy,2023-02-28,100.0,4297089638,27912750.0,650887.2,566271.864,44224,Pure-Play,3,1,1,1,1,1,1.666667,1,0.013115
1,Clean Energy,2023-02-28,100.0,4298065499,1057339.0,28615.98,26899.0212,44224,Pure-Play,4,2,4,2,2,2,3.333333,2,0.013115
2,Clean Energy,2023-02-28,100.0,5037344406,481649.7,17896.21,10558.7639,44231,Pure-Play,5,4,6,2,1,1,5.0,3,0.013115
3,Clean Energy,2023-02-28,100.0,5076109427,307538.6,16712.01,6517.6839,44231,Pure-Play,6,8,8,3,2,2,7.333333,4,0.013115
4,Clean Energy,2023-02-28,100.0,4295914572,245965.5,8823.77,7500.2045,44237,Pure-Play,7,9,9,1,1,1,8.333333,5,0.013115


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)


Snapshot Date
2023-02-28    1.0
Name: Clean Energy, dtype: float64
(95, 5)
50
Cloud (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 26
The number of eligible diversified stocks: 20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "MVC_rank"] = df_sub.groupby("Snapshot Date")["MVC (Missing Filled with MV)"].rank("first",
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Index(['Index', 'Snapshot Date', 'Cloud (Exposure %)', 'Final ID', 'E006',
       'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Cloud (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank', 'MVC_rank',
       'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group', 'rank_avg',
       'rank_final'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()


Unnamed: 0,Index,Snapshot Date,Cloud (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Cloud (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,exp weight by type
0,Cloud,2023-02-28,100.0,4295900491,280561.8423,63860.53,60028.8982,46081,Pure-Play,2,4,1,1,1,1,2.333333,1,0.034696
1,Cloud,2023-02-28,100.0,5037627667,362616.6031,24461.23,24461.23,46083,Pure-Play,1,3,6,1,1,2,3.333333,2,0.034696
2,Cloud,2023-02-28,100.0,5042238586,278220.153,42497.03,36122.4755,46082,Pure-Play,3,5,4,1,1,1,4.0,3,0.034696
3,Cloud,2023-02-28,100.0,4295909064,205623.7143,30347.3,28526.462,46084,Pure-Play,4,6,5,1,1,1,5.0,4,0.034696
4,Cloud,2023-02-28,93.532662,5044027756,810083.499,49648.6,49648.6,46086,Pure-Play,17,1,2,1,1,1,6.666667,5,0.032452


Snapshot Date
2023-02-28    1.0
Name: Cloud, dtype: float64
(46, 5)
50
Cybersecurity (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 23
The number of eligible diversified stocks: 16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Index(['Index', 'Snapshot Date', 'Cybersecurity (Exposure %)', 'Final ID',
       'E006', 'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Cybersecurity (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank',
       'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group',
       'rank_avg', 'rank_final'],
      dtype='object')
0.9999999999999996


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()


Unnamed: 0,Index,Snapshot Date,Cybersecurity (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Cybersecurity (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight by type
0,Cybersecurity,2023-02-28,100.0,4296000356,713762.5169,57002.23,57002.23,44240,Pure-Play,1,1,1,1,1,1,1.0,1,0.034783
1,Cybersecurity,2023-02-28,100.0,4295935222,285485.2161,46604.86,37749.9366,44240,Pure-Play,3,4,2,3,4,2,3.0,2,0.034783
2,Cybersecurity,2023-02-28,100.0,5036689323,319741.7407,19032.51,11990.4813,44240,Pure-Play,2,3,5,2,3,5,3.333333,3,0.034783
3,Cybersecurity,2023-02-28,100.0,5000589109,278316.4853,37029.7,27401.978,44240,Pure-Play,4,5,3,4,5,3,4.0,4,0.034783
4,Cybersecurity,2023-02-28,100.0,4295908065,76809.1216,12469.41,10598.9985,44240,Pure-Play,5,9,8,5,7,6,7.333333,5,0.034783


Snapshot Date
2023-02-28    1.0
Name: Cybersecurity, dtype: float64
(39, 5)
50
Defensive (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 79
++++++++++Pure Play+++++++++
Index(['Index', 'Snapshot Date', 'Defensive (Exposure %)', 'Final ID', 'E006',
       'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Defensive (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank',
       'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group',
       'rank_avg', 'rank_final'],
      dtype='object')
2023-02-28
50


Unnamed: 0,Index,Snapshot Date,Defensive (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Defensive (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight
0,Defensive,2023-02-28,100.0,4295904414,1014705.0,296389.4,248967.096,46128,Pure-Play,1,3,4,1,2,2,2.666667,1,0.02
1,Defensive,2023-02-28,100.0,5037613143,905213.9,272310.8,258695.26,46128,Pure-Play,2,6,5,2,4,3,4.333333,2,0.02
2,Defensive,2023-02-28,100.0,4295904718,852661.8,238994.3,227044.585,46127,Pure-Play,3,8,8,1,4,4,6.333333,3,0.02
3,Defensive,2023-02-28,100.0,4295903091,812931.1,257458.4,234287.144,46127,Pure-Play,4,9,7,2,5,3,6.666667,4,0.02
4,Defensive,2023-02-28,100.0,4295905537,672860.4,123700.9,116278.846,46128,Pure-Play,5,11,14,3,5,6,10.0,5,0.02


Snapshot Date
2023-02-28    1.0
Name: Defensive, dtype: float64
(50, 5)
50
Digital Health (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 28
The number of eligible diversified stocks: 23


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Index(['Index', 'Snapshot Date', 'Digital Health (Exposure %)', 'Final ID',
       'E006', 'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Digital Health (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank',
       'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group',
       'rank_avg', 'rank_final'],
      dtype='object')
0.999999999999999


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, 'rank_avg'] = df_sub[["Exp_rank", "ADTV_rank", "MVC_rank"]].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = 1
A value is trying to be set on a copy of a 

Unnamed: 0,Index,Snapshot Date,Digital Health (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Digital Health (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight by type
0,Digital Health,2023-02-28,100.0,4295902139,292067.6741,42895.77,40322.0238,46054,Pure-Play,1,1,1,1,1,1,1.0,1,0.028571
1,Digital Health,2023-02-28,100.0,4295899512,64117.2176,8800.83,5808.5478,46054,Pure-Play,2,5,4,2,3,3,3.666667,2,0.028571
2,Digital Health,2023-02-28,100.0,4298034272,47027.3245,5916.59,3017.4609,46063,Pure-Play,3,8,6,1,3,2,5.666667,3,0.028571
3,Digital Health,2023-02-28,92.045868,4295902738,116380.4253,19218.71,13645.2841,46054,Pure-Play,17,4,3,6,2,2,8.0,4,0.028571
4,Digital Health,2023-02-28,100.0,4296286399,32407.1938,3555.85,3022.4725,46054,Pure-Play,5,10,10,4,5,4,8.333333,5,0.028571


Snapshot Date
2023-02-28    1.0
Name: Digital Health, dtype: float64
(50, 5)
50
E-Commerce (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 36
The number of eligible diversified stocks: 18


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Index(['Index', 'Snapshot Date', 'E-Commerce (Exposure %)', 'Final ID', 'E006',
       'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'E-Commerce (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank',
       'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group',
       'rank_avg', 'rank_final'],
      dtype='object')
2023-02-28
50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, 'rank_avg'] = df_sub[["Exp_rank", "ADTV_rank", "MVC_rank"]].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = 1
A value is trying to be set on a copy of a 

Unnamed: 0,Index,Snapshot Date,E-Commerce (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,E-Commerce (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight
0,E-Commerce,2023-02-28,100.0,5045870012,1115151.0,83269.0,83269.0,46107,Pure-Play,3,2,5,1,1,1,3.333333,1,0.02
1,E-Commerce,2023-02-28,100.0,5001437821,670851.8,79013.69,75063.0055,46100,Pure-Play,4,6,6,1,3,2,5.333333,2,0.02
2,E-Commerce,2023-02-28,100.0,4295912318,552539.0,156923.6,149077.42,46104,Pure-Play,17,7,2,7,1,1,8.666667,3,0.02
3,E-Commerce,2023-02-28,94.533674,4295914598,713357.8,95024.44,95024.44,46100,Pure-Play,20,4,4,5,1,1,9.333333,4,0.02
4,E-Commerce,2023-02-28,100.0,5045821058,280967.8,21429.55,16715.049,46100,Pure-Play,6,11,13,3,5,6,10.0,5,0.02


Snapshot Date
2023-02-28    1.0
Name: E-Commerce, dtype: float64
(50, 5)
100
Inflation (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 42
The number of eligible diversified stocks: 16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Index(['Index', 'Snapshot Date', 'Inflation (Exposure %)', 'Final ID', 'E006',
       'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Inflation (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank',
       'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group',
       'rank_avg', 'rank_final'],
      dtype='object')
0.9999999999999998


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()


Unnamed: 0,Index,Snapshot Date,Inflation (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Inflation (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight by type
0,Inflation,2023-02-28,100.0,4295903051,727291.8,125960.6,119662.57,46019,Pure-Play,1,4,4,1,3,2,3.0,1,0.019048
1,Inflation,2023-02-28,100.0,4295904645,768876.2,52708.23,41112.4194,46019,Pure-Play,5,3,9,5,2,5,5.666667,2,0.019048
2,Inflation,2023-02-28,100.0,4295912113,461490.7,66424.5,63103.275,46019,Pure-Play,2,11,6,2,7,3,6.333333,3,0.019048
3,Inflation,2023-02-28,97.819842,4297089638,27912750.0,650887.2,566271.864,46023,Pure-Play,22,1,1,3,1,1,8.0,4,0.019048
4,Inflation,2023-02-28,100.0,4295908588,501864.2,58615.56,51581.6928,46019,Pure-Play,8,9,8,8,6,4,8.333333,5,0.019048


Snapshot Date
2023-02-28    1.0
Name: Inflation, dtype: float64
(58, 5)
100
Infrastructure (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 141
Index(['Index', 'Snapshot Date', 'Infrastructure (Exposure %)', 'Final ID',
       'E006', 'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Infrastructure (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank',
       'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group',
       'rank_avg', 'rank_final'],
      dtype='object')
2023-02-28
100


Unnamed: 0,Index,Snapshot Date,Infrastructure (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Infrastructure (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight
0,Infrastructure,2023-02-28,100.0,4295903239,299641.1,68666.56,61113.2384,46140,Pure-Play,2,13,5,1,7,4,6.666667,1,0.01
1,Infrastructure,2023-02-28,100.0,5001437767,336858.8,46453.43,44130.7585,46140,Pure-Play,8,10,10,4,5,8,9.333333,2,0.01
2,Infrastructure,2023-02-28,100.0,4298065499,1057339.0,28615.98,26899.0212,46131,Pure-Play,7,1,21,3,1,6,9.666667,3,0.01
3,Infrastructure,2023-02-28,100.0,4295904675,233593.6,38522.18,30047.3004,46140,Pure-Play,10,21,14,5,11,11,15.0,4,0.01
4,Infrastructure,2023-02-28,100.0,4297906412,324082.9,17850.12,17850.12,46131,Pure-Play,9,11,39,4,6,13,19.666667,5,0.01


Snapshot Date
2023-02-28    1.0
Name: Infrastructure, dtype: float64
(100, 5)
50
IoT (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 21
The number of eligible diversified stocks: 28


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Index(['Index', 'Snapshot Date', 'IoT (Exposure %)', 'Final ID', 'E006',
       'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'IoT (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank', 'MVC_rank',
       'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group', 'rank_avg',
       'rank_final'],
      dtype='object')
0.999999999999999


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, "MVC_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:, 'rank_avg'] = df_sub[["Exp_rank", "ADTV_rank", "MVC_rank"]].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascendi

Unnamed: 0,Index,Snapshot Date,IoT (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,IoT (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight by type
0,IoT,2023-02-28,100.0,4295905486,219948.7317,17736.75,16672.545,46089,Pure-Play,2,3,3,1,1,1,2.666667,1,0.038095
1,IoT,2023-02-28,100.0,5042255248,132385.6588,10077.82,8868.4816,46089,Pure-Play,3,4,5,2,2,2,4.0,2,0.038095
2,IoT,2023-02-28,100.000273,5046010807,21198.9048,8654.86,8135.5684,46094,Pure-Play,1,9,6,1,3,2,5.333333,3,0.038095
3,IoT,2023-02-28,82.675107,4295902139,292067.6741,42895.77,40322.0238,46096,Pure-Play,14,1,2,1,1,1,5.666667,4,0.038095
4,IoT,2023-02-28,100.0,4296920438,56070.5665,3436.84,2646.3668,46094,Pure-Play,4,7,9,2,2,3,6.666667,5,0.038095


Snapshot Date
2023-02-28    1.0
Name: IoT, dtype: float64
(49, 5)
10
Real Asset (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 284
Index(['Index', 'Snapshot Date', 'Real Asset (Exposure %)', 'Final ID', 'E006',
       'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'Real Asset (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank',
       'MVC_rank', 'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group',
       'rank_avg_group', 'rank_final'],
      dtype='object')
2023-02-28
67


Unnamed: 0,Index,Snapshot Date,Real Asset (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,Real Asset (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg_group,rank_final,equal weight
0,Real Asset,2023-02-28,100.0,4295903946,159229.7261,23670.23,20119.6955,46156,Pure-Play,38,55,48,1,1,2,1.333333,1,0.014925
1,Real Asset,2023-02-28,100.0,4295903337,118338.4381,25924.97,22036.2245,46160,Pure-Play,45,73,44,5,8,4,5.666667,1,0.014925
2,Real Asset,2023-02-28,100.0,4295904924,193151.5065,39915.05,37120.9965,46146,Pure-Play,33,43,27,5,2,2,3.0,1,0.014925
3,Real Asset,2023-02-28,100.0,4295903051,727291.785,125960.6,119662.57,46196,Pure-Play,6,5,5,1,4,3,2.666667,1,0.014925
4,Real Asset,2023-02-28,100.0,4295904675,233593.6339,38522.18,30047.3004,46165,Pure-Play,31,37,28,2,6,6,4.666667,1,0.014925


Snapshot Date
2023-02-28    1.0
Name: Real Asset, dtype: float64
(67, 5)
50
SaaS (Exposure %) +++++++++++++++++++++++++++++++++++
The number of eligible pure-play stocks: 98
Index(['Index', 'Snapshot Date', 'SaaS (Exposure %)', 'Final ID', 'E006',
       'MVC (Missing Filled with MV)', 'Float MCAP_snap',
       'SaaS (Primary Group)', 'Type', 'Exp_rank', 'ADTV_rank', 'MVC_rank',
       'Exp_rank_group', 'ADTV_rank_group', 'MVC_rank_group', 'rank_avg',
       'rank_final'],
      dtype='object')
2023-02-28
50


Unnamed: 0,Index,Snapshot Date,SaaS (Exposure %),Final ID,E006,MVC (Missing Filled with MV),Float MCAP_snap,SaaS (Primary Group),Type,Exp_rank,ADTV_rank,MVC_rank,Exp_rank_group,ADTV_rank_group,MVC_rank_group,rank_avg,rank_final,equal weight
0,SaaS,2023-02-28,100.0,4295905431,1008337.0,148304.3,140889.085,46117,Pure-Play,2,2,3,2,1,1,2.333333,1,0.02
1,SaaS,2023-02-28,100.0,5037627667,362616.6,24461.23,24461.23,46124,Pure-Play,3,10,10,1,1,1,7.666667,2,0.02
2,SaaS,2023-02-28,100.0,5036689323,319741.7,19032.51,11990.4813,46121,Pure-Play,4,11,13,1,5,5,9.333333,3,0.02
3,SaaS,2023-02-28,100.0,5038054958,275941.0,22302.5,22302.5,46120,Pure-Play,6,14,11,1,2,2,10.333333,4,0.02
4,SaaS,2023-02-28,100.0,4297160277,315396.9,12335.89,12335.89,46114,Pure-Play,5,12,18,1,3,4,11.666667,5,0.02


Snapshot Date
2023-02-28    1.0
Name: SaaS, dtype: float64
(50, 5)
(1635, 14)
(1635, 17)
(1635, 20)
(1635, 23)
(1635, 26)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Snapshot Date'] = df2['Snapshot Date'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Final ID'] = df2['Final ID'].astype(str)
  return df1.merge(df2, how='left', on=['Snapshot Date','Final ID'])
  return df1.merge(df2, how='left', on=['Snapshot Date','Final ID'])
  return df1.merge(df2, how='left', on=['Snapshot Date','Final ID'])


(1635, 29)
(1635, 32)
(1635, 35)
(1635, 38)
(1635, 41)
(1635, 44)
(1635, 47)


  return df1.merge(df2, how='left', on=['Snapshot Date','Final ID'])
  return df1.merge(df2, how='left', on=['Snapshot Date','Final ID'])


In [3]:
curr = cur[cur['Battery'] > 0]
curr.groupby('snapshot date')['Final ID'].nunique()

snapshot date
2023-02-28    50
Name: Final ID, dtype: int64

In [5]:
curr.loc[curr['Group'] == '41763', 'Exposure'] #= 'Diversified'

Series([], Name: Exposure, dtype: object)

In [7]:
curr_group = pd.DataFrame(curr.groupby('snapshot date')['Exposure'].value_counts(normalize=True))
curr_group

Unnamed: 0_level_0,Unnamed: 1_level_0,Exposure
snapshot date,Exposure,Unnamed: 2_level_1
2013-02-28,Diversified,0.666667
2013-02-28,Pure-Play,0.333333
2013-05-31,Diversified,0.640000
2013-05-31,Pure-Play,0.360000
2013-08-30,Diversified,0.640000
...,...,...
2022-05-31,Diversified,0.340000
2022-08-31,Pure-Play,0.640000
2022-08-31,Diversified,0.360000
2022-11-30,Pure-Play,0.638298


In [8]:
curr = cur[cur['Battery'] > 0]
curr.groupby('snapshot date')['Final ID'].nunique()

snapshot date
2013-02-28    24
2013-05-31    25
2013-08-30    25
2013-11-29    25
2014-02-28    26
2014-05-30    27
2014-08-29    27
2014-11-28    26
2015-02-27    25
2015-05-29    25
2015-08-31    25
2015-11-30    24
2016-02-29    23
2016-05-31    23
2016-08-31    25
2016-11-30    23
2017-02-28    25
2017-05-31    24
2017-08-31    22
2017-11-30    24
2018-02-28    24
2018-05-31    24
2018-08-31    28
2018-11-30    28
2019-02-28    29
2019-05-31    32
2019-08-30    30
2019-11-29    30
2020-02-28    29
2020-05-29    32
2020-08-31    38
2020-11-30    46
2021-02-26    50
2021-05-28    50
2021-08-31    50
2021-11-30    50
2022-02-28    50
2022-05-31    50
2022-08-31    50
2022-11-30    47
Name: Final ID, dtype: int64

In [20]:
def constituent_cap(tf_df, input_df, index_name, rank_method, pure_play, diversified_group, group_rank):
    '''change 50, 20 as varible exp_cutoff'''
    
    df = get_exp_true(index_name, tf_df, input_df)
    
    df['Type'] = np.where(df[index_name] >= 50, 'Pure-Play', 'Diversified')
    
    if diversified_group != 11111111:
        df['Type_Initial'] = np.where(df[index_name] >= 50, 'Pure-Play', 'Diversified')
        df.loc[((df[index_name[:-13] + ' (Primary Group)'] == diversified_group) & (df[index_name] >= 20)), 'Type'] = 'Diversified'
        
    if pure_play: 
        df_sub = df[df['Type'] == 'Pure-Play']
        print('The number of eligible pure-play stocks:', len(df_sub))
    
    else:  
        df_sub = df[df['Type'] == 'Diversified']
        print('The number of eligible diversified stocks:', len(df_sub))
        
    # obtain the ranks based on exposure, adtv, mvc
    
    df_sub['Exp_rank'] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date']).cumcount(ascending=False)+1)
    
    #df_sub.loc[:, "Exp_rank"] = df_sub.groupby("Snapshot Date")[index_name].rank("min", ascending=False).astype(int)
    df_sub.loc[:, "ADTV_rank"] = df_sub.groupby("Snapshot Date")["E006"].rank("first", ascending=False).astype(int)
    df_sub.loc[:, "MVC_rank"] = df_sub.groupby("Snapshot Date")["MVC (Missing Filled with MV)"].rank("first", 
                                                                                              ascending=False).astype(int)

    # obtain the group ranks based on exposure, adtv, mvc
    
    
    #df_sub.loc[:, "Exp_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                            #  ])[index_name].rank("min", ascending=False).astype(int)
        
    df_sub.loc[:, "Exp_rank_group"] = (df_sub.sort_values([index_name,'E006']).groupby(['Snapshot Date', index_name[:-13] + " (Primary Group)"]).cumcount(ascending=False)+1)
    df_sub.loc[:, "ADTV_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                               ])["E006"].rank("first", ascending=False).astype(int)
    df_sub.loc[:, "MVC_rank_group"] = df_sub.groupby(["Snapshot Date", index_name[:-13] + " (Primary Group)"
                                              ])["MVC (Missing Filled with MV)"].rank("first", ascending=False).astype(int)

    # obtain average rank as the average of exp, adtv, mvc ranks
    if rank_method == 1:
        df_sub.loc[:, 'rank_avg'] = df_sub[["ADTV_rank", "MVC_rank"]].mean(axis=1)
    elif rank_method == 2:
        df_sub.loc[:, 'rank_avg'] = df_sub[["Exp_rank", "ADTV_rank", "MVC_rank"]].mean(axis=1)
    elif rank_method == 3:  # obtian the average rank by group
        df_sub.loc[:, 'rank_avg_group'] = df_sub[["ADTV_rank_group", "MVC_rank_group"]].mean(axis=1)
    elif rank_method == 4:
        df_sub.loc[:, 'rank_avg_group'] = df_sub[["Exp_rank_group", "ADTV_rank_group", "MVC_rank_group"]].mean(axis=1)
    else:
        df_sub.loc[:, 'rank_avg'] = 1
    
    if group_rank:
        df_sub.sort_values(['Snapshot Date', 'rank_avg_group', 'ADTV_rank_group'], ascending=[True, True, True], inplace=True)
    else:
        df_sub.sort_values(['Snapshot Date', 'rank_avg', 'ADTV_rank'], ascending=[True, True, True], inplace=True)
    
    df_sub['rank_final'] = 1
    
    if group_rank:
        df_sub['rank_final'] = df_sub.groupby(['Snapshot Date', index_name[:-13] + " (Primary Group)"])['rank_final'].cumsum()
    else:
        df_sub['rank_final'] = df_sub.groupby(['Snapshot Date'])['rank_final'].cumsum()
    
    df_sub.reset_index(drop=True, inplace=True)

    return df_sub


In [21]:
test = final_select_df[final_select_df['Snapshot Date'] == '2022-08-31']
test2 = test.sort_values('Exp_rank')

In [40]:
xx2 = xx2.sort_values('Exp_rank')
xx2.to_excel('C:\\Users\\rzhou\\Downloads\\test_ranking_infra2.xlsx')

In [39]:
len(xx2)

138

In [5]:
"""13 COMPOSITE THEMATIC INDEDICES"""

# Exposure cutoff of each index
#exp1_ls = [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]
exp2_ls = [20, 20, 20, 20, 20, 20, 50, 50, 20, 20, 20, 20, 20]
for i in range(13):
    print(index_list[i], exp2_ls[i])
    
target_ct = len(df2) # set the target count to be same as the number of rows of the input file
final_ls = []
#index_list2 = index_list[3:]
for i in range(len(index_list)):
    index_name = index_list[i] 
    index_exp = index_name + ' (Exposure %)'
    index_gp = index_name + ' (Primary Group)'
    exp2 = exp2_ls[i]
    
    # Select companies that have exposure greater than the threshold
    final_select = input_df[input_df[index_exp] >= exp2]
    
    print(index_name, len(final_select))
    
    # Calculate weight
    final_select['fmcap_weight'] = final_select.groupby('Snapshot Date')['Float MCAP'].apply(lambda x: x/x.sum())
    final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
    
    # Label the type
    final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
    
    a2 = final_select[['Final ID', 'Snapshot Date', 'Type', index_gp, 'fmcap_weight2']].rename({'Type':'Exposure', 
                            index_gp:'Group', 'fmcap_weight2':index_name+'_'+str(exp2)}, axis=1, inplace=False)
    print(a2.groupby('Snapshot Date')[index_name+'_'+str(exp2)].sum())
    print(a2.shape)
    
    a2['Final ID'] = a2['Final ID'].astype(str)
    a2['Snpahot Date'] = a2['Snapshot Date'].astype(str)
    final_ls.append(a2)

df2['Snapshot Date'] = df2['Snapshot Date'].astype(str)
df2['Final ID'] = df2['Final ID'].astype(str)
final_ls[0]['Snapshot Date'] = final_ls[0]['Snapshot Date'].astype(str)

# Merge the 13 tables of final selection
cur = df2.merge(final_ls[0], how='left', on=['Snapshot Date','Final ID'])
print(cur.shape)
for i in final_ls[1:]:
    temp = cur.copy()
    i['Snapshot Date'] = i['Snapshot Date'].astype(str)
    cur = merge_all(temp, i)
    print(cur.shape)
        
cur.rename({'Snapshot Date':'snapshot date', 'Rebal Date':'ex date', 'Weight Date':'weight date', 'DSCD':'ticker', 
           'Ex Date':'open date'}, axis=1, inplace=True)
cur.fillna(0, inplace=True)
cur.to_excel('C:\\Users\\rzhou\\Downloads\\historical basket_comp.xlsx')

Cybersecurity 20
Cloud 20
IoT 20
Battery 20
Bio Revolution 20
SaaS 20
Defensive 50
Real Asset 50
Digital Health 20
E-Commerce 20
Clean Energy 20
Infrastructure 20
Inflation 20
Cybersecurity 48


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight'] = final_select.groupby('Snapshot Date')['Float MCAP'].apply(lambda x: x/x.sum())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

Snapshot Date
2023-02-28    1.0
Name: Cybersecurity_20, dtype: float64
(48, 5)
Cloud 57


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Snapshot Date
2023-02-28    1.0
Name: Cloud_20, dtype: float64
(57, 5)
IoT 54


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Snapshot Date
2023-02-28    1.0
Name: IoT_20, dtype: float64
(54, 5)
Battery 67


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Snapshot Date
2023-02-28    1.0
Name: Battery_20, dtype: float64
(67, 5)
Bio Revolution 60


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Snapshot Date
2023-02-28    1.0
Name: Bio Revolution_20, dtype: float64
(60, 5)
SaaS 172
Snapshot Date
2023-02-28    1.0
Name: SaaS_20, dtype: float64
(172, 5)
Defensive 265


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight'] = final_select.groupby('Snapshot Date')['Float MCAP'].apply(lambda x: x/x.sum())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fina

Snapshot Date
2023-02-28    1.0
Name: Defensive_50, dtype: float64
(265, 5)
Real Asset 345
Snapshot Date
2023-02-28    1.0
Name: Real Asset_50, dtype: float64
(345, 5)
Digital Health 73


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Snapshot Date
2023-02-28    1.0
Name: Digital Health_20, dtype: float64
(73, 5)
E-Commerce 157


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Snapshot Date
2023-02-28    1.0
Name: E-Commerce_20, dtype: float64
(157, 5)
Clean Energy 98


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Snapshot Date
2023-02-28    1.0
Name: Clean Energy_20, dtype: float64
(98, 5)
Infrastructure 247
Snapshot Date
2023-02-28    1.0
Name: Infrastructure_20, dtype: float64
(247, 5)
Inflation 378


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Snapshot Date
2023-02-28    1.0
Name: Inflation_20, dtype: float64
(378, 5)
(1635, 12)
(1635, 16)
(1635, 20)
(1635, 24)
(1635, 28)
(1635, 32)
(1635, 36)


  return df1.merge(df2, how='left', on=['Snapshot Date','Final ID'])
  return df1.merge(df2, how='left', on=['Snapshot Date','Final ID'])
  return df1.merge(df2, how='left', on=['Snapshot Date','Final ID'])


(1635, 40)
(1635, 44)
(1635, 48)
(1635, 52)
(1635, 56)
(1635, 60)


In [7]:
"""SINGLE COMPOSITE THEMATIC INDEDICES"""

# Exposure cutoff of each index
#exp1_ls = [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]
exp2_ls = [50]
    
target_ct = len(df2) # set the target count to be same as the number of rows of the input file
final_ls = []
#index_list2 = index_list[3:]
for i in range(len(index_list)):
    index_name = index_list[i] 
    index_exp = index_name + ' (Exposure %)'
    index_gp = index_name + ' (Primary Group)'
    exp2 = exp2_ls[i]
    
    # Select companies that have exposure greater than the threshold
    final_select = input_df[input_df[index_exp] >= exp2]
    
    print(index_name, len(final_select))
    
    # Calculate weight
    final_select['fmcap_weight'] = final_select.groupby('Snapshot Date')['Float MCAP'].apply(lambda x: x/x.sum())
    final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
    
    # Label the type
    final_select['Type'] = np.where(final_select[index_exp] >= 50, 'Pure-Play', 'Diversified')
    
    a2 = final_select[['Final ID', 'Snapshot Date', 'Type', index_gp, 'fmcap_weight2']].rename({'Type':'Exposure', 
                            index_gp:'Group', 'fmcap_weight2':index_name+'_'+str(exp2)}, axis=1, inplace=False)
    print(a2.groupby('Snapshot Date')[index_name+'_'+str(exp2)].sum())
    print(a2.shape)
    
    a2['Final ID'] = a2['Final ID'].astype(str)
    a2['Snpahot Date'] = a2['Snapshot Date'].astype(str)
    final_ls.append(a2)

df2['Snapshot Date'] = df2['Snapshot Date'].astype(str)
df2['Final ID'] = df2['Final ID'].astype(str)
final_ls[0]['Snapshot Date'] = final_ls[0]['Snapshot Date'].astype(str)

# Merge the 13 tables of final selection
cur = df2.merge(final_ls[0], how='left', on=['Snapshot Date','Final ID'])
print(cur.shape)
        
cur.rename({'Snapshot Date':'snapshot date', 'Rebal Date':'ex date', 'Weight Date':'weight date', 'DSCD':'ticker', 
           'Ex Date':'open date'}, axis=1, inplace=True)
cur.fillna(0, inplace=True)
cur.to_excel('C:\\Users\\rzhou\\Downloads\\historical basket_defensive_comp.xlsx')

Defensive 5614


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight'] = final_select.groupby('Snapshot Date')['Float MCAP'].apply(lambda x: x/x.sum())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_select['fmcap_weight2'] = final_select.groupby('Snapshot Date')['fmcap_weight'].apply(lambda x: redistr_weights(x, 0, 0.045))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

Snapshot Date
2006-11-30    1.0
2007-02-28    1.0
2007-05-31    1.0
2007-08-31    1.0
2007-11-30    1.0
2008-02-29    1.0
2008-05-30    1.0
2008-08-29    1.0
2008-11-28    1.0
2009-02-27    1.0
2009-05-29    1.0
2009-08-31    1.0
2009-11-30    1.0
2010-02-26    1.0
2010-05-28    1.0
2010-08-31    1.0
2010-11-30    1.0
2011-02-28    1.0
2011-05-31    1.0
2011-08-31    1.0
2011-11-30    1.0
2012-02-29    1.0
2012-05-31    1.0
2012-08-31    1.0
2012-11-30    1.0
Name: Defensive_50, dtype: float64
(5614, 5)
(10146, 12)


In [16]:
### QA

for i in range(13):
    indexname = index_list[i]
    print(indexname, len(input_df[(input_df[indexname+' (Exposure %)'] >= exp2_ls[i]) & (~input_df['Float MCAP'].isna())]))

Cybersecurity 51
Cloud 68
IoT 64
Battery 77
Bio Revolution 66
SaaS 176
Defensive 84
Real Asset 49
Digital Health 86
E-Commerce 198
Clean Energy 133
Infrastructure 316
Inflation 499


In [26]:
lp2 = cur[['snapshot date', 'Digital Health_20', 'Bio Revolution_20', 'Battery_20', 'Infrastructure_20', 'Cybersecurity_20', 'Real Asset_50',
              'IoT_20','SaaS_20', 'Defensive_50', 'Inflation_20', 'Clean Energy_20', 'Cloud_20', 'E-Commerce_20']]
a = lp2.groupby('snapshot date').apply(lambda x: x.astype(bool).sum(axis=0))
a.to_excel('C:\\Users\\rzhou\\Downloads\\QA_composite_March13.xlsx')

In [8]:
cur = pd.read_excel('C:\\Users\\rzhou\\Downloads\\historical basket_thematic_March13.xlsx')

In [9]:
cur.columns

Index(['snapshot date', 'ex date', 'open date', 'weight date', 'OPID',
       'ticker', 'ISIN', 'Final ID', 'Battery', 'Exposure', 'Group',
       'Bio Revolution', 'Exposure.1', 'Group.1', 'Clean Energy', 'Exposure.2',
       'Group.2', 'Cloud', 'Exposure.3', 'Group.3', 'Cybersecurity',
       'Exposure.4', 'Group.4', 'Defensive', 'Exposure.5', 'Group.5',
       'Digital Health', 'Exposure.6', 'Group.6', 'E-Commerce', 'Exposure.7',
       'Group.7', 'Inflation', 'Exposure.8', 'Group.8', 'Infrastructure',
       'Exposure.9', 'Group.9', 'IoT', 'Exposure.10', 'Group.10', 'Real Asset',
       'Exposure.11', 'Group.11', 'SaaS', 'Exposure.12', 'Group.12'],
      dtype='object')

In [13]:
cur['Group.11'].value_counts()

0        1571
46196       9
46210       6
46165       5
46146       4
46160       2
46156       2
Name: Group.11, dtype: int64

In [15]:
cur['Real Asset']

0.03571428571428571

In [4]:
import numpy as np 
from datetime import datetime
import pandas as pd 
import math

index_list = ['Cybersecurity', 'Cloud', 'IoT', 'Batteries', 'Bio Revolution',  'SaaS', 
              'Defensive',  'Real Asset', 'Digital Health', 'E-Commerce', 'Clean Energy',
              'Infrastructure', 'Inflation']


In [11]:
k = []
for i in index_list:
    x = pd.read_excel('C:\\Users\\rzhou\\Downloads\\final_26_datapack\\' + i + ' Index Data Package 11-28-2022.xlsx', 'Current Basket', skiprows=[0])
    m = x['SYNTAX CUSTOM TAXONOMY CLASSIFICATION'].value_counts()
    m2 = pd.DataFrame(m)
    m2['Index'] = [i] * len(m)
    k.append(m2)
    print(m)

Cybersecurity Software    21
Cybersecurity Services    11
Name: SYNTAX CUSTOM TAXONOMY CLASSIFICATION, dtype: int64
Cloud Compute Hardware      16
Cloud Services               9
Cloud Software               8
Database Software            4
Data Center & Colocation     3
Name: SYNTAX CUSTOM TAXONOMY CLASSIFICATION, dtype: int64
Semiconductors        15
Industrial            11
Utilities              5
Automotive             5
Smart Home             5
Consumer IoT           4
Smart Buildings        3
Telecommunications     2
Name: SYNTAX CUSTOM TAXONOMY CLASSIFICATION, dtype: int64
Clean Energy Vehicles               25
Fuel Cell                            8
Batteries                            7
Battery Materials                    6
Charging Stations                    3
Diversified Electronic Materials     1
Name: SYNTAX CUSTOM TAXONOMY CLASSIFICATION, dtype: int64
Life Science    30
Biofuels        11
Food             3
Biomaterials     1
Agriculture      1
Name: SYNTAX CUSTOM TAXONO

In [12]:
output = pd.concat(k, axis=0)
output.head()
output.to_excel('C:\\Users\\rzhou\\Downloads\\non_composite_constituent_ct.xlsx')

Unnamed: 0,SYNTAX CUSTOM TAXONOMY CLASSIFICATION,Index
Cybersecurity Software,21,Cybersecurity
Cybersecurity Services,11,Cybersecurity
Cloud Compute Hardware,16,Cloud
Cloud Services,9,Cloud
Cloud Software,8,Cloud


In [15]:
index_list2 = ['Cybersecurity', 'Cloud', 'IoT', 'Battery', 'Bio Revolution',  'SaaS', 
              'Defensive',  'Real Asset', 'Digital Health', 'E-Commerce', 'Clean Energy',
              'Infrastructure', 'Inflation']
exp2_ls = [20, 20, 20, 20, 20, 20, 50, 50, 20, 20, 20, 20, 20]

In [23]:
file_name = 'C:\\Users\\rzhou\\OneDrive\\Documents\\Production_materials_Nov14\\'

In [28]:
k2 = []
for j in range(13):
    i = index_list2[j]
    x = pd.read_excel(file_name + 'production_' + i + '.xlsx', 'Thematic Current Basket')
    x2 = x[x['INDEX WEIGHT as of 2022-11-14'] > 0]
    m = x2['SYNTAX CUSTOM TAXONOMY CLASSIFICATION'].value_counts()
    print(x2['SYNTAX CUSTOM TAXONOMY CLASSIFICATION'].unique())
    m2 = pd.DataFrame(m)
    m2['Index'] = [i] * len(m)
    k2.append(m2)
    print(m)

['Cybersecurity Services' 'Cybersecurity Software']
Cybersecurity Software    21
Cybersecurity Services    11
Name: SYNTAX CUSTOM TAXONOMY CLASSIFICATION, dtype: int64
['Cloud Services' 'Cloud Compute Hardware' 'Database Software'
 'Data Center & Colocation' 'Cloud Software' 'Content Delivery Network']
Cloud Compute Hardware      15
Cloud Software               8
Cloud Services               6
Database Software            6
Data Center & Colocation     4
Content Delivery Network     1
Name: SYNTAX CUSTOM TAXONOMY CLASSIFICATION, dtype: int64
['Consumer IoT' 'Semiconductors' 'Industrial' 'Utilities' 'Automotive'
 'Telecommunications' 'Smart Buildings' 'Smart Home']
Semiconductors        15
Industrial            11
Utilities              5
Automotive             5
Smart Home             5
Consumer IoT           4
Smart Buildings        3
Telecommunications     2
Name: SYNTAX CUSTOM TAXONOMY CLASSIFICATION, dtype: int64
['Clean Energy Vehicles' 'Fuel Cell ' 'Battery Materials' 'Batteries'

In [29]:
output = pd.concat(k2, axis=0)
output.head()
output.to_excel('C:\\Users\\rzhou\\Downloads\\non_composite_constituent_ct.xlsx')

In [36]:

l2 = pd.read_excel('C:\\Users\\rzhou\\Downloads\\non_composite_constituent_ct.xlsx')

In [49]:
l = pd.read_excel('C:\\Users\\rzhou\\Downloads\\composite_constituent_ct.xlsx', 'Sheet1')
l3 = pd.read_excel('C:\\Users\\rzhou\\Downloads\\composite_constituent_ct.xlsx', 'Sheet2')

In [50]:
l3['Index Name'].value_counts()

SaaS              12
IoT                8
Clean Energy       7
Real Asset         7
Battery            6
Cloud              6
Bio Revolution     5
Digital Health     5
Inflation          5
Defensive          4
E-commerce         4
Cyber Security     3
Infrastructure     3
Name: Index Name, dtype: int64

In [53]:
l3.head()

Unnamed: 0,Group,Index Name,Previous_Group
0,Batteries,Battery,Batteries
1,Battery Materials,Battery,Battery Materials
2,Diversified Electronic Materials,Battery,Diversified Electronic Materials
3,Clean Energy Vehicles,Battery,Clean Energy Vehicles
4,Fuel Cells,Battery,Fuel Cells


In [54]:
d = l.merge(l3[['Group', 'Previous_Group']], on=['Group'], how='left')

In [55]:
d.to_excel('C:\\Users\\rzhou\\Downloads\\test_constituent_ct.xlsx')

In [34]:
l['Index'].value_counts()

SaaS              12
IoT                8
E-Commerce         8
Real Asset         7
Clean Energy       7
Cloud              6
Battery            6
Bio Revolution     5
Inflation          5
Defensive          4
Digital Health     4
Infrastructure     3
Cybersecurity      2
Name: Index, dtype: int64

In [37]:
l.head()

Unnamed: 0,Group,SYNTAX CUSTOM TAXONOMY CLASSIFICATION,Index
0,Cybersecurity Software,28,Cybersecurity
1,Cybersecurity Services,13,Cybersecurity
2,Cloud Compute Hardware,19,Cloud
3,Cloud Services,9,Cloud
4,Database Software,8,Cloud


In [39]:
output2 = l.merge(l2, on=['Group', 'Index'], how='left')

In [41]:
output2.to_excel('C:\\Users\\rzhou\\Downloads\\constituent_ct_Nov14.xlsx')

In [59]:
op = pd.read_excel('C:\\Users\\rzhou\\Downloads\\test_constituent_ct.xlsx', 'Comparison')
op2 = pd.read_excel('C:\\Users\\rzhou\\Downloads\\test_constituent_ct.xlsx', 'Previous Group Names')

In [60]:
op = op[op['Index'] != 'E-Commerce']
op2 = op2[op2['Index Name'] != 'E-commerce']

In [61]:
m1 = op.merge(output2, on=['Group'], how='left')


In [64]:
m2 = m1.merge(op2, on=['Previous_Group'], how='left')

In [65]:
m2.head()

Unnamed: 0.1,Unnamed: 0,Group_x,SYNTAX CUSTOM TAXONOMY CLASSIFICATION,Index_x,Previous_Group,Unnamed: 5,SYNTAX CUSTOM TAXONOMY CLASSIFICATION_x,Index_y,SYNTAX CUSTOM TAXONOMY CLASSIFICATION_y,Index Name,Group_y
0,0,Cybersecurity Software,28,Cybersecurity,Cybersecurity Software,,28,Cybersecurity,21.0,Cyber Security,Cybersecurity Software: the development of sof...
1,1,Cybersecurity Services,13,Cybersecurity,Cybersecurity Services,,13,Cybersecurity,11.0,Cyber Security,Cybersecurity Services: the provision of servi...
2,2,Cloud Compute Hardware,19,Cloud,Cloud Compute Hardware,,19,Cloud,15.0,Cloud,Cloud Compute Hardware: the production or dist...
3,3,Cloud Services,9,Cloud,Cloud Services,,9,Cloud,6.0,Cloud,Cloud Services: the provision of on-demand com...
4,4,Database Software,8,Cloud,Database Software,,8,Cloud,6.0,Cloud,Database Software: the development of software...


In [66]:
m2.to_excel('C:\\Users\\rzhou\\Downloads\\constituent_ct_Nov14.xlsx')

In [87]:
k = pd.read_excel('C:\\Users\\rzhou\\Downloads\\constituent_ct_Nov14 (1).xlsx')

In [88]:
k2 = pd.read_excel('C:\\Users\\rzhou\\Downloads\\Groups within Thematics.xlsx', 'Sheet1')

In [89]:
k.head()

Unnamed: 0,Index,Group
0,Battery,Batteries: the manufacture or distribution of ...
1,Battery,Battery Materials: the mining of metals and mi...
2,Battery,Diversified Electronic Materials: the producti...
3,Battery,Clean Energy Vehicles: the manufacture of elec...
4,Battery,Fuel Cells: the manufacture of fuel cells for ...


In [90]:
k['Group2'] = k['Group'].str.split(':').str[0]

In [91]:
k2.head()

Unnamed: 0,Group2,Index,Constituent Count (Composite),Constituent Count (Non-Composite),Previous_Group,Unnamed: 5
0,Cybersecurity Software,Cybersecurity,28.0,21.0,Cybersecurity Software,
1,Cybersecurity Services,Cybersecurity,13.0,11.0,Cybersecurity Services,
2,Cloud Compute Hardware,Cloud,19.0,15.0,Cloud Compute Hardware,
3,Cloud Services,Cloud,9.0,6.0,Cloud Services,
4,Database Software,Cloud,8.0,6.0,Database Software,


In [92]:
output4 = k.merge(k2, on=['Group2', 'Index'], how='left')

In [93]:
output4.to_excel("C:\\Users\\rzhou\\Downloads\\constituent_ct_final.xlsx")

In [25]:
the_ls = ['Digital Health Index 40', 'Bio Revolution Index 33', 'Battery Index 40', 'Infrastructure Index 31', 
          'Cybersecurity Index 40',  'Real Asset Index 7', 'IoT Index 40', 'SaaS Index 33', 'Defensive Index 5', 
          'Inflation Index 38', 'Clean Energy Index 38', 'Cloud Index 54', 'E-Commerce Index 33']

In [24]:
x = pd.read_excel('C:\\Users\\rzhou\\Downloads\\DataPackage_training_thematic1216\\historical basket_Dec16_thematic.xlsx')

In [31]:
x2 = x[the_ls+['snapshot date']]
for i in the_ls:
    x3 = x2[[i, 'snapshot date']]
    x3 = x3[x3[i] > 0]
    print(i, x3.groupby('snapshot date')[i].size())


Digital Health Index 40 snapshot date
2013-02-28    19
2013-05-31    20
2013-08-30    22
2013-11-29    25
2014-02-28    24
2014-05-30    24
2014-08-29    24
2014-11-28    24
2015-02-27    24
2015-05-29    25
2015-08-31    26
2015-11-30    26
2016-02-29    26
2016-05-31    27
2016-08-31    28
2016-11-30    27
2017-02-28    27
2017-05-31    30
2017-08-31    30
2017-11-30    29
2018-02-28    32
2018-05-31    35
2018-08-31    37
2018-11-30    37
2019-02-28    37
2019-05-31    35
2019-08-30    41
2019-11-29    41
2020-02-28    41
2020-05-29    42
2020-08-31    47
2020-11-30    49
2021-02-26    50
2021-05-28    50
2021-08-31    50
2021-11-30    50
2022-02-28    50
2022-05-31    50
2022-08-31    50
2022-11-30    45
Name: Digital Health Index 40, dtype: int64
Bio Revolution Index 33 snapshot date
2013-02-28    20
2013-05-31    20
2013-08-30    21
2013-11-29    22
2014-02-28    24
2014-05-30    24
2014-08-29    25
2014-11-28    24
2015-02-27    23
2015-05-29    24
2015-08-31    25
2015-11-30   

In [56]:
output2.head()

Unnamed: 0,Group,SYNTAX CUSTOM TAXONOMY CLASSIFICATION_x,Index,SYNTAX CUSTOM TAXONOMY CLASSIFICATION_y
0,Cybersecurity Software,28,Cybersecurity,21.0
1,Cybersecurity Services,13,Cybersecurity,11.0
2,Cloud Compute Hardware,19,Cloud,15.0
3,Cloud Services,9,Cloud,6.0
4,Database Software,8,Cloud,6.0
