# Engineer Features on various Datasets

In [127]:
import os
import pandas as pd
import numpy as np
import pickle
import sys
from pandas import HDFStore,DataFrame
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
#from matplotlib_venn import venn2


# Use 3 decimal places in output display
pd.set_option("display.precision", 3)

# Don't wrap repr(DataFrame) across additional lines
pd.set_option("display.expand_frame_repr", False)

# Set max rows displayed in output to 25
pd.set_option("display.max_rows", 25)

pd.set_option('display.float_format', lambda x: '%.2f' % x)

product_level = 'Product Level 2'

colors = ['#1F77B4', '#FF7F0E']

path_to_hdf_datastore = '../data/hdf/datastore.h5.old'


In [4]:
def get_data_frame_summmary(data_frame):
        unique_values = data_frame.apply(lambda x: [x.unique()])
        unique_counts = data_frame.apply(lambda x: len(x.unique()))
        percent_missing = data_frame.apply(lambda x: sum(pd.isnull(x))/len(x)*100)
        data_type = data_frame.dtypes 

        return pd.DataFrame(dict(unique_values = unique_values, 
                                unique_counts = unique_counts,
                                data_type = data_type,
                                percent_missing = percent_missing,
                                )).reset_index().sort_values(by='percent_missing', ascending=False)
    
# Function to drop outliers of numeric columns
def scale_numeric_features(data_frame, exclude=[], 
                           method='standardize',
                           inplace=False):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    num_cols = data_frame.select_dtypes(include=numerics).columns

    num_cols = num_cols.difference(exclude)
    print(f'********************* - Scaling following {len(num_cols)} features - **********************')
    for colname in num_cols:
        new_colname = colname if inplace else colname+'_'+method+'d'
        print(f' {colname} {method}d to {new_colname}')
        
        if method == 'standardize':              
              data_frame[new_colname] = (data_frame[colname] - data_frame[colname].mean()) / data_frame[colname].std()
        elif method == 'normalize':
              data_frame[new_colname] = (data_frame[colname] - data_frame[colname].min()) / (data_frame[colname].max() - data_frame[colname].min())
        else:
              print(f'Unknown method {method} specified, please select one of "standardize" or "normalize"')

      
    return data_frame

In [172]:
# customer ids cols
ECH_ecrid_col = 'ecrid'
journals_ecrid_col = 'SIS Id (Agreement SIS)'
other_ecrid_col = 'SIS Id (Agreement SIS)'
churn_activities_ecrid_col = 'ECR Id'
churn_risks_ecrid_col = 'Account Name: ECR Id'
account_assignment_ecrid_col = 'ECRID'
NPS_ecrid_col = 'ECR_ID'
usage_ecrid_col = 'ECR_ID'
interactions_ecrid_col = 'ECR_ID'
cancellations_ecrid_col = 'SIS Id (Agreement SIS)'

## Transaction Data

In [2]:
# Read Data
journals_contracts = pd.read_hdf(path_to_hdf_datastore, 'DataCRjournals')
other_contracts = pd.read_hdf(path_to_hdf_datastore, 'DataCRother')
cancellations = pd.read_hdf(path_to_hdf_datastore, 'DataCR_from_2015_Cancellations_Source_Systems')

In [5]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(get_data_frame_summmary(journals_contracts))

Unnamed: 0,index,unique_values,unique_counts,data_type,percent_missing
0,Agreement End Date,"[[2015-12-31, 2016-01-31, 2015-05-12, 2016-01-...",2197,object,0.0
1,Agreement Number,"[[DLT1, DLF1, 1-376434, 1-941021, 2-264770, 3-...",275478,object,0.0
20,Product Line Level 3,"[[Journals Print Subscriptions, Journals Subsc...",48,object,0.0
21,Product Line Level 4,"[[Journals Print Subscriptions, Journals Subsc...",74,object,0.0
22,Product Revenue Type,"[[Recurring, One Off]]",2,object,0.0
23,RSO,"[[Member, MAEU, NOAM, Personal, EMEALA, APAC, ...",10,object,0.0
24,Renewal Exp Complete Date,"[[0-00-00, 2015-12-20, 2015-11-15, 2015-09-19,...",1669,object,0.0
25,SIS Id (Agreement SIS),"[[100034, 100097, 1001, 1003, 1004, 1005, 1005...",84056,object,0.0
26,Saleable Product Name (Source),"[[RADIOTHERAPY & ONCOLOGY, RADIOTHERAPY & ONCO...",9723,object,0.0
27,Sales Division (Agreement SIS),"[[HS Europe North, AG-MAEU-North_Europe0-01D, ...",288,object,0.0


In [6]:
# Has Parent Agreement

In [7]:
# inspect Agreements

df = journals_contracts.groupby(
    ['Agreement Number']
).agg(revenue=("Bookings - Final Net Price - Agent Discount Amount(Rep)", sum),
      num_agreements=('Agreement Number', 'count')
    ).sort_values('revenue', ascending=False)

In [8]:
df

Unnamed: 0_level_0,revenue,num_agreements
Agreement Number,Unnamed: 1_level_1,Unnamed: 2_level_1
DLT1,583122724.97,599997
,47023007.46,19632
1-15854951399,42993876.15,4
1-14047592623,41735251.84,4
1-16606333303,40737995.52,4
...,...,...
1-16002767147,-73033.90,121
1-14215235304,-73689.03,115
1-17867591804,-84988.33,123
1-17837848909,-109901.87,115


In [100]:
# number of rfm features per ecr per product

def get_rfm_features_from_contracts(dataframe, ecrid_col, label=None):    

    dataframe = dataframe.groupby(
        [ecrid_col, 'Product Line Level 2']
    ).agg(revenue=("Bookings - Final Net Price - Agent Discount Amount(Rep)", sum),
        num_contracts=('Agreement Number', pd.Series.nunique),
        num_parents = ('Parent Agreement Number', pd.Series.nunique),
        last_agreement = ('Agreement Start Date', max),
        first_agreement = ('Agreement Start Date', min)
    ).sort_values('revenue', ascending=False)

    dataframe['last_agreement'] = pd.to_datetime(dataframe['last_agreement'], format='%Y-%m-%d')
    dataframe['first_agreement'] = pd.to_datetime(dataframe['first_agreement'], format='%Y-%m-%d')
    
    dataframe['days_since_last_agreement'] = dataframe['last_agreement'].apply(
        lambda x: (datetime.today() - x).days
        )
    dataframe['days_since_first_agreement'] = dataframe['first_agreement'].apply(
        lambda x: (datetime.today() - x).days
        )
    dataframe['length_of_relationship'] = dataframe['days_since_first_agreement'] - dataframe['days_since_last_agreement'] 
    dataframe = dataframe.drop(['last_agreement', 'first_agreement'], axis=1)
    
    if label is not None:
        dataframe['label'] = label
    
    return dataframe

In [96]:
# RFM features for Journals

journals_rfm = get_rfm_features_from_contracts(journals_contracts, ecrid_col=journals_ecrid_col, label=0)
journals_rfm

Unnamed: 0_level_0,Unnamed: 1_level_0,revenue,num_contracts,num_parents,days_since_last_agreement,days_since_first_agreement,length_of_relationship,label
HQ SIS Id (Agreement SIS),Product Line Level 2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ECR-490216,Journals Electronic Subscriptions,245815686.07,18,10,387,2213,1826,0
ECR-310775,Journals Electronic Subscriptions,195405546.81,93,72,22,1848,1826,0
ECR-380603,Journals Electronic Subscriptions,84846259.88,25,24,206,1848,1642,0
ECR-71655,Journals Electronic Subscriptions,82842911.10,20,18,1117,1848,731,0
ECR-23550,Journals Electronic Subscriptions,59092277.67,106,99,22,2578,2556,0
...,...,...,...,...,...,...,...,...
ECR-10412319,Bulk Sales,-149388.68,1,1,50,1756,1706,0
ECR-1413,Journals Electronic Subscriptions,-195188.41,6,6,1117,2944,1827,0
ECR-411299,Journals Subscriptions Electronic,-347606.82,1,1,1117,1848,731,0
40083,Bulk Sales,-388788.00,1,1,1022,1326,304,0


In [97]:
# RFM features for Other contracts

contracts_rfm = get_rfm_features_from_contracts(other_contracts, ecrid_col=other_ecrid_col, label=0)
contracts_rfm

Unnamed: 0_level_0,Unnamed: 1_level_0,revenue,num_contracts,num_parents,days_since_last_agreement,days_since_first_agreement,length_of_relationship,label
HQ SIS Id (Agreement SIS),Product Line Level 2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ECR-28091,Nursing & Health Professionals P_ELP,100915395.73,1,1,23,1847,1824,0
ECR-79671,Nursing & Health Professionals P_ELP,84606950.98,1,1,23,1847,1824,0
ECR-10064555,CK Physician,61667031.07,10,7,207,1484,1277,0
ECR-10408016,Nursing & Health Professionals P_ELP,58471162.75,1,1,23,1848,1825,0
ECR-28091,Medical Reference P_ELP,56000773.63,1,1,23,1847,1824,0
...,...,...,...,...,...,...,...,...
ECR-10366988,Medical Education E_LLP,-498569.51,1,1,1200,1500,300,0
939527,Nursing & Health Professionals P_ELP,-692397.27,1,1,1735,1735,0,0
28091,Research Reference P_ELP,-942291.17,1,1,23,1847,1824,0
ECR-10066097,Research Reference P_ELP,-2265029.71,1,1,54,1847,1793,0


In [98]:
# RFM features for Cancellations

cancellations_rfm = get_rfm_features_from_contracts(cancellations, ecrid_col=cancellations_ecrid_col, label=1)
cancellations_rfm

Unnamed: 0_level_0,Unnamed: 1_level_0,revenue,num_contracts,num_parents,days_since_last_agreement,days_since_first_agreement,length_of_relationship,label
HQ SIS Id (Agreement SIS),Product Line Level 2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ECR-1004,Journals Electronic Subscriptions,10094303.60,2,2,752,2213,1461,1
ECR-375272,Journals Electronic Subscriptions,6013894.64,1,1,1848,1848,0,1
ECR-44401,Journals Electronic Subscriptions,5170653.93,2,2,752,1117,365,1
ECR-339079,Journals Electronic Subscriptions,3917394.62,1,1,1483,1483,0,1
ECR-928880,Journals Electronic Subscriptions,3521398.24,1,1,1392,1392,0,1
...,...,...,...,...,...,...,...,...
ECR-365981,Journals Electronic Subscriptions,-0.00,1,1,1483,1483,0,1
ECR-417308,Journal Level Sales,-0.00,1,1,2213,2213,0,1
ECR-737806,Journals Electronic Subscriptions,-0.00,1,1,1848,1848,0,1
ECR-566978,clinics,-0.00,1,1,387,387,0,1


In [99]:
# Concatenate Journals Others and Cancellations

basetable = pd.concat([journals_rfm, contracts_rfm, cancellations_rfm])
basetable

Unnamed: 0_level_0,Unnamed: 1_level_0,revenue,num_contracts,num_parents,days_since_last_agreement,days_since_first_agreement,length_of_relationship,label
HQ SIS Id (Agreement SIS),Product Line Level 2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ECR-490216,Journals Electronic Subscriptions,245815686.07,18,10,387,2213,1826,0
ECR-310775,Journals Electronic Subscriptions,195405546.81,93,72,22,1848,1826,0
ECR-380603,Journals Electronic Subscriptions,84846259.88,25,24,206,1848,1642,0
ECR-71655,Journals Electronic Subscriptions,82842911.10,20,18,1117,1848,731,0
ECR-23550,Journals Electronic Subscriptions,59092277.67,106,99,22,2578,2556,0
...,...,...,...,...,...,...,...,...
ECR-365981,Journals Electronic Subscriptions,-0.00,1,1,1483,1483,0,1
ECR-417308,Journal Level Sales,-0.00,1,1,2213,2213,0,1
ECR-737806,Journals Electronic Subscriptions,-0.00,1,1,1848,1848,0,1
ECR-566978,clinics,-0.00,1,1,387,387,0,1


## ECH Customer Data

In [57]:
# read data
ech = pd.read_hdf(path_to_hdf_datastore, 'ECH_Customer_Data')
hierarchy = pd.read_hdf(path_to_hdf_datastore, 'hierarchy')

In [15]:
#Analyze ECH and Hierarchy

ech_unique_customers = ech['ecrid'].unique()
hierarchy_unique_parents = hierarchy['PARENT_ECR'].unique()
hierarchy_unique_children = hierarchy['CHILD_ECR'].unique()

print(len(ech_unique_customers))
print(len(hierarchy_unique_parents))
print(len(hierarchy_unique_children))


ech_intersect_parents = np.intersect1d(ech_unique_customers, hierarchy_unique_parents, assume_unique=True)
ech_intestsect_children = np.intersect1d(ech_unique_customers, hierarchy_unique_children, assume_unique=True)

print(f'ECH and PARENTS intersection {len(ech_intersect_parents)}')
      
print(f'ECH and CHILDREN intersection {len(ech_intersect_parents)}')



137546
369140
592634
ECH and PARENTS intersection 95895
ECH and CHILDREN intersection 95895


In [25]:
ech.head()

Unnamed: 0,ecrid,name,city,Country ISO,Region,consortium,post_code,Classification
0,ECR-560156,National Institute for Research-Development in...,Timişoara,RO,,,300569,Academic
1,ECR-10349400,Donostia International Physics Center,San Sebastian,ES,,,20018,Government
2,ECR-29390542,Twist Bioscience,San Francisco,US,CA,,94158-2117,Corporate
3,ECR-10234417,University of Pennsylvania Department of Patho...,Philadelphia,US,PA,,19104,Academic
4,ECR-576739,Meerut Institute of Engineering and Technology,Meerut,IN,,,250005,Academic


In [26]:
hierarchy.head()

Unnamed: 0,CHILD_ECR,CHILD_NAME,CONSORTIUM,COUNTRY_CHILD,COUNTRY_PARENT,HIERARCHY_TYPE,HIER_LEVEL,PARENT_ECR,PARENT_NAME
0,ECR-10301592,Streamwood High School,NON-CONS,United States,United States,RINGGOLD,1,ECR-10398809,School District U-46
1,ECR-10365890,Hasbro Children's Hospital,NON-CONS,United States,United States,RINGGOLD,1,ECR-366613,Rhode Island Hospital
2,ECR-10382955,The University of Sydney Business School - Bur...,NON-CONS,Australia,Australia,RINGGOLD,1,ECR-30930330,The University of Sydney Business School
3,ECR-10196062,ARV Lady Gowrie Village,NON-CONS,Australia,Australia,RINGGOLD,1,ECR-10042153,Anglican Retirement Villages
4,ECR-10952406,Coty France,NON-CONS,France,United States,RINGGOLD,1,ECR-936525,Coty Inc


In [58]:
# Active Customers with Hierarchy

active_cust_hierarchy = pd.merge(ech[['ecrid', 'Classification']] , hierarchy, left_on='ecrid', right_on='PARENT_ECR', how='inner')

In [61]:
active_cust_hierarchy['PARENT_IS_CHILD'] = active_cust_hierarchy['PARENT_ECR'] == active_cust_hierarchy['CHILD_ECR']
print(active_cust_hierarchy.shape)
active_cust_hierarchy.head()

(601559, 12)


Unnamed: 0,ecrid,Classification,CHILD_ECR,CHILD_NAME,CONSORTIUM,COUNTRY_CHILD,COUNTRY_PARENT,HIERARCHY_TYPE,HIER_LEVEL,PARENT_ECR,PARENT_NAME,PARENT_IS_CHILD
0,ECR-29390542,Corporate,ECR-29390542,Twist Bioscience,NON-CONS,United States,United States,ELS,0,ECR-29390542,Twist Bioscience,True
1,ECR-10234417,Academic,ECR-10092350,University of Pennsylvania Division of Anatomi...,NON-CONS,United States,United States,RINGGOLD,1,ECR-10234417,University of Pennsylvania Department of Patho...,False
2,ECR-10234417,Academic,ECR-10358291,University of Pennsylvania Section of Surgical...,NON-CONS,United States,United States,RINGGOLD,2,ECR-10234417,University of Pennsylvania Department of Patho...,False
3,ECR-10234417,Academic,ECR-10234417,University of Pennsylvania Department of Patho...,NON-CONS,United States,United States,RINGGOLD,0,ECR-10234417,University of Pennsylvania Department of Patho...,True
4,ECR-576739,Academic,ECR-576739,Meerut Institute of Engineering and Technology,NON-CONS,India,India,ELS,0,ECR-576739,Meerut Institute of Engineering and Technology,True


In [77]:
# inspect Agreements

ech_df = active_cust_hierarchy.groupby(
    ['PARENT_ECR', 'PARENT_NAME', 'Classification', 'CONSORTIUM', 'COUNTRY_PARENT']
).agg(num_children=("CHILD_ECR", pd.Series.nunique),
      min_hierarchy=('HIER_LEVEL', min),
      max_hierarchy=('HIER_LEVEL', max)
    ).sort_values('max_hierarchy', ascending=False)

In [84]:
ech_df = ech_df.reset_index()

In [85]:
ech_df['PARENT_ECR'].value_counts()

ECR-527696      2
ECR-10241426    1
ECR-28177531    1
ECR-959693      1
ECR-10433389    1
               ..
ECR-501567      1
ECR-1205276     1
ECR-31069467    1
ECR-414707      1
ECR-30912491    1
Name: PARENT_ECR, Length: 95895, dtype: int64

## Account Assignment

In [None]:
# read data 
account_assignment = pd.read_hdf(path_to_hdf_datastore, 'Account_Assignment')

In [None]:
# map to ech 

## Churn Activities

In [86]:
churn_activities = pd.read_hdf(path_to_hdf_datastore, 'churn_activities')

In [87]:
churn_activities.head()

Unnamed: 0,Opportunity,Created By,Account ID,Company / Account,Contact,Lead,Priority,Activity Type,Task,Task/Event Record Type,...,Assigned,Date,Product Name,Assigned Role,Assigned Role Display,Created Date,Start,End,ECR Id,Parent ECR-ID
0,,Chinami Takebe,0010Y00001FvC5o,National Hospital Organization Hokkaido Medica...,???? Amano,,,Face to Face,0,Event,...,Chinami Takebe,2019-07-04T00:00,,HS-APAC-JP_N,,2019-07-04T00:00,2019-07-04T02:00,2019-07-04T02:30,ECR-593174,ECR-593174
1,SY_ELS_CK_2020_Mito Medical Center,Shiho Yakabe,0010Y00001FwK3c,Mito Medical Center,?? ??,,,,0,Event,...,Shiho Yakabe,2019-11-15T00:00,,HS-APAC-JP_C,,2019-12-04T00:00,2019-11-15T03:00,2019-11-15T04:00,ECR-202329,ECR-10122302
2,,David Lee,0010Y00001D1OCN,Yonsei University,???? ????,,,Virtual,0,Event,...,David Lee,2019-04-26T00:00,,A&G-APAC-KOREA-TAIWAN-CORE,,2019-04-29T00:00,2019-04-26T01:00,2019-04-26T02:00,ECR-137560,ECR-137560
3,DL_BF 19_Yonsei,David Lee,0010Y00001D1OCN,Yonsei University,???? ????,,,Face to Face,0,Event,...,David Lee,2019-04-17T00:00,,A&G-APAC-KOREA-TAIWAN-CORE,,2019-04-22T00:00,2019-04-17T06:00,2019-04-17T07:00,ECR-137560,ECR-137560
4,,Grace Kim,0013z00002LttHV,Daejeon Sun Hospital,???,,,Face to Face,0,Event,...,Grace Kim,2019-09-26T00:00,,RM-HS-APAC-KR,,2019-09-29T00:00,2019-09-26T02:30,2019-09-26T06:00,ECR-10325865,ECR-10325865


In [128]:
# Ignore product as its empty



In [108]:
churn_activities_agg = churn_activities.groupby(
    [churn_activities_ecrid_col, 'Activity Type', 'Assigned']
).agg(num_activities=("Task", 'count')
    ).sort_values('num_activities', ascending=False).reset_index()

In [119]:
churn_activities_agg = churn_activities_agg.replace('nan',np.NaN)

churn_activities_agg[churn_activities_ecrid_col].value_counts(dropna=False)

churn_activities_agg = churn_activities_agg[~churn_activities_agg[churn_activities_ecrid_col].isnull()]

In [122]:
churn_activities_agg['Activity Type'].value_counts(dropna=False)

NaN             30729
Virtual         20764
Face to Face    19130
Phone            5615
Other            5216
Online           2642
Email             327
Name: Activity Type, dtype: int64

In [123]:
churn_activities_agg.head()

Unnamed: 0,ECR Id,Activity Type,Assigned,num_activities
27,ECR-56966,,David Everard,286
33,ECR-1038939,Other,ConfigEnv Informatica,240
38,ECR-433882,,Kate Pollara,204
39,ECR-546135,,Jamie Bridewell,200
40,ECR-754766,,Jamie Bridewell,199


In [125]:
# Encode Activity Type and Assigned

25902

## Churn risks

In [129]:
churn_risks = pd.read_hdf(path_to_hdf_datastore, 'churn_risks')

In [130]:
churn_risks.head()

Unnamed: 0,Opportunity ID,Opportunity Name,Sales Type,Agreement Number,Account Name: ECR Id,Account Name: Account Name,Risk ID,Risk Name,Risk Type,Severity,Status,Created Date,Comments,Competitor: Account Name
0,0061v00000VNfYr,2020 SDCE and titles renewal,Renewal,1-18023895783,ECR-461283,BioMarin Pharmaceutical Inc,a011v000014QTnk,HC1 Low,Other,Low,Open,2019-05-30,Account Safe,
1,0060Y00000QDakg,1-4RP8CCF,Renewal,1-12424563636,ECR-25669,Orleans University,,,,,,NaT,,
2,0060Y00000QEMR8,2019 SDOL World Kimchi Institute-KESLI,Renewal,1-16198730319,ECR-931577,World Kimchi Institute,,,,,,NaT,,
3,0060Y00000QELhM,1-6YFPY9G,Renewal,1-16271681969,ECR-447334,Shijiazhuang Tiedao University,,,,,,NaT,,
4,0060Y00000QEOdV,SHIRP - JC 2019,Renewal,1-16123182910,ECR-929803,Saskatchewan Health Information Resources Part...,,,,,,NaT,,


In [132]:
churn_risks['Severity'].value_counts(dropna=False)

NaN         235965
Low          10506
High          3902
Medium        3169
Critical       406
Very Low        10
Name: Severity, dtype: int64

In [135]:
churn_risks['Agreement Number'].isnull().value_counts()

False    154208
True      99750
Name: Agreement Number, dtype: int64

In [136]:
churn_risks[churn_risks_ecrid_col].isnull().value_counts()

False    253932
True         26
Name: Account Name: ECR Id, dtype: int64

In [141]:
churn_risks.columns

Index(['Opportunity ID', 'Opportunity Name', 'Sales Type', 'Agreement Number',
       'Account Name: ECR Id', 'Account Name: Account Name', 'Risk ID',
       'Risk Name', 'Risk Type', 'Severity', 'Status', 'Created Date',
       'Comments', 'Competitor: Account Name'],
      dtype='object')

In [142]:
churn_risks_agg = churn_risks.groupby(
    [churn_risks_ecrid_col]
).agg(num_risks=('Opportunity ID', 'count')
    ).sort_values('num_risks', ascending=False).reset_index()

In [143]:
churn_risks_agg.head()

Unnamed: 0,Account Name: ECR Id,num_risks
0,ECR-1040481,979
1,ECR-1018472,506
2,ECR-1038939,195
3,ECR-1828,182
4,ECR-739,148


## Interactions

In [145]:
interactions = pd.read_hdf(path_to_hdf_datastore, 'interaction')

In [154]:
interactions = interactions.replace('nan',np.NaN)
interactions['CREATED_TO_CLOSED_DAYS'] = interactions['CREATED_TO_CLOSED_DAYS'].astype(float)

In [155]:
interactions_agg = interactions.groupby(
    [interactions_ecrid_col]
).agg(num_incidents=('INCIDENT_ID', 'count'),
      mean_days_to_close=('CREATED_TO_CLOSED_DAYS', 'mean'),
      max_days_to_close=('CREATED_TO_CLOSED_DAYS', max)
    ).sort_values('num_incidents', ascending=False).reset_index()

In [156]:
interactions_agg.head()

Unnamed: 0,ECR_ID,num_incidents,mean_days_to_close,max_days_to_close
0,ECR-10018297,14281,0.52,84.84
1,ECR-10257631,12423,1.58,68.24
2,ECR-10274093,5953,4.71,283.43
3,ECR-10222917,5791,0.0,0.77
4,ECR-32936959,4510,3.26,370.29


## NPS

In [157]:
nps = pd.read_hdf(path_to_hdf_datastore, 'NPS_Cleansed_Data')

In [160]:
nps.head()

Unnamed: 0,ECR_ID,RESPONSEID,WAVE,DATE_OF_INTERVIEW,ORG_NAME,COUNTRY,ORGANIZATION,PRODUCT_NAME_ROLLUP,PRODUCT_DETAIL,JOB_ROLE,...,COMPETITOR_NAME,DEPARTMENT,INFLUENCE,CSAT,CSAT_COMMENT,NPS_SCORE,NPS_COMMENT,AT_RISK,VALUE_FOR_MONEY_SCORE,SHARE_WITH_CUST_DETAILS
0,ECR-635,1362709,42.0,2015-01-29,Cornell University,United States,University,,,Other,...,,Sales,No involvement,Satisfied,,0.0,prefer not to answer because I donot usually m...,At Risk - Normal,,NO
1,ECR-0,1378062,42.0,2015-02-26,,Germany,University,,,Librarian,...,,Online Customer Services,,Very satisfied,No comment,0.0,pricing policy,At Risk - Normal,,NO
2,ECR-0,1380278,42.0,2015-02-22,,France,College,,,Student,...,,Online Customer Services,,Dissatisfied,"""Je pense que vous devez changer complÃ©tement...",0.0,"My previous message explains it clearly, the p...",At Risk - Red Flag,,YES
3,ECR-1080234,1381630,42.0,2015-03-02,Miltenyi Biotec,United States,Commercial/Corporate,,,Other,...,,Sales,Make recommendation,Neutral,,0.0,Have not been contacted as part of a follow up...,At Risk - Normal,Don't know/ NA,YES
4,ECR-0,1394468,42.0,2015-03-14,,France,University,,,Researcher/scientist,...,,Online Customer Services,,Don't know,No comment,0.0,no answer,At Risk - Normal,,YES


In [166]:
nps_agg = nps.groupby(
    [NPS_ecrid_col]
).agg(mean_nps=('NPS_SCORE', 'mean')
    ).sort_values('mean_nps', ascending=False).reset_index()

In [165]:
nps_agg.head()

Unnamed: 0,ECR_ID,mean_nps
0,ECR-1111196,0.0
1,ECR-1138775,0.0
2,ECR-22037,0.0
3,ECR-475388,0.0
4,ECR-429473,0.0


(8464, 2)

## AA USAGE

In [169]:
usage = pd.read_hdf(path_to_hdf_datastore, 'usage')

In [170]:
usage.head()

Unnamed: 0,ACT_CLICK_DEPTH,ACT_DWELL_TIME_VISIT_MIN,ECR_ID,LOY_DWELL_TIME_USER_MIN,LOY_RETURN_RATE,POP_ACTIVE_USERS,POP_PAGE_VIEWS,POP_TIME_SPENT_HRS,POP_VISITS,PROD_NAME,REPORT_AGG,REPORT_DT
0,0.0,0.0,ECR-1003,0.0,0.0,0,0,0.0,0,SCOPUS,MONTH,2016-09-01
1,0.0,0.0,ECR-1003,0.0,0.0,0,0,0.0,0,SCOPUS,MONTH,2017-05-01
2,0.0,0.003968253968,ECR-1003,0.004166666667,1.05,60,0,0.0,63,SCOPUS,MONTH,2018-11-01
3,4.731138546,4.558527663466,ECR-1003,6.354047163799,1.39,523,3449,55.39,729,SCOPUS,MONTH,2019-06-01
4,0.0,0.0,ECR-10036481,0.0,0.0,0,0,0.0,0,SCOPUS,MONTH,2016-08-01


In [173]:
usage_agg = usage.groupby(
    [usage_ecrid_col, 'PROD_NAME']
).agg(mean_time_spent_hrs=('POP_TIME_SPENT_HRS', 'mean'),
      mean_visits=('POP_VISITS', 'mean')
    ).sort_values('mean_time_spent_hrs', ascending=False).reset_index()

In [174]:
usage_agg.head()

Unnamed: 0,ECR_ID,PROD_NAME,mean_time_spent_hrs,mean_visits
0,,SCOPUS,56489.56,770314.58
1,ECR-248356,SCIENCEDIRECT,12111.84,72574.76
2,ECR-77785,SCIENCEDIRECT,10420.33,72540.16
3,ECR-411616,SCIENCEDIRECT,9160.73,49278.98
4,ECR-325699,SCIENCEDIRECT,8866.31,63859.27


In [61]:
sis_ecr_mapping = pd.read_hdf(path_to_hdf_datastore, 'sis_mapping')

In [63]:
sis_ecr_mapping.head(20)

Unnamed: 0,ACCOUNT_NAME,SIS_ID,HQ_SIS_ID,OLD_SIS_ID,OLD_HQ_SIS_ID,CRM_ID,CRM_HQ_ID
0,Krankenhaus fuer Naturheilwesen - Muenchen,586801,586801,586801,586801,586801,586801
1,Fareva Holdings,ECR-574618,ECR-574618,574618,574618,ECR-574618,ECR-574618
2,Uk Inst & Fac Actuaries\Miscellaneous Addresses,587168,ECR-587167,587168,587167,587168,ECR-587167
3,Coll Santa Barbara Bus-Santa Barbara\Coll Sant...,580306,ECR-10040821,580306,580301,580306,580301
4,Global Nuclear Fuel,586936,ECR-65048,586936,182199,586936,182199
5,Nurture Your Birth\Miscellaneous Addresses,609109,609108,609109,609108,609109,609108
6,Inst Catalan Oncology\Miscellaneous Addresses,546916,ECR-10216214,546916,546915,546916,ECR-10216214
7,"Library, Bae Systems Land Systems",603044,ECR-114621,603044,114621,603044,ECR-114621
8,Med Ctr Kawaguchi Municipal\Miscellaneous Addr...,468319,ECR-468318,468319,468318,468319,ECR-468318
9,Japanese Agency Environment\Miscellaneous Addr...,65825,ECR-65824,65825,65824,65825,ECR-65824
