In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from pandas import HDFStore

import yaml

cfg = None
    #' load config.yaml file in the root dir 
with open("../config.yaml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    
# Use 3 decimal places in output display
pd.set_option("display.precision", 3)

# Don't wrap repr(DataFrame) across additional lines
pd.set_option("display.expand_frame_repr", False)

# Set max rows displayed in output to 25
pd.set_option("display.max_rows", 25)

pd.set_option('display.float_format', lambda x: '%.2f' % x)


#Function to get data frame summary
def get_data_frame_summmary(data_frame, sort_by='percent_missing', ascending=False):
    unique_values = data_frame.apply(lambda x: [x.unique()])
    unique_counts = data_frame.apply(lambda x: len(x.unique()))
    percent_missing = data_frame.apply(lambda x: sum(pd.isnull(x))/len(x)*100)
    data_type = data_frame.dtypes

    return pd.DataFrame(dict(unique_values = unique_values, 
                                unique_counts = unique_counts,
                                data_type = data_type,
                                percent_missing = percent_missing,
                                )).reset_index().sort_values(by=sort_by, ascending=ascending)

# Function to drop outliers of numeric columns
def drop_outliers(data_frame, exclude=[]):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    num_cols = data_frame.select_dtypes(include=numerics).columns

    num_cols = num_cols.difference(exclude)
    
    for colname in num_cols:
        upper_lim = data_frame[colname].quantile(.95)
        lower_lim = data_frame[colname].quantile(.05)
        
        print(f'Dropping outliers for {colname} upper limit = {upper_lim} and lower limit = {lower_lim}')
        data_frame = data_frame[(data_frame[colname] < upper_lim) & (data_frame[colname] > lower_lim)]
      
    return data_frame


# Function to drop outliers of numeric columns
def scale_numeric_features(data_frame, exclude=[], 
                           method='standardize',
                           inplace=False):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    num_cols = data_frame.select_dtypes(include=numerics).columns

    num_cols = num_cols.difference(exclude)
    print(f'********************* - Scaling following {len(num_cols)} features - **********************')
    for colname in num_cols:
        new_colname = colname if inplace else colname+'_'+method+'d'
        print(f' {colname} {method}d to {new_colname}')
        
        if method == 'standardize':              
              data_frame[new_colname] = (data_frame[colname] - data_frame[colname].mean()) / data_frame[colname].std()
        elif method == 'normalize':
              data_frame[new_colname] = (data_frame[colname] - data_frame[colname].min()) / (data_frame[colname].max() - data_frame[colname].min())
        else:
              print(f'Unknown method {method} specified, please select one of "standardize" or "normalize"')

      
    return data_frame


def remove_items(iteamlist, removelist):
    return [ele for ele in iteamlist if ele not in removelist]

def diff(first, second):
    second = set(second)
    return [item for item in first if item not in second]
    
def months_between_columns(date_series1, date_series2):
    return round((date_series1-date_series2) / np.timedelta64(1, 'M'))
    
path_to_hdf_datastore = cfg['path_to_hdf_datastore']
path_to_clean_hdf_datastore = cfg['path_to_clean_hdf_datastore']

In [2]:
store = HDFStore(path_to_hdf_datastore)
print(store.keys())
store.close()

['/Account_Assignment', '/DataCR_from_2015_Cancellations_Source_Systems', '/DataCRjournals', '/DataCRother', '/ECH_Customer_Data', '/NPS_Cleansed_Data', '/Product_Assignment_2019', '/SIS_ECRID_Mapping', '/churn_activities', '/churn_products', '/churn_risks_V02', '/churn_vps', '/hierarchy', '/interaction', '/sd_fact_cntr-2019-01-0000_part_00', '/sis_mapping', '/usage']


In [3]:
# Read Data
other_products_contracts = pd.read_hdf(path_to_hdf_datastore, cfg['other_contracts_file'])

df = other_products_contracts
del other_products_contracts

df.head()

Unnamed: 0,Agreement End Date,Agreement Number,Agreement Start Date,Bookigns - Committed Print(Rep),Bookings - Final Net Price - Agent Discount Amount(Rep),Business Division (Agreement SIS),Business Indicator,Calculated New/Renewal,Country Name (Agreement SIS),Division,...,Saleable Product Name (Source),Sales Division (Agreement SIS),Sales Type,Status,Status Change Date,Subregion Grouping,Subscription End Date,Subscription Start Date,Subscription Start Year,WIP Flag
0,2015-03-18,DLT1,2015-03-18,0,0.0,A&G_NOAM,,New,United States,HS,...,Clinical Chemistry,AG-NOAM-CAN-NEW-WEST0-15D,,N.A,0-00-00,Educational,2015-03-18,2015-03-18,2015,INVOICED
1,2015-03-18,DLT1,2015-03-18,0,0.0,A&G_NOAM,,New,United States,HS,...,Rodak's Hematology,AG-NOAM-CAN-NEW-WEST0-15D,,N.A,0-00-00,Educational,2015-03-18,2015-03-18,2015,INVOICED
2,2015-04-10,DLT1,2015-04-10,0,0.0,A&G_NOAM,,New,United States,HS,...,Linne & Ringsrud's Clinical Laboratory Science,AG-NOAM-CAN-NEW-WEST0-15D,,N.A,0-00-00,Educational,2015-04-10,2015-04-10,2015,INVOICED
3,2015-07-28,DLT1,2015-07-28,0,0.0,A&G_NOAM,,New,United States,HS,...,Echocardiography for the Neonatologist,AG-NOAM-CAN-NEW-WEST0-15D,,N.A,0-00-00,Educational,2015-07-28,2015-07-28,2015,INVOICED
4,2015-03-03,DLT1,2015-03-03,0,0.0,A&G_NOAM,,New,United States,HS,...,Epidemiology,AG-NOAM-CAN-NEW-WEST0-15D,,N.A,0-00-00,Educational,2015-03-03,2015-03-03,2015,INVOICED


# Replace strings indicating missing data with null

In [4]:
#replace strings indicating missing data with null
df.replace(['nan', 'N.A', 'N.A.', 'NaN', 'Nan', '00-00-00', '0-00-00'], np.nan, inplace=True)
df

Unnamed: 0,Agreement End Date,Agreement Number,Agreement Start Date,Bookigns - Committed Print(Rep),Bookings - Final Net Price - Agent Discount Amount(Rep),Business Division (Agreement SIS),Business Indicator,Calculated New/Renewal,Country Name (Agreement SIS),Division,...,Saleable Product Name (Source),Sales Division (Agreement SIS),Sales Type,Status,Status Change Date,Subregion Grouping,Subscription End Date,Subscription Start Date,Subscription Start Year,WIP Flag
0,2015-03-18,DLT1,2015-03-18,0,0.00,A&G_NOAM,,New,United States,HS,...,Clinical Chemistry,AG-NOAM-CAN-NEW-WEST0-15D,,,,Educational,2015-03-18,2015-03-18,2015,INVOICED
1,2015-03-18,DLT1,2015-03-18,0,0.00,A&G_NOAM,,New,United States,HS,...,Rodak's Hematology,AG-NOAM-CAN-NEW-WEST0-15D,,,,Educational,2015-03-18,2015-03-18,2015,INVOICED
2,2015-04-10,DLT1,2015-04-10,0,0.00,A&G_NOAM,,New,United States,HS,...,Linne & Ringsrud's Clinical Laboratory Science,AG-NOAM-CAN-NEW-WEST0-15D,,,,Educational,2015-04-10,2015-04-10,2015,INVOICED
3,2015-07-28,DLT1,2015-07-28,0,0.00,A&G_NOAM,,New,United States,HS,...,Echocardiography for the Neonatologist,AG-NOAM-CAN-NEW-WEST0-15D,,,,Educational,2015-07-28,2015-07-28,2015,INVOICED
4,2015-03-03,DLT1,2015-03-03,0,0.00,A&G_NOAM,,New,United States,HS,...,Epidemiology,AG-NOAM-CAN-NEW-WEST0-15D,,,,Educational,2015-03-03,2015-03-03,2015,INVOICED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10903058,2022-07-31,1-17619408550,2019-08-01,0,2226.42,A&G_NOAM,Renewal,Renewal,United States,A&G,...,Sustainable Energy & Development - Academic,AG-NOAM-CAN-NEW-WEST0-18D,Active Renewal,Complete,2019-07-29,Region Canada New West,2021-07-31,2020-08-01,2020,WIP
10903059,2022-07-31,1-17619408550,2019-08-01,0,3041.39,A&G_NOAM,Renewal,Renewal,United States,A&G,...,Transportation Engineering - Academic,AG-NOAM-CAN-NEW-WEST0-18D,Active Renewal,Complete,2019-07-29,Region Canada New West,2021-07-31,2020-08-01,2020,WIP
10903060,2021-09-30,1-17743442609,2018-10-01,0,53405.00,A&G_NOAM,Renewal,Renewal,United States,A&G,...,DC & EGS Combo - Universities,AG-NOAM-CAN-NEW-WEST0-18D,Standard Renewal,Complete,2019-08-14,Region Canada New West,2021-09-30,2020-10-01,2020,WIP
10903061,2020-12-31,1-18231914317,2020-01-01,0,49394.00,A&G_NOAM,Renewal,Renewal,United States,A&G,...,Inspec,AG-NOAM-CAN-NEW-WEST0-18D,Active Renewal,With Customer,2019-12-30,Region Canada New West,2020-12-31,2020-01-01,2020,WIP


In [5]:
for i in range(0,df.shape[1]):
    print(df.iloc[:,i].value_counts())

2016-12-31    59329
2018-12-31    52852
2017-12-31    51543
2015-12-31    50983
2019-12-31    47101
              ...  
2015-08-15        1
2015-01-10        1
2021-04-06        1
2024-12-23        1
2024-01-31        1
Name: Agreement End Date, Length: 2999, dtype: int64
DLT1             9221587
DLF1               39553
UPL-5               9654
1-17213586937       3301
1-13806942606       2788
                  ...   
1-11518637494          1
20068                  1
1-12895864019          1
1-15537761693          1
1-15997341046          1
Name: Agreement Number, Length: 169061, dtype: int64
2016-01-01    66666
2015-01-01    48526
2017-01-01    46828
2018-01-01    45675
2019-01-01    41809
              ...  
2020-12-28        1
2020-01-05        1
2014-08-05        1
2020-05-30        1
2013-10-30        1
Name: Agreement Start Date, Length: 2277, dtype: int64
0    10903063
Name: Bookigns - Committed Print(Rep), dtype: int64
0.00       1464176
0.01         41196
0.02         31146
0

2020-11-30    16979
2019-12-31     7336
2020-02-29     5095
2020-10-30     4583
2020-06-30     4345
              ...  
2015-03-19        1
2015-09-09        1
2021-08-01        1
2018-03-19        1
2020-12-16        1
Name: Renewal Exp Complete Date, Length: 2007, dtype: int64
ECR-576965      570502
ECR-79671       523755
ECR-953255      442650
ECR-10297731    408681
ECR-973351      225118
                 ...  
543063               1
ECR-415189           1
494305               1
ECR-21212049         1
346672               1
Name: SIS Id  (Agreement SIS), Length: 34091, dtype: int64
Medical-Surgical Nursing                         45202
Fundamentals of Nursing                          33099
PART - Evolve Universal Single Access Card       31639
Physical Examination and Health Assessment       28069
Nursing Diagnosis Handbook                       27275
                                                 ...  
DIETETIQUE POUR L'AS ET L'AP POD                     1
Cust: Fall16 RES 130 e-

### Summary of the dataframe

In [6]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(get_data_frame_summmary(df))

Unnamed: 0,index,unique_values,unique_counts,data_type,percent_missing
14,Parent Agreement Number,"[[nan, 1-7742179017, 1-7374363223, 1-539663949...",91170,object,96.58
24,Renewal Exp Complete Date,"[[nan, 2016-02-12, 2016-03-10, 2016-01-04, 201...",2008,object,96.22
16,Payment Term Description,"[[nan, Net Due in 30 days After Invoice Date, ...",102,object,93.03
17,Payment Term Type,"[[nan, NET, SPL/MTH, AS_IS_DATE]]",4,object,93.03
15,Payment Term,"[[nan, 30 NET, 60 NET, SPL4/MTH12, 120 NET, 15...",105,object,93.03
30,Status Change Date,"[[nan, 2015-05-04, 2015-02-12, 2015-03-10, 201...",1549,object,92.81
29,Status,"[[nan, Complete, Cancelled Complete, Merged Co...",16,object,92.72
28,Sales Type,"[[nan, PF, LF, GR, Cancelled, Evergreen Renewa...",13,object,92.59
6,Business Indicator,"[[nan, New Sale, Renewal, Cancelled, Credit, N...",7,object,84.94
1,Agreement Number,"[[DLT1, DLF1, UPL-5, 1-8185280178, 1-826917069...",169062,object,7.13


# Identify Missing Data

Missing values affect the performance of the machine learning models and its useful to identify and drop or impute missing values before modelling.

We will quantify the missing data and drop any columns lower than the given threshold. we have set threshold of 70% so any columns and rows with missing data over 70% are dropped.

In [8]:
#set threshold
threshold = 0.7


#Dropping columns with missing value rate higher than threshold
temp = df[df.columns[df.isnull().mean() < threshold]]

print('-------------------- Dropping Columns with missing data --------------------')
print(f'Following {len(df.columns.difference(temp.columns))} columns have missing data over the threshold and will be removed')
print(df.columns.difference(temp.columns))

df = temp

print('-------------------- Dropping rows with missing data --------------------')
print(f' There are {sum(df.isnull().mean(axis=1) > threshold)} rows with missing data over the threshold')

#Dropping rows with missing value rate higher than threshold
df = df.loc[df.isnull().mean(axis=1) < threshold]

# remove temp object from memory
del temp

#manually remove borderline case

-------------------- Dropping Columns with missing data --------------------
Following 0 columns have missing data over the threshold and will be removed
Index([], dtype='object')
-------------------- Dropping rows with missing data --------------------
 There are 0 rows with missing data over the threshold


In [9]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(get_data_frame_summmary(df))

Unnamed: 0,index,unique_values,unique_counts,data_type,percent_missing
1,Agreement Number,"[[DLT1, DLF1, UPL-5, 1-8185280178, 1-826917069...",169062,object,7.13
22,Subregion Grouping,"[[Educational, NOAM Trade-NOAM, HS United King...",129,object,5.42
8,Division,"[[HS, Corporate, nan, Global, A&G]]",5,object,5.42
18,RSO,"[[NOAM, EMEALA, APAC, nan, Global, MAEU, A&G V...",13,object,5.42
10,Invoice Date,"[[2015-03-18, 2015-04-10, 2015-07-28, 2015-03-...",1969,object,0.34
11,Invoice Num,"[[34784DA4, 34784DA5, 37676DA9, 51003DA1, 3289...",2815086,object,0.34
17,Product Revenue Type,"[[One Off, Recurring, nan, One Off/Recurring]]",4,object,0.01
20,Saleable Product Name (Source),"[[Clinical Chemistry, Rodak's Hematology, Linn...",82747,object,0.0
4,Bookings - Final Net Price - Agent Discount Am...,"[[0.0, 225.0, 247.5, 42.866599732, 45.54660227...",916996,float64,0.0
0,Agreement End Date,"[[2015-03-18, 2015-04-10, 2015-07-28, 2015-03-...",2999,object,0.0


### Data Imputation

We can impute missing data with meaningful data so that model development has good quality of data

We can impute categorical variables with the most frequestly occuring value and impute numerical variables with 0 or mean or median depending on the variable context.

In [11]:
#Filling all missing values with 0
# data = data.fillna(0)

#Filling missing values with the most frequest values

# Handling Outliers

Extreme value can skew the data distribution and thus affect the model development we identify outliers in numeric variables and handle them by removing or capping.

### Outlier Detection with Percentiles

In [12]:
#Dropping the outlier rows with Percentiles

df = drop_outliers(df, exclude = ['Bookigns - Committed Print(Rep)'])

Dropping outliers for Bookings - Final Net Price - Agent Discount Amount(Rep) upper limit = 1552.5 and lower limit = -208.5
Dropping outliers for Subscription Start Year upper limit = 2019.0 and lower limit = 2015.0


# Binning

The main motivation of binning is to make the model more robust and prevent overfitting, however, it has a cost to the performance. Every time you bin something, you sacrifice information and make your data more regularized

The trade-off between performance and overfitting is the key point of the binning process


For Categorical variables the labels with low frequencies probably affect the robustness of statistical models negatively. Thus, assigning a general category to these less frequent values helps to keep the robustness of the model.

it is a good option to unite the labels with a count less than 100 to a new category like “Other”.

In [13]:
binning_summary = get_data_frame_summmary(df, 
                                    sort_by='unique_counts', 
                                    ascending=True)   

### Columns with less than 100 unique categories

In [14]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(binning_summary[binning_summary['unique_counts'] < 100])
    

Unnamed: 0,index,unique_values,unique_counts,data_type,percent_missing
3,Bookigns - Committed Print(Rep),[[0]],1,int64,0.0
6,Calculated New/Renewal,"[[New, Renewal]]",2,object,0.0
25,Subscription Start Year,"[[2016, 2017, 2018]]",3,int64,0.0
17,Product Revenue Type,"[[One Off, Recurring, nan, One Off/Recurring]]",4,object,0.0
8,Division,"[[HS, Corporate, Global, nan, A&G]]",5,object,5.06
26,WIP Flag,"[[INVOICED, WIP, MERGED, WIP_PPL, DELETED, CNI]]",6,object,0.0
5,Business Division (Agreement SIS),"[[A&G_NOAM, Corporate, HS EMEA/LA, A&G_MAEU, C...",9,object,0.0
18,RSO,"[[NOAM, EMEALA, APAC, Corporate Vendor, Global...",10,object,5.06
13,Product Line Level 1,"[[Books, NHP eSolutions, Performance Solutions...",15,object,0.0


In [15]:
columns_to_bin = binning_summary[binning_summary['unique_counts'] < 100]['index'].to_list()

In [16]:
# select columns with less than 100 labels
df = df.apply(lambda x: x.mask(x.map(x.value_counts())<10, 'Other') if x.name in columns_to_bin else x)

### After Binning - Columns with less than 100 unique categories

In [17]:
binning_summary = get_data_frame_summmary(df, 
                                    sort_by='unique_counts', 
                                    ascending=True)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(binning_summary[binning_summary['unique_counts'] < 100])

Unnamed: 0,index,unique_values,unique_counts,data_type,percent_missing
3,Bookigns - Committed Print(Rep),[[0]],1,int64,0.0
6,Calculated New/Renewal,"[[New, Renewal]]",2,object,0.0
25,Subscription Start Year,"[[2016, 2017, 2018]]",3,int64,0.0
17,Product Revenue Type,"[[One Off, Recurring, nan, Other]]",4,object,0.0
8,Division,"[[HS, Corporate, Global, nan, A&G]]",5,object,5.06
26,WIP Flag,"[[INVOICED, WIP, MERGED, WIP_PPL, DELETED, Oth...",6,object,0.0
5,Business Division (Agreement SIS),"[[A&G_NOAM, Corporate, HS EMEA/LA, A&G_MAEU, C...",9,object,0.0
18,RSO,"[[NOAM, EMEALA, APAC, Corporate Vendor, Global...",10,object,5.06
13,Product Line Level 1,"[[Books, NHP eSolutions, Performance Solutions...",15,object,0.0


## Log Transform

Logarithm transformation (or log transform) is one of the most commonly used mathematical transformations

It helps to handle skewed data and after transformation, the distribution becomes more approximate to normal.
In most of the cases the magnitude order of the data changes within the range of the data. log transform normalizes the magnitude differences.

It also decreases the effect of the outliers, due to the normalization of magnitude differences and the model become more robust.

Important note: The data you apply log transform must have only positive values, otherwise you receive an error. Also, you can add 1 to your data before transform it. Thus, you ensure the output of the transformation to be positive.


In [18]:
# No Log transform applied 

## Scaling

In most cases, the numerical features of the dataset do not have a certain range and they differ from each other. Scaling solves this problem. The continuous features become identical in terms of the range, after a scaling process. This process is not mandatory for many algorithms, but it might be still nice to apply. 

### Normalization

Normalization (or min-max normalization) scale all values in a fixed range between 0 and 1. This transformation does not change the distribution of the feature and due to the decreased standard deviations, the effects of the outliers increases. Therefore, before normalization, it is recommended to handle the outliers

### Standardization
Standardization (or z-score normalization) scales the values while taking into account standard deviation. If the standard deviation of features is different, their range also would differ from each other. This reduces the effect of the outliers in the features.
In the following formula of standardization, the mean is shown as μ and the standard deviation is shown as σ.

In [19]:
df = scale_numeric_features(df, inplace=True)

********************* - Scaling following 3 features - **********************
 Bookigns - Committed Print(Rep) standardized to Bookigns - Committed Print(Rep)
 Bookings - Final Net Price - Agent Discount Amount(Rep) standardized to Bookings - Final Net Price - Agent Discount Amount(Rep)
 Subscription Start Year standardized to Subscription Start Year


## One-hot encoding

This method spreads the values in a column to multiple flag columns and assigns 0 or 1 to them. These binary values express the relationship between grouped and encoded column.

This method changes your categorical data, which is challenging to understand for algorithms, to a numerical format

If you have N distinct values in the column, it is enough to map them to N-1, as the missing value can be deduced from the other columns



In [20]:
print(columns_to_bin)

['Bookigns - Committed Print(Rep)', 'Calculated New/Renewal', 'Subscription Start Year', 'Product Revenue Type', 'Division', 'WIP Flag', 'Business Division (Agreement SIS)', 'RSO', 'Product Line Level 1']


In [21]:
# Specify Columns to encode
columns_to_exclude = ['Product Line Level 3','Payment Term Description','Payment Term', 
                      'Business Division (Agreement SIS)', 'Product Line Level 4']
columns_to_include = columns_to_bin
columns_to_encode = remove_items(columns_to_include, columns_to_exclude)
columns_to_encode

['Bookigns - Committed Print(Rep)',
 'Calculated New/Renewal',
 'Subscription Start Year',
 'Product Revenue Type',
 'Division',
 'WIP Flag',
 'RSO',
 'Product Line Level 1']

In [22]:
for column in columns_to_encode:
    encoded_columns = pd.get_dummies(df[column])
    print(f'Encoding collumns : {column} to {len(encoded_columns.columns)} new encoded columns')
    df = df.join(encoded_columns, rsuffix='_'+column).drop(column, axis=1)

Encoding collumns : Bookigns - Committed Print(Rep) to 0 new encoded columns
Encoding collumns : Calculated New/Renewal to 2 new encoded columns
Encoding collumns : Subscription Start Year to 3 new encoded columns
Encoding collumns : Product Revenue Type to 3 new encoded columns
Encoding collumns : Division to 4 new encoded columns
Encoding collumns : WIP Flag to 6 new encoded columns
Encoding collumns : RSO to 9 new encoded columns
Encoding collumns : Product Line Level 1 to 15 new encoded columns


In [23]:
df.shape

(6062961, 61)

In [24]:
df.head()

Unnamed: 0,Agreement End Date,Agreement Number,Agreement Start Date,Bookings - Final Net Price - Agent Discount Amount(Rep),Business Division (Agreement SIS),Country Name (Agreement SIS),HQ SIS Id (Agreement SIS),Invoice Date,Invoice Num,Name (Agreement SIS),...,Licensing,Life Sciences,NHP eSolutions,Patient Engagement,Performance Solutions,Reference,Research Intelligence,Research Workflow Solutions,Unspecified,Workflow
2179432,2016-10-14,DLT1,2016-10-14,-0.1,A&G_NOAM,United States,1000,2016-10-14,24894DC3,Miscellaneous Addresses,...,0,0,0,0,0,0,0,0,0,0
2179433,2016-10-14,DLT1,2016-10-14,-0.13,A&G_NOAM,United States,1000,2016-10-14,24894DC3,Miscellaneous Addresses,...,0,0,0,0,0,0,0,0,0,0
2179434,2016-12-20,DLT1,2016-12-20,0.37,A&G_NOAM,United States,1000,2016-12-20,32661DC0,Miscellaneous Addresses,...,0,0,0,0,0,0,0,0,0,0
2179435,2016-03-08,DLT1,2016-03-08,-0.47,A&G_NOAM,United States,1004,2016-03-08,84697DA9,University of California - San Diego La Jolla,...,0,0,0,0,0,0,0,0,0,0
2179436,2016-09-12,DLT1,2016-09-12,-0.47,A&G_NOAM,United States,1004,2016-09-12,19585DC3,University of California - San Diego La Jolla,...,0,0,0,0,0,0,0,0,0,0


## Transform Date columns

Recency and Frequency Variables and length / duration.


In [27]:
# convert date columns to date data type

date_columns = ['Agreement Start Date', 'Agreement End Date','Subscription Start Date', 'Subscription End Date','Invoice Date']

In [28]:
for column in date_columns:
    df[column] = pd.to_datetime(df[column])

In [29]:
# calcualte new variable subscription_length and agreement_length
df['subscription_length'] = months_between_columns(df['Subscription End Date'],df['Subscription Start Date'])

In [30]:
df['agreement_length'] = months_between_columns(df['Agreement End Date'],df['Agreement Start Date'])

In [31]:
 df.head()

Unnamed: 0,Agreement End Date,Agreement Number,Agreement Start Date,Bookings - Final Net Price - Agent Discount Amount(Rep),Business Division (Agreement SIS),Country Name (Agreement SIS),HQ SIS Id (Agreement SIS),Invoice Date,Invoice Num,Name (Agreement SIS),...,NHP eSolutions,Patient Engagement,Performance Solutions,Reference,Research Intelligence,Research Workflow Solutions,Unspecified,Workflow,subscription_length,agreement_length
2179432,2016-10-14,DLT1,2016-10-14,-0.1,A&G_NOAM,United States,1000,2016-10-14,24894DC3,Miscellaneous Addresses,...,0,0,0,0,0,0,0,0,0.0,0.0
2179433,2016-10-14,DLT1,2016-10-14,-0.13,A&G_NOAM,United States,1000,2016-10-14,24894DC3,Miscellaneous Addresses,...,0,0,0,0,0,0,0,0,0.0,0.0
2179434,2016-12-20,DLT1,2016-12-20,0.37,A&G_NOAM,United States,1000,2016-12-20,32661DC0,Miscellaneous Addresses,...,0,0,0,0,0,0,0,0,0.0,0.0
2179435,2016-03-08,DLT1,2016-03-08,-0.47,A&G_NOAM,United States,1004,2016-03-08,84697DA9,University of California - San Diego La Jolla,...,0,0,0,0,0,0,0,0,0.0,0.0
2179436,2016-09-12,DLT1,2016-09-12,-0.47,A&G_NOAM,United States,1004,2016-09-12,19585DC3,University of California - San Diego La Jolla,...,0,0,0,0,0,0,0,0,0.0,0.0


## Remove unused and/or redundant Features

Unused features are those that don’t make sense to pass into our machine learning algorithms. such as ID columns
Features that wouldn't be available at the time of prediction, Other text descriptions

Redundant features would typically be those that have been replaced by other features that you’ve added during feature engineering


In [34]:
# @TODO
# check the columns use of 'Status Change Date', 'Renewal Exp Complete Date'
# check 'Bookigns - Committed Print(Rep)'

columns_to_remove = ['HQ SIS Id (Agreement SIS)', 'Name  (Agreement SIS)', 'Invoice Num', 'Invoice Date', 'Product Line Level 3', 'Product Line Level 4']

In [35]:
df = df.drop(columns_to_remove, axis=1)

In [36]:
df.head()

Unnamed: 0,Agreement End Date,Agreement Number,Agreement Start Date,Bookings - Final Net Price - Agent Discount Amount(Rep),Business Division (Agreement SIS),Country Name (Agreement SIS),Product Line Level 2,SIS Id (Agreement SIS),Saleable Product Name (Source),Sales Division (Agreement SIS),...,NHP eSolutions,Patient Engagement,Performance Solutions,Reference,Research Intelligence,Research Workflow Solutions,Unspecified,Workflow,subscription_length,agreement_length
2179432,2016-10-14,DLT1,2016-10-14,-0.1,A&G_NOAM,United States,Nursing & Health Professionals P_ELP,1001,"Brinker, Piermattei and Flo's Handbook of Smal...",AG-NOAM-SOUTH00000000-01D,...,0,0,0,0,0,0,0,0,0.0,0.0
2179433,2016-10-14,DLT1,2016-10-14,-0.13,A&G_NOAM,United States,Nursing & Health Professionals P_ELP,1001,Piermattei's Atlas of Surgical Approaches to t...,AG-NOAM-SOUTH00000000-01D,...,0,0,0,0,0,0,0,0,0.0,0.0
2179434,2016-12-20,DLT1,2016-12-20,0.37,A&G_NOAM,United States,Nursing & Health Professionals P_ELP,1001,PART - Textbook of Veterinary Internal Medicin...,AG-NOAM-SOUTH00000000-01D,...,0,0,0,0,0,0,0,0,0.0,0.0
2179435,2016-03-08,DLT1,2016-03-08,-0.47,A&G_NOAM,United States,Research Reference P_ELP,1003,Applied Groundwater Modeling,AG-NOAM-CAN-NEW-WEST0-15D,...,0,0,0,0,0,0,0,0,0.0,0.0
2179436,2016-09-12,DLT1,2016-09-12,-0.47,A&G_NOAM,United States,Medical Education P_ELP,1003,Epidemiology,AG-NOAM-CAN-NEW-WEST0-15D,...,0,0,0,0,0,0,0,0,0.0,0.0


Save clean data to hdf store

In [38]:
store = HDFStore(path_to_clean_hdf_datastore)
df.loc[:, ['Agreement Number', 'Business Division (Agreement SIS)', 'Country Name (Agreement SIS)', 'Product Line Level 2', 'SIS Id  (Agreement SIS)', 'Saleable Product Name (Source)', 'Sales Division (Agreement SIS)', 'Subregion Grouping']] = df.loc[:, ['Agreement Number', 'Business Division (Agreement SIS)', 'Country Name (Agreement SIS)', 'Product Line Level 2', 'SIS Id  (Agreement SIS)', 'Saleable Product Name (Source)', 'Sales Division (Agreement SIS)', 'Subregion Grouping']].astype(str)
store.put(key=cfg['other_contracts_file']+"_clean", value=df)
store.close()


In [39]:
store = HDFStore(path_to_clean_hdf_datastore)
print(store.keys())
store.close()

['/Account_Assignment_clean', '/DataCR_from_2015_Cancellations_Source_Systems_clean', '/DataCRjournals_clean', '/DataCRother_clean', '/ECH_Customer_Data_clean', '/NPS_Cleansed_Data_clean', '/Product_Assignment_2019_clean', '/churn_activities_clean', '/churn_risks_V02_clean', '/interaction_clean', '/usage_clean']
