In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from pandas import HDFStore

import yaml

# Project configuration, read from config.yaml one directory above the working directory
cfg = None
# NOTE(review): yaml.FullLoader can build more than plain scalars and containers;
# if the config only holds plain values, yaml.safe_load would be the safer call.
with open("../config.yaml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

# Use 3 decimal places in output display
# NOTE(review): a two-decimal float_format is also set at the end of this block;
# presumably that formatter wins for float values - confirm which one is intended.
pd.set_option("display.precision", 3)

# Don't wrap repr(DataFrame) across additional lines
pd.set_option("display.expand_frame_repr", False)

# Set max rows displayed in output to 25
pd.set_option("display.max_rows", 25)

# Render floats with two decimal places
pd.set_option('display.float_format', lambda x: '%.2f' % x)


# Function to get data frame summary
def get_data_frame_summmary(data_frame, sort_by='percent_missing', ascending=False):
    """Build a one-row-per-column summary of ``data_frame``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to describe.
    sort_by : str, default 'percent_missing'
        Summary column used to order the result.
    ascending : bool, default False
        Sort direction passed to ``sort_values``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``index`` (the original column label, produced by
        ``reset_index``), ``unique_values`` (a one-element list holding the
        array returned by ``Series.unique``), ``unique_counts`` (number of
        distinct values, missing counted as one value), ``data_type`` and
        ``percent_missing`` (0-100; NaN for a frame without rows).
    """
    columns = data_frame.columns
    n_columns = len(columns)

    # The statistics are collected column by column instead of through
    # DataFrame.apply with a list-returning function: whether that call yields
    # a Series or a DataFrame depends on the pandas version.
    unique_values = np.empty(n_columns, dtype=object)
    unique_counts = np.empty(n_columns, dtype='int64')
    percent_missing = np.empty(n_columns, dtype='float64')

    for position in range(n_columns):
        # Positional access keeps this working for any column label type.
        column = data_frame.iloc[:, position]
        distinct = column.unique()
        unique_values[position] = [distinct]
        unique_counts[position] = len(distinct)
        # Vectorised share of nulls; avoids a Python-level sum over every row.
        percent_missing[position] = column.isnull().mean() * 100

    summary = pd.DataFrame({
        'unique_values': pd.Series(unique_values, index=columns),
        'unique_counts': pd.Series(unique_counts, index=columns),
        'data_type': data_frame.dtypes,
        'percent_missing': pd.Series(percent_missing, index=columns),
    })
    return summary.reset_index().sort_values(by=sort_by, ascending=ascending)

# Function to drop outliers of numeric columns
def drop_outliers(data_frame, exclude=None, lower_quantile=.05, upper_quantile=.95):
    """Drop rows whose numeric values fall outside a quantile band.

    Each numeric column (except those in ``exclude``) is processed in turn.
    The limits of a column are computed on the rows that survived the columns
    processed before it, so the filters compound.

    Only rows strictly between the two limits are kept: a value equal to a
    limit is dropped, and so is a missing value (comparisons with NaN are
    false).

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Input frame; it is not modified, a filtered frame is returned.
    exclude : str or iterable of str, optional
        Column name(s) that must not be used for filtering.
    lower_quantile, upper_quantile : float, default .05 / .95
        Quantiles (0-1) that define the lower and upper limit.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data_frame`` that passed every column filter.

    Raises
    ------
    ValueError
        If the quantiles are not ordered within [0, 1].
    """
    if not 0 <= lower_quantile <= upper_quantile <= 1:
        raise ValueError('quantiles must satisfy 0 <= lower_quantile <= upper_quantile <= 1')

    # A None default avoids sharing one mutable list between calls.
    if exclude is None:
        excluded = []
    elif isinstance(exclude, str):
        excluded = [exclude]
    else:
        excluded = list(exclude)

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    num_cols = data_frame.select_dtypes(include=numerics).columns.difference(excluded)

    for colname in num_cols:
        upper_lim = data_frame[colname].quantile(upper_quantile)
        lower_lim = data_frame[colname].quantile(lower_quantile)

        print(f'Dropping outliers for {colname} upper limit = {upper_lim} and lower limit = {lower_lim}')
        within_limits = (data_frame[colname] < upper_lim) & (data_frame[colname] > lower_lim)
        data_frame = data_frame[within_limits]

    return data_frame


# Function to scale (standardize or min-max normalize) numeric columns
def scale_numeric_features(data_frame, exclude=None,
                           method='standardize',
                           inplace=False):
    """Scale every numeric column of ``data_frame``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to scale. The scaled columns are written into this same object.
    exclude : str or iterable of str, optional
        Column name(s) to leave untouched.
    method : {'standardize', 'normalize'}, default 'standardize'
        'standardize' subtracts the mean and divides by the standard
        deviation; 'normalize' maps the column onto [0, 1] using its minimum
        and maximum.
    inplace : bool, default False
        If True the scaled values overwrite the source column; otherwise they
        are stored in a new column named ``<column>_<method>d``.

    Returns
    -------
    pandas.DataFrame
        ``data_frame`` itself. For an unknown ``method`` a message is printed
        once and the frame is returned unchanged.

    Notes
    -----
    A column without any spread (standard deviation or max - min equal to 0)
    is scaled to 0 instead of dividing by zero.
    """
    # A None default avoids sharing one mutable list between calls.
    if exclude is None:
        excluded = []
    elif isinstance(exclude, str):
        excluded = [exclude]
    else:
        excluded = list(exclude)

    # Validate once, before announcing any work, instead of once per column.
    if method not in ('standardize', 'normalize'):
        print(f'Unknown method {method} specified, please select one of "standardize" or "normalize"')
        return data_frame

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    num_cols = data_frame.select_dtypes(include=numerics).columns.difference(excluded)

    print(f'********************* - Scaling following {len(num_cols)} features - **********************')
    for colname in num_cols:
        new_colname = colname if inplace else colname+'_'+method+'d'
        print(f' {colname} {method}d to {new_colname}')

        values = data_frame[colname]
        if method == 'standardize':
            offset = values.mean()
            spread = values.std()
        else:
            offset = values.min()
            spread = values.max() - offset

        # Constant column: dividing by 1 yields 0 rather than NaN/inf.
        # A NaN spread (e.g. a single row) compares unequal to 0 and is kept.
        if spread == 0:
            spread = 1

        data_frame[new_colname] = (values - offset) / spread

    return data_frame


def remove_items(iteamlist, removelist):
    """Return the elements of ``iteamlist`` that do not occur in ``removelist``.

    Order and duplicates of the kept elements are preserved; neither input is
    modified.
    """
    kept = []
    for candidate in iteamlist:
        if candidate in removelist:
            continue
        kept.append(candidate)
    return kept

def diff(first, second):
    """Return the items of ``first`` that are absent from ``second``.

    ``second`` is turned into a set for the membership checks; the order and
    duplicates of ``first`` are preserved in the result.
    """
    excluded = set(second)
    remaining = []
    for item in first:
        if item not in excluded:
            remaining.append(item)
    return remaining
    
def months_between_columns(date_series1, date_series2):
    """Return the rounded number of months from ``date_series2`` to ``date_series1``.

    The difference ``date_series1 - date_series2`` is divided by the length of
    an average Gregorian month (365.2425 / 12 days) and rounded to the nearest
    whole number.
    """
    # 365.2425 / 12 days = 30.436875 days = 2,629,746 seconds. This is the
    # fixed duration that np.timedelta64(1, 'M') stood for when used as a
    # divisor; newer pandas versions reject the ambiguous 'M' unit, so the
    # duration is spelled out explicitly.
    average_month = pd.Timedelta(seconds=2629746)
    return round((date_series1 - date_series2) / average_month)
    
# Store paths taken from the config: the first is the HDF store the data is read from,
# the second is the HDF store the cleaned frame is written to at the end of the notebook.
path_to_hdf_datastore = cfg['path_to_hdf_datastore']
path_to_clean_hdf_datastore = cfg['path_to_clean_hdf_datastore']

In [2]:
# Read Data
# Load the interactions table from the HDF store directly into the working frame
df = pd.read_hdf(path_to_hdf_datastore, cfg['interactions_file'])

# Preview the first rows
df.head()

Unnamed: 0,CHANNEL,CONTACT_COUNTRY,CONTACT_REASON_LVL1_DESC,CONTACT_REASON_LVL2_DESC,CONTACT_REASON_LVL3_DESC,CONTACT_TYPE,CREATED_TO_CLOSED_DAYS,CREATED_TO_INITIAL_RESPONSE_DAYS,CUSTOMER_CLASSIFICATION_PRODUCT,CUSTOMER_CLASSIFICATION_ROLE,...,INCIDENT_SYSTEM,NUMBER_OF_RESPONSES,OWNER_ID,OWNER_NAME,RESOLUTION_CODE_LVL1_DESC,RESOLUTION_CODE_LVL2_DESC,RESOLUTION_CODE_LVL3_DESC,SOURCE_LVL1_DESC,SOURCE_LVL2_DESC,STATUS
0,Phone,Afghanistan,Shipping and Delivery,Delivery status,No Value,No Value,0.0,,Book,Print,...,No Value,0,HPCS Books REPH 1LS,Maria Remedios Del Rosario,Resolution Code Not Found,No Value,No Value,CX Console,Contact Editor,Solved
1,Web,No Value,Author Profile Correction,Reprofile,No Value,No Value,21.6356,21.6356,Scopus,Content,...,No Value,1,AFT,DIvya Neelakantachar,Resolution Code Not Found,No Value,No Value,Public API,Connect Web Services - SOAP,Solved
2,CSS Email,United States,Login & Account,Retrieve username or reset password,No Value,No Value,5.89745,0.120428,Journal,Researcher,...,EES,3,RS Global SPI 1LS,Dain Cain Corminal,Used KB article,No Value,No Value,Utilities,Techmail - Service Mailbox,Solved
3,CSS Email,No Value,No Contact Reason,No Contact Reason,No Contact Reason,No Value,0.0,,No Value,Organization,...,No Value,0,No Agent,No Agent,No Resolution Code,No Resolution Code,No Resolution Code,Utilities,Techmail - Service Mailbox,Solved
4,Web,United States,Training,Resources,No Value,No Value,0.951667,0.161644,DirectCourse,Organization,...,No Value,2,CLCS STL 2LS,Jennifer Faron,Helped with registration,No Value,No Value,End-User pages,End-User Connect,Solved


# Replace strings indicating missing data with null

In [3]:
# Placeholder strings that stand for "no data" in the source
missing_markers = [
    'nan', 'N.A', 'N.A.', 'NaN', 'Nan', '00-00-00', '0-00-00', 'Unknown',
    'No Value', 'No Agent', 'No Contact Reason', 'No Customer Classification',
    'No Resolution Code',
]

# Turn every placeholder into a real null; df is modified in place
df.replace(to_replace=missing_markers, value=np.nan, inplace=True)
df

Unnamed: 0,CHANNEL,CONTACT_COUNTRY,CONTACT_REASON_LVL1_DESC,CONTACT_REASON_LVL2_DESC,CONTACT_REASON_LVL3_DESC,CONTACT_TYPE,CREATED_TO_CLOSED_DAYS,CREATED_TO_INITIAL_RESPONSE_DAYS,CUSTOMER_CLASSIFICATION_PRODUCT,CUSTOMER_CLASSIFICATION_ROLE,...,INCIDENT_SYSTEM,NUMBER_OF_RESPONSES,OWNER_ID,OWNER_NAME,RESOLUTION_CODE_LVL1_DESC,RESOLUTION_CODE_LVL2_DESC,RESOLUTION_CODE_LVL3_DESC,SOURCE_LVL1_DESC,SOURCE_LVL2_DESC,STATUS
0,Phone,Afghanistan,Shipping and Delivery,Delivery status,,,0.0,,Book,Print,...,,0,HPCS Books REPH 1LS,Maria Remedios Del Rosario,Resolution Code Not Found,,,CX Console,Contact Editor,Solved
1,Web,,Author Profile Correction,Reprofile,,,21.6356,21.6356,Scopus,Content,...,,1,AFT,DIvya Neelakantachar,Resolution Code Not Found,,,Public API,Connect Web Services - SOAP,Solved
2,CSS Email,United States,Login & Account,Retrieve username or reset password,,,5.89745,0.120428,Journal,Researcher,...,EES,3,RS Global SPI 1LS,Dain Cain Corminal,Used KB article,,,Utilities,Techmail - Service Mailbox,Solved
3,CSS Email,,,,,,0.0,,,Organization,...,,0,,,,,,Utilities,Techmail - Service Mailbox,Solved
4,Web,United States,Training,Resources,,,0.951667,0.161644,DirectCourse,Organization,...,,2,CLCS STL 2LS,Jennifer Faron,Helped with registration,,,End-User pages,End-User Connect,Solved
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2402768,,,Author Profile Correction,Reprofile,,,3.52262,,Scopus,Content,...,,0,,,AFW tool,,,Public API,Connect Web Services - SOAP,Solved
2402769,CSS Email,,Review,Check paper status,,,0.6078939999999999,0.6078939999999999,Journal,Researcher,...,EES,1,GJP Journal Managers Chennai,Sudharshan Raj,Used KB article,,,Utilities,Techmail - Service Mailbox,Solved
2402770,Web,,Using the product,Error message,,,3.68934,2.74684,Mendeley Reference Manager,Organization,...,,1,RPCS REPH 1LS,Norelle Faye Tiri,Provided facilitator information,,,End-User pages,End-User Connect,Solved
2402771,Web,,Review,Report a concern,,,9.42491,9.42491,Journal,Researcher,...,Evise,1,RS Global SPI 1LS,Ivan Christian Ardio,Used KB article,,,End-User pages,End-User Connect,Solved


In [4]:
# Print the frequency table of every column, in column order
for _, column_values in df.items():
    print(column_values.value_counts())

CSS Email       1170666
Web              574002
Phone            237005
Chat             154985
Email            134231
Callback          16957
MA Email            957
Post                822
Facebook             89
Social Media         42
Fax                  35
Twitter              26
Name: CHANNEL, dtype: int64
United States                 366531
India                          75080
Italy                          59659
United Kingdom                 55418
Philippines                    39480
                               ...  
Lesotho                            1
Heard and McDonald Islands         1
Djibouti                           1
Mozambique                         1
Eritrea                            1
Name: CONTACT_COUNTRY, Length: 230, dtype: int64
Review                       409658
Author Profile Correction    390096
Ordering                     147432
Access                       125755
Submission                   120303
                              ...  
Journals Pro

Didn't use KB article                  583076
AFW tool                               333023
Used KB article                        151245
Provided product information/doc        85856
Sent to Editor to resolve               71442
                                        ...  
Updated user email address                  1
Retrieve username or reset password         1
Received agreement                          1
Registered for a free account               1
Used knowledgebase article                  1
Name: RESOLUTION_CODE_LVL1_DESC, Length: 344, dtype: int64
Series([], Name: RESOLUTION_CODE_LVL2_DESC, dtype: int64)
Series([], Name: RESOLUTION_CODE_LVL3_DESC, dtype: int64)
Utilities         1174309
Public API         524284
CX Console         396643
End-User pages     304927
Process Flow         2061
Web Console           539
Import Tools            6
Name: SOURCE_LVL1_DESC, dtype: int64
Techmail - Service Mailbox     1174309
Connect Web Services - SOAP     404272
End-User Connect      

### Summary of the dataframe

In [5]:
# Show the per-column summary (default order: highest share of missing values first).
# The option context lifts the row/column display limits for this output only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(get_data_frame_summmary(df))

Unnamed: 0,index,unique_values,unique_counts,data_type,percent_missing
23,RESOLUTION_CODE_LVL3_DESC,[[nan]],1,float64,100.0
22,RESOLUTION_CODE_LVL2_DESC,[[nan]],1,float64,100.0
4,CONTACT_REASON_LVL3_DESC,"[[nan, Force close CPC resupply, Reset ABP wor...",161,object,99.8
5,CONTACT_TYPE,"[[nan, Individual (B2C), Employee of Academic ...",14,object,75.96
17,INCIDENT_SYSTEM,"[[nan, EES, Evise, Proof Central, PTS, Not App...",29,object,63.24
1,CONTACT_COUNTRY,"[[Afghanistan, nan, United States, Singapore, ...",231,object,58.29
7,CREATED_TO_INITIAL_RESPONSE_DAYS,"[[nan, 21.6356, 0.120428, 0.161644, 0.10648099...",336031,object,53.8
3,CONTACT_REASON_LVL2_DESC,"[[Delivery status, Reprofile, Retrieve usernam...",319,object,18.1
2,CONTACT_REASON_LVL1_DESC,"[[Shipping and Delivery, Author Profile Correc...",127,object,16.66
10,CUSTOMER_CLASSIFICATION_TYPE,"[[Purchaser, Other (customer), Reviewer, nan, ...",64,object,16.08


# Identify Missing Data

Missing values affect the performance of machine learning models, so it is useful to identify them and then drop or impute them before modelling.

We will quantify the missing data and drop any columns and rows whose share of missing values reaches a given threshold. The threshold is set to 70%, so only columns and rows with less than 70% missing data are kept.

In [6]:
# Maximum tolerated share of missing values (0.7 = 70%)
threshold = 0.7

# Share of missing values per column, computed once and reused below
column_missing_share = df.isnull().mean()

# Keep only the columns whose missing share is below the threshold
columns_to_keep = df.columns[column_missing_share < threshold]
columns_to_drop = df.columns.difference(columns_to_keep)

print('-------------------- Dropping Columns with missing data --------------------')
print(f'Following {len(columns_to_drop)} columns have missing data over the threshold and will be removed')
print(columns_to_drop)

df = df[columns_to_keep]

# Share of missing values per row, evaluated on the remaining columns
row_missing_share = df.isnull().mean(axis=1)
rows_to_keep = row_missing_share < threshold

print('-------------------- Dropping rows with missing data --------------------')
# Report exactly the rows that the filter below removes, so that the printed
# count and the resulting frame always agree (including rows at the threshold).
print(f' There are {int((~rows_to_keep).sum())} rows with missing data over the threshold')

# Dropping rows with missing value rate at or above the threshold
df = df.loc[rows_to_keep]

# manually remove borderline case (the list is currently empty, so nothing is dropped)
df = df.drop(columns=[])

-------------------- Dropping Columns with missing data --------------------
Following 4 columns have missing data over the threshold and will be removed
Index(['CONTACT_REASON_LVL3_DESC', 'CONTACT_TYPE', 'RESOLUTION_CODE_LVL2_DESC',
       'RESOLUTION_CODE_LVL3_DESC'],
      dtype='object')
-------------------- Dropping rows with missing data --------------------
 There are 0 rows with missing data over the threshold


In [7]:
# Show the per-column summary again, now for the frame left after the missing-data filter.
# The option context lifts the row/column display limits for this output only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(get_data_frame_summmary(df))

Unnamed: 0,index,unique_values,unique_counts,data_type,percent_missing
15,INCIDENT_SYSTEM,"[[nan, EES, Evise, Proof Central, PTS, Not App...",29,object,63.24
1,CONTACT_COUNTRY,"[[Afghanistan, nan, United States, Singapore, ...",231,object,58.29
5,CREATED_TO_INITIAL_RESPONSE_DAYS,"[[nan, 21.6356, 0.120428, 0.161644, 0.10648099...",336031,object,53.8
3,CONTACT_REASON_LVL2_DESC,"[[Delivery status, Reprofile, Retrieve usernam...",319,object,18.1
2,CONTACT_REASON_LVL1_DESC,"[[Shipping and Delivery, Author Profile Correc...",127,object,16.66
8,CUSTOMER_CLASSIFICATION_TYPE,"[[Purchaser, Other (customer), Reviewer, nan, ...",64,object,16.08
19,RESOLUTION_CODE_LVL1_DESC,"[[Resolution Code Not Found, Used KB article, ...",345,object,15.79
18,OWNER_NAME,"[[Maria Remedios Del Rosario, DIvya Neelakanta...",1864,object,14.7
17,OWNER_ID,"[[HPCS Books REPH 1LS, AFT, RS Global SPI 1LS,...",127,object,14.7
6,CUSTOMER_CLASSIFICATION_PRODUCT,"[[Book, Scopus, Journal, nan, DirectCourse, Ev...",229,object,10.59


### Data Imputation

We can impute missing data with meaningful values so that model development has good-quality data to work with.

We can impute categorical variables with the most frequently occurring value, and numerical variables with 0, the mean or the median, depending on the variable context.

INCIDENT_SYSTEM - Impute with most frequent

CONTACT_COUNTRY - Impute with most frequent

CONTACT_REASON_LVL2_DESC - Impute with most frequent

CONTACT_REASON_LVL1_DESC - Impute with most frequent

CUSTOMER_CLASSIFICATION_TYPE - Impute with most frequent

CUSTOMER_CLASSIFICATION_PRODUCT - Impute with most frequent

CREATED_TO_CLOSED_DAYS - Impute with median 

CHANNEL - Impute with most frequent

CUSTOMER_CLASSIFICATION_ROLE - Impute with most frequent


In [8]:
# Columns whose gaps are filled with the most frequent value of the column
most_frequent_imputed = [
    'INCIDENT_SYSTEM',
    'CONTACT_COUNTRY',
    'CONTACT_REASON_LVL2_DESC',
    'CONTACT_REASON_LVL1_DESC',
    'CUSTOMER_CLASSIFICATION_TYPE',
    'CUSTOMER_CLASSIFICATION_PRODUCT',
    'CHANNEL',
    'CUSTOMER_CLASSIFICATION_ROLE',
]

for column in most_frequent_imputed:
    # The filled column is assigned back to df. Calling fillna(inplace=True) on
    # df[column] acts on a selected Series and is not guaranteed to update df
    # itself (chained assignment).
    most_frequent_value = df[column].value_counts().idxmax()
    df[column] = df[column].fillna(most_frequent_value)

# Gaps in the time-to-close column are filled with the column median
# NOTE(review): median() needs numeric content - confirm the dtype of this column.
df['CREATED_TO_CLOSED_DAYS'] = df['CREATED_TO_CLOSED_DAYS'].fillna(
    df['CREATED_TO_CLOSED_DAYS'].median())

# Handling Outliers

Extreme values can skew the data distribution and thus affect model development. We identify outliers in numeric variables and handle them by removing or capping them.

### Outlier Detection with Percentiles

In [9]:
#Dropping the outlier rows with Percentiles
# INCIDENT_ID and NUMBER_OF_RESPONSES are never used for outlier filtering.
# NOTE(review): drop_outliers only looks at numeric-dtype columns; the saved output of the
# scaling cell reports 0 numeric features, so this call may not drop anything - confirm dtypes.

df = drop_outliers(df, exclude = ['INCIDENT_ID', 'NUMBER_OF_RESPONSES'])

# Binning

The main motivation of binning is to make the model more robust and prevent overfitting, however, it has a cost to the performance. Every time you bin something, you sacrifice information and make your data more regularized

The trade-off between performance and overfitting is the key point of the binning process


For Categorical variables the labels with low frequencies probably affect the robustness of statistical models negatively. Thus, assigning a general category to these less frequent values helps to keep the robustness of the model.

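It is a good option to merge labels that occur only rarely into a new category such as “Other”. The code below does this for labels that occur fewer than 10 times, within the columns that have fewer than 100 distinct labels.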

In [10]:
# Per-column summary ordered by number of distinct values, smallest first
binning_summary = get_data_frame_summmary(df, 
                                    sort_by='unique_counts', 
                                    ascending=True)   

### Columns with less than 100 unique categories

In [11]:
# Show the summary rows of columns with fewer than 100 distinct values, without display limits
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(binning_summary[binning_summary['unique_counts'] < 100])
    

Unnamed: 0,index,unique_values,unique_counts,data_type,percent_missing
14,INCIDENT_REOPENED,"[[No, Yes]]",2,object,0.0
10,INCIDENT_AUTO_SOLVED,"[[No, Yes]]",2,object,0.0
22,STATUS,"[[Solved, Waiting, Unresolved]]",3,object,0.0
20,SOURCE_LVL1_DESC,"[[CX Console, Public API, Utilities, End-User ...",8,object,0.0
0,CHANNEL,"[[Phone, Web, CSS Email, Email, Chat, Callback...",12,object,0.0
21,SOURCE_LVL2_DESC,"[[Contact Editor, Connect Web Services - SOAP,...",15,object,0.0
7,CUSTOMER_CLASSIFICATION_ROLE,"[[Print, Content, Researcher, Organization, AG...",17,object,0.0
15,INCIDENT_SYSTEM,"[[EES, Evise, Proof Central, PTS, Not Applicab...",28,object,0.0
16,NUMBER_OF_RESPONSES,"[[0, 1, 3, 2, 4, 7, 5, 6, 11, 8, 9, 12, 22, 16...",39,object,0.0
8,CUSTOMER_CLASSIFICATION_TYPE,"[[Purchaser, Other (customer), Reviewer, Admin...",63,object,0.0


In [12]:
# Names of the columns with fewer than 100 distinct values ('index' holds the column names)
columns_to_bin = binning_summary.loc[binning_summary['unique_counts'] < 100, 'index'].to_list()

In [13]:
# For every column listed in columns_to_bin, replace each label that occurs fewer than
# 10 times with 'Other'; all other columns are passed through unchanged.
# NOTE(review): 10 is the minimum label frequency; the 100 used when building
# columns_to_bin is a different threshold (distinct labels per column) - confirm both values.
# NOTE(review): NUMBER_OF_RESPONSES is part of columns_to_bin per the printed list, so the
# string 'Other' ends up mixed with its count values - confirm this is intended.
df = df.apply(lambda x: x.mask(x.map(x.value_counts())<10, 'Other') if x.name in columns_to_bin else x)

### After Binning - Columns with less than 100 unique categories

In [14]:
# Recompute the summary after binning and show the columns with fewer than 100 distinct values
binning_summary = get_data_frame_summmary(df, 
                                    sort_by='unique_counts', 
                                    ascending=True)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(binning_summary[binning_summary['unique_counts'] < 100])

Unnamed: 0,index,unique_values,unique_counts,data_type,percent_missing
14,INCIDENT_REOPENED,"[[No, Yes]]",2,object,0.0
10,INCIDENT_AUTO_SOLVED,"[[No, Yes]]",2,object,0.0
22,STATUS,"[[Solved, Waiting, Unresolved]]",3,object,0.0
20,SOURCE_LVL1_DESC,"[[CX Console, Public API, Utilities, End-User ...",8,object,0.0
0,CHANNEL,"[[Phone, Web, CSS Email, Email, Chat, Callback...",12,object,0.0
21,SOURCE_LVL2_DESC,"[[Contact Editor, Connect Web Services - SOAP,...",13,object,0.0
7,CUSTOMER_CLASSIFICATION_ROLE,"[[Print, Content, Researcher, Organization, AG...",15,object,0.0
15,INCIDENT_SYSTEM,"[[EES, Evise, Proof Central, PTS, Not Applicab...",26,object,0.0
16,NUMBER_OF_RESPONSES,"[[0, 1, 3, 2, 4, 7, 5, 6, 11, 8, 9, 12, Other,...",26,object,0.0
8,CUSTOMER_CLASSIFICATION_TYPE,"[[Purchaser, Other (customer), Reviewer, Admin...",45,object,0.0


## Log Transform

Logarithm transformation (or log transform) is one of the most commonly used mathematical transformations

It helps to handle skewed data and after transformation, the distribution becomes more approximate to normal.
In most cases the order of magnitude of the values varies across the range of the data; the log transform evens out these magnitude differences.

It also decreases the effect of the outliers, due to the normalization of magnitude differences and the model become more robust.

Important note: the data you apply the log transform to must contain only positive values, otherwise you receive an error. You can also add 1 to your data before transforming it, i.e. use $\log(x + 1)$; for non-negative data this ensures that the output of the transformation is non-negative.


In [15]:
# No Log transform applied 

## Scaling

In most cases, the numerical features of the dataset do not have a certain range and they differ from each other. Scaling solves this problem. The continuous features become identical in terms of the range, after a scaling process. This process is not mandatory for many algorithms, but it might be still nice to apply. 

### Normalization

Normalization (or min-max normalization) scales all values into a fixed range between 0 and 1: $x_{norm} = \frac{x - x_{min}}{x_{max} - x_{min}}$. This transformation does not change the shape of the feature's distribution, but because the standard deviation shrinks, the effect of outliers increases. Therefore, it is recommended to handle outliers before normalization.

### Standardization
Standardization (or z-score normalization) scales the values while taking the standard deviation into account. If the standard deviations of the features differ, their ranges will differ from each other as well. This reduces the effect of outliers in the features.
In the following formula of standardization, the mean is shown as μ and the standard deviation is shown as σ:

$$z = \frac{x - \mu}{\sigma}$$

In [16]:
# Scale all numeric-dtype columns with the default method ('standardize'), overwriting them.
# The saved output below reports 0 features, i.e. no numeric-dtype column was found in that run.
df = scale_numeric_features(df, inplace=True)

********************* - Scaling following 0 features - **********************


## One-hot encoding

This method spreads the values in a column to multiple flag columns and assigns 0 or 1 to them. These binary values express the relationship between grouped and encoded column.

This method changes your categorical data, which is challenging to understand for algorithms, to a numerical format

If you have N distinct values in the column, it is enough to map them to N-1, as the missing value can be deduced from the other columns



In [17]:
# List the low-cardinality columns selected earlier; they are reused as the columns to encode
print(columns_to_bin)

['INCIDENT_REOPENED', 'INCIDENT_AUTO_SOLVED', 'STATUS', 'SOURCE_LVL1_DESC', 'CHANNEL', 'SOURCE_LVL2_DESC', 'CUSTOMER_CLASSIFICATION_ROLE', 'INCIDENT_SYSTEM', 'NUMBER_OF_RESPONSES', 'CUSTOMER_CLASSIFICATION_TYPE']


In [18]:
# Specify Columns to encode
# columns_to_exclude is a manual override list (empty here), so every entry of
# columns_to_bin ends up in columns_to_encode.
columns_to_exclude = []
columns_to_include = columns_to_bin
columns_to_encode = remove_items(columns_to_include, columns_to_exclude)
columns_to_encode

['INCIDENT_REOPENED',
 'INCIDENT_AUTO_SOLVED',
 'STATUS',
 'SOURCE_LVL1_DESC',
 'CHANNEL',
 'SOURCE_LVL2_DESC',
 'CUSTOMER_CLASSIFICATION_ROLE',
 'INCIDENT_SYSTEM',
 'NUMBER_OF_RESPONSES',
 'CUSTOMER_CLASSIFICATION_TYPE']

In [19]:
# Replace each selected column by its indicator (dummy) columns.
# NOTE(review): get_dummies is called without a prefix, so the new columns are named after
# the raw labels; join() appends '_<column>' only when a name already exists in df. The
# result mixes names such as 'No' and 'No_INCIDENT_AUTO_SOLVED' - consider prefix=column.
for column in columns_to_encode:
    encoded_columns = pd.get_dummies(df[column])
    print(f'Encoding collumns : {column} to {len(encoded_columns.columns)} new encoded columns')
    df = df.join(encoded_columns, rsuffix='_'+column).drop(column, axis=1)

Encoding collumns : INCIDENT_REOPENED to 2 new encoded columns
Encoding collumns : INCIDENT_AUTO_SOLVED to 2 new encoded columns
Encoding collumns : STATUS to 3 new encoded columns
Encoding collumns : SOURCE_LVL1_DESC to 7 new encoded columns
Encoding collumns : CHANNEL to 12 new encoded columns
Encoding collumns : SOURCE_LVL2_DESC to 12 new encoded columns
Encoding collumns : CUSTOMER_CLASSIFICATION_ROLE to 15 new encoded columns
Encoding collumns : INCIDENT_SYSTEM to 26 new encoded columns
Encoding collumns : NUMBER_OF_RESPONSES to 26 new encoded columns
Encoding collumns : CUSTOMER_CLASSIFICATION_TYPE to 45 new encoded columns


In [20]:
# (rows, columns) of the frame after one-hot encoding
df.shape

(2402773, 163)

In [21]:
# Preview the first rows of the encoded frame
df.head()

Unnamed: 0,CONTACT_COUNTRY,CONTACT_REASON_LVL1_DESC,CONTACT_REASON_LVL2_DESC,CREATED_TO_CLOSED_DAYS,CREATED_TO_INITIAL_RESPONSE_DAYS,CUSTOMER_CLASSIFICATION_PRODUCT,ECR_ID,INCIDENT_CLOSED_DATETIME,INCIDENT_CREATED_DATETIME,INCIDENT_ID,...,Sales,Society employee,Society member,Student,Supplier,Trade/retailer,Trade/wholesaler,User,Web Importer,bepress_CUSTOMER_CLASSIFICATION_TYPE
0,Afghanistan,Shipping and Delivery,Delivery status,0.0,,Book,ECR-976446,2019-05-22 18:34:20.000,2019-05-22 18:34:20.000,8997235,...,0,0,0,0,0,0,0,0,0,0
1,United States,Author Profile Correction,Reprofile,21.6356,21.6356,Scopus,ECR-246394,2018-06-07 04:59:26.000,2018-05-16 13:44:10.000,5282566,...,0,0,0,0,0,0,0,0,0,0
2,United States,Login & Account,Retrieve username or reset password,5.89745,0.120428,Journal,ECR-1196,2018-05-08 15:44:25.000,2018-05-02 18:12:05.000,5304696,...,0,0,0,0,0,0,0,0,0,0
3,United States,Review,Reprofile,0.0,,Journal,ECR-1187218,2018-05-21 13:31:38.000,2018-05-21 13:31:38.000,5342999,...,0,0,0,0,0,0,0,0,0,0
4,United States,Training,Resources,0.951667,0.161644,DirectCourse,ECR-1046974,2018-05-09 17:57:07.000,2018-05-08 19:06:43.000,5296622,...,0,0,0,0,0,0,0,0,0,0


## Transform Date columns

Recency and Frequency Variables and length / duration.


In [22]:
# Names of the columns that hold timestamps; they are converted to datetime in the next cell

date_columns = ['INCIDENT_CREATED_DATETIME', 'INCIDENT_CLOSED_DATETIME']

In [23]:
# Parse each listed column into pandas datetime values, replacing the original column
for column in date_columns:
    df[column] = pd.to_datetime(df[column])

In [24]:
# Calculate the new variable incident_length_to_close: the rounded number of months
# between incident creation and incident closure
df['incident_length_to_close'] = months_between_columns(df['INCIDENT_CLOSED_DATETIME'],df['INCIDENT_CREATED_DATETIME'])

In [25]:
 # Preview the first rows, now including incident_length_to_close
 df.head()

Unnamed: 0,CONTACT_COUNTRY,CONTACT_REASON_LVL1_DESC,CONTACT_REASON_LVL2_DESC,CREATED_TO_CLOSED_DAYS,CREATED_TO_INITIAL_RESPONSE_DAYS,CUSTOMER_CLASSIFICATION_PRODUCT,ECR_ID,INCIDENT_CLOSED_DATETIME,INCIDENT_CREATED_DATETIME,INCIDENT_ID,...,Society employee,Society member,Student,Supplier,Trade/retailer,Trade/wholesaler,User,Web Importer,bepress_CUSTOMER_CLASSIFICATION_TYPE,incident_length_to_close
0,Afghanistan,Shipping and Delivery,Delivery status,0.0,,Book,ECR-976446,2019-05-22 18:34:20,2019-05-22 18:34:20,8997235,...,0,0,0,0,0,0,0,0,0,0.0
1,United States,Author Profile Correction,Reprofile,21.6356,21.6356,Scopus,ECR-246394,2018-06-07 04:59:26,2018-05-16 13:44:10,5282566,...,0,0,0,0,0,0,0,0,0,1.0
2,United States,Login & Account,Retrieve username or reset password,5.89745,0.120428,Journal,ECR-1196,2018-05-08 15:44:25,2018-05-02 18:12:05,5304696,...,0,0,0,0,0,0,0,0,0,0.0
3,United States,Review,Reprofile,0.0,,Journal,ECR-1187218,2018-05-21 13:31:38,2018-05-21 13:31:38,5342999,...,0,0,0,0,0,0,0,0,0,0.0
4,United States,Training,Resources,0.951667,0.161644,DirectCourse,ECR-1046974,2018-05-09 17:57:07,2018-05-08 19:06:43,5296622,...,0,0,0,0,0,0,0,0,0,0.0


## Remove unused and/or redundant Features

Unused features are those that don’t make sense to pass into our machine learning algorithms, such as ID columns, features that wouldn't be available at the time of prediction, and other text descriptions.

Redundant features would typically be those that have been replaced by other features that you’ve added during feature engineering


In [26]:
# Columns that the following drop() call removes from df
columns_to_remove = ['CONTACT_REASON_LVL2_DESC', 'OWNER_NAME', 'RESOLUTION_CODE_LVL1_DESC', 
                     'CUSTOMER_CLASSIFICATION_PRODUCT', 'INCIDENT_CLOSED_DATETIME', 'INCIDENT_CREATED_DATETIME']

In [27]:
# Drop the columns listed in columns_to_remove
df = df.drop(columns_to_remove, axis=1)

In [28]:
# Preview the first rows after the column removal
df.head()

Unnamed: 0,CONTACT_COUNTRY,CONTACT_REASON_LVL1_DESC,CREATED_TO_CLOSED_DAYS,CREATED_TO_INITIAL_RESPONSE_DAYS,ECR_ID,INCIDENT_ID,OWNER_ID,No,Yes,No_INCIDENT_AUTO_SOLVED,...,Society employee,Society member,Student,Supplier,Trade/retailer,Trade/wholesaler,User,Web Importer,bepress_CUSTOMER_CLASSIFICATION_TYPE,incident_length_to_close
0,Afghanistan,Shipping and Delivery,0.0,,ECR-976446,8997235,HPCS Books REPH 1LS,1,0,1,...,0,0,0,0,0,0,0,0,0,0.0
1,United States,Author Profile Correction,21.6356,21.6356,ECR-246394,5282566,AFT,1,0,1,...,0,0,0,0,0,0,0,0,0,1.0
2,United States,Login & Account,5.89745,0.120428,ECR-1196,5304696,RS Global SPI 1LS,1,0,1,...,0,0,0,0,0,0,0,0,0,0.0
3,United States,Review,0.0,,ECR-1187218,5342999,,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,United States,Training,0.951667,0.161644,ECR-1046974,5296622,CLCS STL 2LS,1,0,1,...,0,0,0,0,0,0,0,0,0,0.0


Save clean data to hdf store

In [30]:
# Cast these columns to str before writing.
# Note: astype(str) turns missing values into the literal string 'nan'.
text_columns = ['CONTACT_COUNTRY', 'CREATED_TO_CLOSED_DAYS', 'ECR_ID', 'INCIDENT_ID', 'OWNER_ID']
# Replace the columns outright; assigning through .loc would try to write the
# strings into the existing columns and depends on their current dtype.
df[text_columns] = df[text_columns].astype(str)

# The context manager closes the store even if put() raises
with HDFStore(path_to_clean_hdf_datastore) as store:
    store.put(key=cfg['interactions_file']+"_clean", value=df)
