# Interaction Information in Aggregate

The purpose of this script is to count the total number of emails, calls and meetings
for the duration of the relationship

Produces client_note_expanded.p and client_note_features.p

Run this script before running feature.ipynb

In [None]:
#Import libraries
%matplotlib inline
import os
import sys
import pickle
import datetime as dt
import logging

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import mysql.connector

In [None]:
# Create database engine
# All connection settings come from the environment; a missing variable
# raises KeyError here rather than failing later with a confusing DB error.
dbname = os.environ["DBNAME"]
uname = os.environ["UNAME"]
passwd = os.environ["PASSWD"]
portnum = os.environ["PORTNUM"]

# The URL uses the same user/password pair as the connector call below
# (the user name must not be placed in the password slot).
# NOTE(review): a password containing URL-reserved characters (e.g. '@', '/')
# would need to be percent-escaped before being embedded in the URL.
engine = create_engine(
    'mysql+mysqlconnector://{user}:{password}@localhost:{port}/{db}'.format(
        user=uname, password=passwd, port=portnum, db=dbname),
    echo=False)
# Connect to database
# NOTE(review): this connection does not pass PORTNUM and so uses the
# connector's default port -- confirm that is intended.
conn = mysql.connector.connect(
         user=uname,
         password=passwd,
         host='localhost',
         database=dbname)

# Load data


In [None]:
# Load the client_note table, restricted to the three columns used below.
# Columns available in client_note:
#   id, note, subject, created_by_employee_id, created_date_time,
#   modified_date_time, client_id, file_path, interaction_type_id,
#   external_system_id, source_id, thread_id
# (source_id and thread_id are not selected.)
sql_query = """SELECT client_id, interaction_type_id, created_date_time FROM client_note;"""
dfClientNote = pd.read_sql_query(sql=sql_query, con=conn)
dfClientNote.head()


# Clean data

In [None]:
# Keep only the interaction types analysed here: 3 (call), 4 (email), 5 (meeting).
dfClientNote = dfClientNote[dfClientNote['interaction_type_id'].isin([4, 3, 5])]
dfClientNote.head()

In [None]:
# Print column dtypes and non-null counts of the filtered notes.
dfClientNote.info()

In [None]:
# Summary statistics of the filtered notes.
dfClientNote.describe()

# Initial exploration

In [None]:
# Report the total number of notes and the number per interaction type.
# print(...) with a single argument produces the same output on Python 2
# and Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print('Total')
print(len(dfClientNote))
print('email')
dfClientNoteEmail = dfClientNote[dfClientNote.interaction_type_id == 4]
print(len(dfClientNoteEmail))
print('call')
dfClientNoteCall = dfClientNote[dfClientNote.interaction_type_id == 3]
print(len(dfClientNoteCall))
print('meeting')
dfClientNoteMeeting = dfClientNote[dfClientNote.interaction_type_id == 5]
print(len(dfClientNoteMeeting))
# Email 91%, Calls 4%, Meetings 0% (other categories not informative)

In [None]:
# Client notes basic info: number of distinct clients that have notes.
len(pd.unique(dfClientNote['client_id']))
# notes about 20,378 clients

# Add date, year, month and day info

In [None]:
# Convert the note timestamp to datetime and derive calendar columns
# (date, year, month, day, year-month) from it.
#
# No format string is passed: the second positional parameter of
# pd.to_datetime is `errors`, not `format`, so a positional format string
# would be misinterpreted. The column is presumably already datetime-typed
# as returned by the SQL query -- TODO confirm.
sDateTime = pd.to_datetime(dfClientNote['created_date_time'])
year = sDateTime.dt.year
month = sDateTime.dt.month
day = sDateTime.dt.day
# Vectorised accessors replace the per-row strftime/strptime round-trip,
# which broke on timestamps whose string form is not '%Y-%m-%d %H:%M:%S'.
yearmonth = sDateTime.dt.strftime('%Y-%m')
date = sDateTime.dt.date
# Chained single-column assign() calls keep the column order
# date, year, month, day, yearmonth on every pandas/Python version.
dfClientNote = (dfClientNote
                .assign(date=date)
                .assign(year=year)
                .assign(month=month)
                .assign(day=day)
                .assign(yearmonth=yearmonth))
dfClientNote.head()

In [None]:
# Persist the expanded notes table. The context manager guarantees the file
# is flushed and closed even if pickling raises.
with open("client_note_expanded.p", "wb") as expanded_file:
    pickle.dump(dfClientNote, expanded_file)

# Determine Relationship Length


In [None]:
# Group the notes by client; the per-client aggregates below are computed on these groups.
dfGrouped = dfClientNote.groupby(['client_id'])

In [None]:
#Determine Relationship Length
def DetermineRelationshipLength(df):
    """Return the number of whole days between the first and last note.

    `df` holds the notes of one client and must contain a
    'created_date_time' column. The result is a one-element Series labelled
    'num_days'; it is 0 when there are fewer than two notes.
    """
    if len(df) <= 1:
        return pd.Series([0], index=['num_days'])

    # Order the timestamps so the endpoints are the earliest and latest note.
    timestamps = df['created_date_time'].sort_values()
    span = timestamps.iloc[-1] - timestamps.iloc[0]
    return pd.Series([span.days], index=['num_days'])

In [None]:
# Relationship length per client; reset_index() moves the group key out of the index.
dfLength = dfGrouped.apply(DetermineRelationshipLength).reset_index()
dfLength.head()

# Calculate client note features

In [None]:
def DetermineClientNoteFeatures(df):
    numInteractions = int(len(df))
    if numInteractions > 1:
        try:
            df = df.sort_values('created_date_time')
            numEmails = len(df[df.interaction_type_id == 4])
            numCalls = len(df[df.interaction_type_id == 3])
            numMeetings = len(df[df.interaction_type_id == 5])
            df['diff'] = abs(df['created_date_time'] - df['created_date_time'].shift(-1))
            df = df[df['diff'].notnull()]
            df = df.reset_index()
            df['diff'] = df['diff'].astype('timedelta64[D]')
            meanGap = df['diff'].sum() / df['diff'].count()
            maxGap = max(df['diff'])
            minGap = min(df['diff'])            
            s = pd.Series([numInteractions,numEmails,numCalls,numMeetings,meanGap,maxGap,minGap]) #length,
            s = s.rename({0:'num_interactions',1:'num_emails',2:'num_calls',3:'num_meetings',4:'mean_gap',5:'max_gap',6:'min_gap'}) #length
            return s
        except Exception as err:
            logging.exception(err)
    return

In [None]:
# Per-client note features; reset_index() moves the group key out of the index.
dfClientNoteFeatures = dfGrouped.apply(DetermineClientNoteFeatures).reset_index()
dfClientNoteFeatures.head()

# Putting it together

In [None]:
# Preview the relationship-length table before merging.
dfLength.head()

In [None]:
# Preview the note-feature table before merging.
dfClientNoteFeatures.head()

In [None]:
# Summary statistics of the relationship-length table.
dfLength.describe()

In [None]:
# Summary statistics of the note-feature table.
dfClientNoteFeatures.describe()

In [None]:
# Inner-join relationship length and note features on the client id.
dfMerge = dfLength.merge(
    dfClientNoteFeatures,
    how='inner',
    left_on='client_id',
    right_on='client_id',
)
dfMerge.head()

In [None]:
# Interactions per day of relationship, overall and per interaction type.
# NOTE(review): num_days is 0 when a client's first and last note are less
# than a day apart, which makes these ratios inf; a later dropna() does not
# remove inf values -- confirm this is intended.
dfMerge['frequency'] = dfMerge['num_interactions'].div(dfMerge['num_days'])
dfMerge['email_frequency'] = dfMerge['num_emails'].div(dfMerge['num_days'])
dfMerge['call_frequency'] = dfMerge['num_calls'].div(dfMerge['num_days'])
dfMerge['meeting_frequency'] = dfMerge['num_meetings'].div(dfMerge['num_days'])

In [None]:
# Drop every client row that has a missing value in any column.
dfClientNoteFinal = dfMerge.dropna()

In [None]:
# Cast the whole-number features to int.
# DataFrame.astype with a column->dtype mapping returns a new frame with the
# requested dtypes. Assigning through .loc[:, col] can instead write the
# values back into the existing float column (recent pandas sets in place),
# leaving the dtype unchanged.
dfClientNoteFinal = dfClientNoteFinal.astype({
    'num_interactions': int,
    'num_emails': int,
    'num_calls': int,
    'num_meetings': int,
    'max_gap': int,
    'min_gap': int,
})

In [None]:
# Summary statistics of the final feature table.
dfClientNoteFinal.describe()

In [None]:
# Preview the final feature table before it is written to disk.
dfClientNoteFinal.head()

In [None]:
# Persist the final per-client features. The context manager guarantees the
# file is flushed and closed even if pickling raises.
with open("client_note_features.p", "wb") as features_file:
    pickle.dump(dfClientNoteFinal, features_file)