# Interaction Information in Aggregate

The purpose of this script is to count the total number of emails, calls and meetings
for the duration of the relationship

Produces client_note.p (and also writes client_note_expanded.p, client_note_set.p and client_note_final.p).

Run this script before running feature.ipynb

In [None]:
#Import libraries
%matplotlib inline
import os
import sys
import pickle
import datetime as dt
import logging

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import mysql.connector

In [None]:
# Create database engine
# Connection settings are read from the environment; a missing variable
# raises KeyError here rather than failing later inside the driver.
dbname = os.environ["DBNAME"]
uname = os.environ["UNAME"]
passwd = os.environ["PASSWD"]
portnum = os.environ["PORTNUM"]

# URL layout is dialect+driver://user:password@host:port/database.
# The user and password slots must carry uname and passwd (previously a
# hard-coded user name was used and uname sat in the password position).
engine = create_engine(
    'mysql+mysqlconnector://' + uname + ':' + passwd
    + '@localhost:' + portnum + '/' + dbname,
    echo=False)
# Connect to database
conn = mysql.connector.connect(
         user=uname,
         password=passwd,
         host='localhost',
         database=dbname)

# Load data


In [None]:
# Table client_note has the columns:
#   id, note, subject, created_by_employee_id, created_date_time,
#   modified_date_time, client_id, file_path, interaction_type_id,
#   external_system_id, source_id, thread_id
# Only three of them are pulled here (source_id and thread_id are left out).
sql_query = """SELECT client_id, interaction_type_id, created_date_time FROM client_note;"""
dfClientNote = pd.read_sql_query(sql=sql_query, con=conn)
dfClientNote.head()


# Clean data

In [None]:
# Keep only the rows whose interaction type is 3, 4 or 5
dfClientNote = dfClientNote[dfClientNote['interaction_type_id'].isin([3, 4, 5])]
dfClientNote.head()

In [None]:
# Print the column summary of the filtered notes table
dfClientNote.info()

In [None]:
# Summary statistics of the filtered notes table
dfClientNote.describe()

In [None]:
# Look at a case: the first remaining row, selected by position.
# Positional access is used because the row labelled 0 may have been
# removed by the interaction-type filter.
dfClientNote.iloc[0]

# Initial exploration

In [None]:
# Row counts overall and per interaction type (4 = email, 3 = call,
# 5 = meeting). Single-argument print(...) runs unchanged on both
# Python 2 and Python 3.
print('Total')
print(len(dfClientNote))
print('email')
dfClientNoteEmail = dfClientNote[dfClientNote.interaction_type_id==4]
print(len(dfClientNoteEmail))
print('call')
dfClientNoteCall = dfClientNote[dfClientNote.interaction_type_id==3]
print(len(dfClientNoteCall))
print('meeting')
dfClientNoteMeeting = dfClientNote[dfClientNote.interaction_type_id==5]
print(len(dfClientNoteMeeting))
#Email 91%, Calls 4%, Meetings 0% (other categories not informative)

In [None]:
# Basic info: how many distinct clients appear in the notes
# (notes about 20,378 clients)
dfClientNote['client_id'].unique().size

# Add date, year, month and day info

In [None]:
# Convert the creation timestamp to datetime and derive date, year, month,
# day and year-month columns from it.
#
# No format is passed: the second positional parameter of pd.to_datetime is
# `errors`, so a format string given positionally never acts as a format.
# NOTE(review): if the column ever arrives as 'dd/mm/yy HH:MM' text, pass
# format='%d/%m/%y %H:%M' by keyword -- confirm against the database schema.
sDateTime = pd.to_datetime(dfClientNote['created_date_time'])

# Vectorised .dt accessors replace per-row apply calls; .dt.date also avoids
# a str -> strptime round trip that fails on fractional seconds.
year = sDateTime.dt.year
month = sDateTime.dt.month
day = sDateTime.dt.day
yearmonth = sDateTime.dt.strftime('%Y-%m')
date = sDateTime.dt.date

# Chained single-column assign calls keep the column order
# date, year, month, day, yearmonth.
dfClientNote = (dfClientNote
                .assign(date=date)
                .assign(year=year)
                .assign(month=month)
                .assign(day=day)
                .assign(yearmonth=yearmonth))
dfClientNote.head()

In [None]:
# Persist the expanded notes table; the with-block guarantees the file
# handle is flushed and closed even if pickling fails.
with open("client_note_expanded.p", "wb") as fh:
    pickle.dump(dfClientNote, fh)

# Determine Relationship Length


In [None]:
# Group the notes by client so per-client features can be computed
dfGrouped = dfClientNote.groupby(by=['client_id'])

In [None]:
#Determine Relationship Length
def DetermineRelationshipLength(df):
    """Return the span in whole days between a client's first and last note.

    Parameters
    ----------
    df : pandas.DataFrame
        Notes of a single client. Must contain a datetime-like
        'created_date_time' column.

    Returns
    -------
    pandas.Series
        A one-element Series labelled 'num_days'. The value is 0 when the
        frame holds fewer than two notes.
    """
    numDays = 0
    if len(df) > 1:
        stamps = df['created_date_time']
        # Earliest and latest timestamps are all that is needed, so take
        # min/max directly instead of sorting; both skip missing values, so
        # a single missing timestamp no longer turns the result into NaN.
        numDays = (stamps.max() - stamps.min()).days
    return pd.Series([numDays], index=['num_days'])

In [None]:
# Apply the relationship-length helper to every client group, then turn the
# group key back into a regular column
dfLength = dfGrouped.apply(DetermineRelationshipLength).reset_index()
dfLength.head()

# Characterize communication gaps


In [None]:
def DetermineFeatures(df):
    """Summarise the gaps between one client's consecutive notes.

    Parameters
    ----------
    df : pandas.DataFrame
        Notes of a single client. Must contain a datetime-like
        'created_date_time' column.

    Returns
    -------
    pandas.Series or None
        Series labelled 'num_interactions', 'mean_gap', 'max_gap' and
        'min_gap', with gaps measured in whole days. None is returned when
        the frame holds fewer than two notes, when no gap can be computed,
        or when an error occurred (the error is logged, not raised).
    """
    numInteractions = int(len(df))
    if numInteractions < 2:
        return None
    try:
        stamps = df['created_date_time'].sort_values()
        # After an ascending sort each difference to the previous note is
        # non-negative. The first row has no predecessor and rows with a
        # missing timestamp give a missing difference; both are dropped.
        # .dt.days extracts whole days and works across pandas versions,
        # unlike astype('timedelta64[D]').
        gaps = stamps.diff().dropna().dt.days
        if gaps.empty:
            # Nothing to summarise (e.g. all but one timestamp missing).
            return None
        return pd.Series(
            [numInteractions, gaps.mean(), gaps.max(), gaps.min()],
            index=['num_interactions', 'mean_gap', 'max_gap', 'min_gap'])
    except Exception as err:
        # Best effort per client: log the failure and skip this group so
        # one bad group does not abort the whole groupby.apply.
        logging.exception(err)
        return None

In [None]:
# Apply the gap-feature helper to every client group, then turn the group
# key back into a regular column
dfGaps = dfGrouped.apply(DetermineFeatures).reset_index()
dfGaps.head()

# Count total interactions
Total number of interactions per client

In [None]:
# Total number of interactions for each client
dfInteractions = dfClientNote[['client_id','interaction_type_id']]
# count() tallies the rows per client. sum() would add up the type codes
# (3/4/5) and so would not be a count at all.
dfInteractionCount = dfInteractions.groupby(['client_id']).count()
dfInteractionCount = dfInteractionCount.reset_index()
dfInteractionCount = dfInteractionCount.rename(columns={'interaction_type_id':'total_interaction_count'})
dfInteractionCount.head()

In [None]:
# Reindex by client and type of interaction
# Rebind instead of sorting in place, so no frame derived from another one
# is mutated.
dfInteractions = dfInteractions.sort_values('client_id')
# Both keys go in one list: a second positional argument to set_index is
# `drop`, not another index level.
dfInteractionReindexed = dfClientNote.set_index(['client_id', 'interaction_type_id'])
dfInteractionReindexed.head()
# Total number of interactions is 1,573,652

# Count interactions by type

In [None]:
# Number of notes per (client, interaction type) pair, as a flat table with
# the tally in a column named 'count'
dfClientNoteInteractionTypeCount = (
    dfInteractions
    .groupby(['client_id', 'interaction_type_id'])
    .size()
    .reset_index(name='count')
)
dfClientNoteInteractionTypeCount.head()

In [None]:
# One row per client, count of different types of interactions
# Pivot the (client, type, count) table so each interaction type becomes a
# column holding that client's count.
df = dfClientNoteInteractionTypeCount.pivot(index='client_id',
                                            columns='interaction_type_id',
                                            values='count')

# reindex guarantees that all three type columns exist (4 = email,
# 3 = call, 5 = meeting) even when a type never occurs in the data; clients
# without a given type get 0, and the counts are stored as integers.
typeCounts = df.reindex(columns=[4, 3, 5]).fillna(0).astype(int)
typeCounts.columns = ['email', 'call', 'meeting']

dfClientNoteInteractionTypeCountEd = typeCounts.reset_index()
dfClientNoteInteractionTypeCountEd.head()
#RESERVE FOR MERGING

In [None]:
# Print the column summary of the per-client type-count table
dfClientNoteInteractionTypeCountEd.info()

In [None]:
# Summary statistics of the per-client type-count table
dfClientNoteInteractionTypeCountEd.describe()

In [None]:
# Persist the per-client type counts; the with-block guarantees the file
# handle is flushed and closed even if pickling fails.
with open("client_note.p", "wb") as fh:
    pickle.dump(dfClientNoteInteractionTypeCountEd, fh)

In [None]:
# Set of all client ids that have at least one counted interaction
sClientNote = set(dfClientNoteInteractionTypeCountEd['client_id'])

In [None]:
# Persist the set of client ids; the with-block guarantees the file handle
# is flushed and closed even if pickling fails.
with open("client_note_set.p", "wb") as fh:
    pickle.dump(sClientNote, fh)

# Putting it together

In [None]:
# Preview the relationship-length table before merging
dfLength.head()

In [None]:
# Preview the gap-feature table before merging
dfGaps.head()

In [None]:
# Preview the per-client type-count table before merging
dfClientNoteInteractionTypeCountEd.head()

In [None]:
# Summary statistics of the relationship-length table
dfLength.describe()

In [None]:
# Summary statistics of the gap-feature table
dfGaps.describe()

In [None]:
# Summary statistics of the per-client type-count table
dfClientNoteInteractionTypeCountEd.describe()

In [None]:
# Join the three per-client tables on client_id. The default inner join
# keeps only clients present in every table.
mrg1 = dfLength.merge(dfGaps, on='client_id')
mrg2 = mrg1.merge(dfClientNoteInteractionTypeCountEd, on='client_id')
mrg2.head()

In [None]:
# Interactions per day over the length of the relationship.
# A relationship spanning 0 whole days would divide by zero and yield inf,
# which dropna() does not remove; treat that span as missing so the
# frequency is NaN instead.
# NOTE(review): this makes same-day clients drop out at the later dropna()
# step -- confirm that is the desired handling.
mrg2['frequency'] = mrg2['num_interactions'] / mrg2['num_days'].replace(0, np.nan)

In [None]:
# Drop every row that has a missing value in any column
dfFinal = mrg2.dropna(axis=0, how='any')
dfFinal.head()

In [None]:
# Summary statistics of the final merged feature table
dfFinal.describe()

In [None]:
# Persist the final feature table; the with-block guarantees the file
# handle is flushed and closed even if pickling fails.
with open("client_note_final.p", "wb") as fh:
    pickle.dump(dfFinal, fh)