# Interaction Information Over Time

The purpose of this script is to convert client note data
from one row per interaction to one row per client.

This script provides information about the first and last subscription periods.
It requires access to the client_note table
and produces times.p (based only on client_note).

Requires:
    last_subscription.p (subscription events for last period)
    first_subscription.p (subscription events for first period)
    (both generated by client_subscription.ipynb)

Produces:
    client_note_last.p (last period, one row per tenant-client relationship)
    client_note_first.p (first period, one row per tenant-client relationship)

Engineered features in client_note_last:
email_last, call_last, meeting_last
Engineered features in client_note_first:
email_first, call_first, meeting_first

Run this script after running client_subscription.ipynb
Run this script before running feature.ipynb

In [None]:
#Import libraries
%matplotlib inline
import logging
import collections
import datetime as dt
import sys
import os
import pickle

import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import mysql.connector

# Load Data

In [None]:
# Create database engine
# Connection settings come from the environment; a missing variable raises
# KeyError immediately instead of failing later with an obscure DB error.
dbname = os.environ["DBNAME"]
uname = os.environ["UNAME"]
passwd = os.environ["PASSWD"]
portnum = os.environ["PORTNUM"]

# The URL layout is user:password@host:port/database. Use the same
# credentials as the raw connection below so both handles agree.
# NOTE(review): credentials containing URL-special characters (e.g. '@', '/')
# would need to be percent-encoded before being placed in the URL.
engine = create_engine(
    'mysql+mysqlconnector://' + uname + ':' + passwd
    + '@localhost:' + portnum + '/' + dbname,
    echo=False)
# Connect to database
# NOTE(review): this connection does not pass PORTNUM and therefore uses the
# connector's default port -- confirm whether port=portnum is wanted here too.
conn = mysql.connector.connect(
         user=uname,
         password=passwd,
         host='localhost',
         database=dbname)

In [None]:
# Load client note table
# Columns available in client_note:
#   id, note, subject, created_by_employee_id, created_date_time,
#   modified_date_time, client_id, file_path, interaction_type_id,
#   external_system_id, source_id, thread_id
# Only the three columns used further down are selected.
sql_query = "SELECT client_id, interaction_type_id, created_date_time FROM client_note;"
dfClientNote = pd.read_sql_query(sql=sql_query, con=conn)
dfClientNote.head()

# Extract year, month and day


In [None]:
# Convert the creation timestamp to datetime and derive the calendar columns
# date, year, month and day.
# No format string is passed positionally: the second positional parameter of
# pd.to_datetime is `errors`, so a format would have to be given as format=...
sDateTime = pd.to_datetime(dfClientNote['created_date_time'])
year = sDateTime.dt.year
month = sDateTime.dt.month
day = sDateTime.dt.day
# .dt.date yields datetime.date objects directly and tolerates missing values
# and fractional seconds, unlike a str()/strptime round-trip per row.
date = sDateTime.dt.date
# Separate assign calls keep the column order date, year, month, day.
dfClientNote = (dfClientNote
                .assign(date=date)
                .assign(year=year)
                .assign(month=month)
                .assign(day=day))
dfClientNote.head()

# Part 1
Given a dataframe of timestamps pertaining to a single client,
sort by timestamp, compute the time difference between consecutive interactions,
and return the mean interval (in days) between communications.
Assumes the dataframe pertains to only one client;
the client must be selected beforehand (e.g. by grouping on client_id).

In [None]:

def CalcAverageInterval(df):
    """Return the mean gap, in whole days, between consecutive interactions.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single client; must contain a datetime-like
        ``created_date_time`` column. The frame is not modified.

    Returns
    -------
    float
        Mean of the gaps between successive timestamps after sorting, with
        each gap truncated to whole days before averaging. NaN when there
        are fewer than two valid timestamps, or when the computation fails
        (the error is logged).
    """
    try:
        stamps = df['created_date_time'].sort_values()
        # diff() leaves a missing value in the first slot (and next to any
        # missing timestamp); drop those so only real gaps are averaged.
        gaps = stamps.diff().dropna()
        # .dt.days keeps only the whole-day component of each gap.
        return gaps.dt.days.mean()
    except Exception as err:
        # Best effort: one malformed group must not abort a groupby.apply.
        logging.exception(err)
        return float('nan')

In [None]:
# Compute the average interaction interval for every client and shape the
# result as a two-column frame: client_id, avg_interval.
dfClientNoteGrouped = dfClientNote.groupby(['client_id'])
dfTimes = (dfClientNoteGrouped
           .apply(CalcAverageInterval)
           .to_frame(name='avg_interval')
           .reset_index()
           # Fallback in case the group index came back unnamed.
           .rename(columns={'index': 'client_id'}))
dfTimes['client_id'] = dfTimes['client_id'].astype(int)
dfTimes['avg_interval'] = dfTimes['avg_interval'].astype(float)
dfTimes.info()
dfTimes.head()

In [None]:
# Number of clients with a computed average interval
print(len(dfTimes))

In [None]:
# Persist the per-client average intervals. The context manager guarantees
# the file is flushed and closed even if pickling raises.
with open("times.p", "wb") as times_file:
    pickle.dump(dfTimes, times_file)

# Part 2 
Given a multi-client dataframe of interactions. Count number of emails, calls and meetings for each client. One row per client. Time window should be determined beforehand.

In [None]:

def InteractionLevelToClientLevel(df,periodID):
    """Collapse interaction-level rows into one row of counts per client.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``client_id`` and ``interaction_type_id`` columns; any
        other columns are ignored. The frame is not modified.
    periodID : str
        Suffix appended to the output column names (e.g. 'first', 'last').

    Returns
    -------
    pandas.DataFrame
        Columns ``client_id``, ``email_<periodID>``, ``call_<periodID>`` and
        ``meeting_<periodID>`` holding integer counts of interactions with
        type id 4, 3 and 5 respectively. There is one row per client having
        at least one interaction with a non-null type; a client whose
        interactions are all of other types gets zeros. A type that never
        occurs in ``df`` yields a column of zeros.
    """
    # interaction_type_id -> output column prefix, in output column order.
    type_prefixes = ((4, 'email_'), (3, 'call_'), (5, 'meeting_'))
    type_codes = [code for code, _ in type_prefixes]

    # size() counts rows per (client, type) without adding a helper column
    # to the caller's frame.
    counts = df.groupby(['client_id', 'interaction_type_id']).size()
    if counts.empty:
        wide = pd.DataFrame(0, index=pd.Index([], name='client_id'),
                            columns=type_codes)
    else:
        wide = counts.unstack(level=1, fill_value=0)
        # Keep only the three types of interest; absent types become zeros.
        wide = wide.reindex(columns=type_codes, fill_value=0)

    wide.columns = [prefix + periodID for _, prefix in type_prefixes]
    return wide.astype(int).reset_index()

# Convert Last Subscription Period to Client-Level Info

In [None]:
# Load last subscription period data from file.
# The context manager closes the file handle as soon as loading finishes.
# NOTE(review): pickle executes code on load -- only open files produced by
# client_subscription.ipynb.
with open("last_subscription.p", "rb") as last_file:
    dfLastSubscription = pickle.load(last_file)
dfLastSubscriptionGrouped = dfLastSubscription.groupby(['tenant_id','client_id'])
dfClientNote.head()
dfLastSubscription.head()

In [None]:
# SIMPLE MERGE OF CLIENT NOTE AND SUBSCRIPTION
# Attach the last subscription period to every interaction, then keep only
# interactions dated inside that period (start_date <= date <= end_date).

dfMerge = pd.merge(dfClientNote,dfLastSubscription,left_on='client_id',right_on='client_id',how='left')
print(dfMerge.head())
print(len(dfMerge))
print('')
# Both bounds are combined into one mask; filtering dfMerge twice in a row
# would let the second filter discard the first.
in_period = (dfMerge['date'] >= dfMerge['start_date']) & (dfMerge['date'] <= dfMerge['end_date'])
# copy() so later column assignments cannot touch dfMerge.
dfMergePeriod = dfMerge[in_period].copy()
print(dfMergePeriod.head())
print(len(dfMergePeriod))

In [None]:
# Row counts before/after the period filter, with and without incomplete rows
print(len(dfMerge))
print(len(dfMerge.dropna()))
print(len(dfMergePeriod))
print(len(dfMergePeriod.dropna()))

In [None]:
# Convert the last-period interactions from interaction-level to client-level
dfLast = InteractionLevelToClientLevel(df=dfMergePeriod, periodID='last')
dfLast.head()

In [None]:
# Number of clients with interactions in the last period
print(len(dfLast))

In [None]:
# Persist the last-period client-level counts. The context manager guarantees
# the file is flushed and closed even if pickling raises.
with open("client_note_last.p", "wb") as last_out_file:
    pickle.dump(dfLast, last_out_file)

# Convert First Subscription Period to Client-Level Info

In [None]:
# Load first subscription period data from file.
# The context manager closes the file handle as soon as loading finishes.
# NOTE(review): pickle executes code on load -- only open files produced by
# client_subscription.ipynb.
with open("first_subscription.p", "rb") as first_file:
    dfFirstSubscription = pickle.load(first_file)
print(dfFirstSubscription.head())

In [None]:
# Client note data with first subscription period data
# Filter dates: keep only interactions dated inside the first period
# (start_date <= date <= end_date).
dfMerge = pd.merge(dfClientNote,dfFirstSubscription,left_on='client_id',right_on='client_id',how='left')
print(dfMerge.head())
print(len(dfMerge))
# Both bounds are combined into one mask; filtering dfMerge twice in a row
# would let the second filter discard the first.
in_period = (dfMerge['date'] >= dfMerge['start_date']) & (dfMerge['date'] <= dfMerge['end_date'])
# copy() so later column assignments cannot touch dfMerge.
dfMergePeriod = dfMerge[in_period].copy()
print(dfMergePeriod.head())
print(len(dfMergePeriod))

In [None]:
# Row counts before/after the period filter, with and without incomplete rows
print(len(dfMerge))
print(len(dfMerge.dropna()))
print(len(dfMergePeriod))
print(len(dfMergePeriod.dropna()))

In [None]:
# Convert interaction-level data to client-level data for the first period
dfFirst = InteractionLevelToClientLevel(df=dfMergePeriod, periodID='first')
dfFirst.head()

In [None]:
# Number of clients with interactions in the first period
print(len(dfFirst))

In [None]:
# Persist the first-period client-level counts. The context manager
# guarantees the file is flushed and closed even if pickling raises.
with open("client_note_first.p", "wb") as first_out_file:
    pickle.dump(dfFirst, first_out_file)

In [None]:
# Final sanity check: row counts of the input table and every derived frame
print(len(dfClientNote))
print(len(dfTimes))
print('###')
#print len(dfMerge)
#print len(dfMergePeriod)
print(len(dfLastSubscription))
print(len(dfLast))
print(len(dfFirstSubscription))
print(len(dfFirst))

In [None]:
#dfFirst.head()

In [None]:
#dfLast.head()