In [None]:
# The purpose of this script is to convert client note data 
# from one row per interaction to one row per client

# This script summarises client notes per client for the first and last
# subscription period, and the average interval between a client's notes:
# produces times.p
# produces client_note_last.p
# produces client_note_first.p

# Run this script after running client_subscription.ipynb
# Run this script before running feature.ipynb

In [None]:
#Import libraries
%matplotlib inline
import logging
import collections
import datetime as dt
import sys
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import mysql.connector

In [None]:
# Create database engine and connection.
# The password comes from the PASSWD environment variable so that no
# credential is stored in the notebook.
dbname = 'cs'
username = 'rjf'
passwd = os.environ["PASSWD"]

# Credentials are handed to the driver through connect_args instead of being
# embedded in the URL, so printing the URL below cannot expose the password.
# No port is given, so the driver default is used, as for `conn` below.
# NOTE(review): the previous URL named user 'mydb_user' and port 5432; the
# engine now uses the same account and server as `conn` -- confirm.
engine = create_engine(
    'mysql+mysqlconnector://localhost/{}'.format(dbname),
    connect_args={'user': username, 'password': passwd},
    echo=False)
print(engine.url)

# Connect to database
conn = mysql.connector.connect(
         user=username,
         password=passwd,
         host='localhost',
         database=dbname)

In [None]:
# Columns available in client_note:
#   id, note, subject, created_by_employee_id, created_date_time,
#   modified_date_time, client_id, file_path, interaction_type_id,
#   external_system_id, source_id, thread_id
# Only the three columns used by this notebook are fetched.
sql_query = """SELECT client_id, interaction_type_id, created_date_time FROM client_note;"""
dfClientNote = pd.read_sql_query(sql=sql_query, con=conn)
dfClientNote.head()

In [None]:
# Parse the note timestamps and add date, year, month and day columns.

# The second positional parameter of pd.to_datetime is `errors`, not
# `format`, so no format string is passed positionally here.
# NOTE(review): if created_date_time ever arrives as text, pass the matching
# pattern explicitly with format=... -- confirm against the source table.
sDateTime = pd.to_datetime(dfClientNote['created_date_time'])

# .dt.date yields datetime.date objects directly; unlike a str()/strptime
# round trip it also copes with missing values and sub-second timestamps.
dfClientNote = (dfClientNote
                .assign(date=sDateTime.dt.date)
                .assign(year=sDateTime.dt.year)
                .assign(month=sDateTime.dt.month)
                .assign(day=sDateTime.dt.day))
dfClientNote.head()

In [None]:
# Accumulator filled by CalcAverageInterval: str(client_id) -> average interval.
# The dtype is given explicitly because the default dtype of an empty Series
# differs between pandas versions.
dfTimes = pd.Series(dtype=float)

In [None]:
# Given dataframe with timestamps, sort by timestamp, compute time difference and a mean
def CalcAverageInterval(df):
    """Record one client's average gap, in whole days, between consecutive notes.

    Written to be used with ``groupby('client_id').apply(...)``, where ``df``
    holds the rows of a single client. The result is not returned: it is
    stored in the module-level ``dfTimes`` Series under ``str(client_id)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``client_id`` and ``created_date_time``.

    Returns
    -------
    None
        Clients with fewer than two usable timestamps have no interval and
        get no entry in ``dfTimes``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    if df.empty:
        return

    client_id = str(df['client_id'].iloc[0])

    # Unparseable or missing timestamps become NaT and are dropped, so they
    # cannot contribute an interval.
    stamps = pd.to_datetime(df['created_date_time'], errors='coerce')
    stamps = stamps.dropna().sort_values()

    # After sorting, each difference is the gap to the previous note.
    gaps = stamps.diff().dropna()
    if gaps.empty:
        return

    # .dt.days floors every gap to whole days and is available across pandas
    # versions, unlike casting to 'timedelta64[D]'.
    dfTimes[client_id] = float(gaps.dt.days.mean())
    return

In [None]:
# Feed every client's notes through CalcAverageInterval; the function fills
# dfTimes (client_id -> average interval) as a side effect.
dfClientNoteGrouped = dfClientNote.groupby(['client_id'])
dfClientNoteGrouped.apply(CalcAverageInterval)

# Turn the collected Series into a two-column frame with numeric dtypes.
dfTimes = (pd.DataFrame(dfTimes, columns=['avg_interval'])
             .reset_index()
             .rename(columns={'index': 'client_id'}))
dfTimes['client_id'] = dfTimes['client_id'].astype(int)
dfTimes['avg_interval'] = dfTimes['avg_interval'].astype(float)
dfTimes.info()
dfTimes.head()

In [None]:
# Persist the per-client average intervals; the context manager closes the
# file even if pickling fails.
with open("times.p", "wb") as fh:
    pickle.dump(dfTimes, fh)

In [None]:
##########################

In [None]:
# Load the last-subscription frame; the context manager closes the file.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
with open("last_subscription.p", "rb") as fh:
    dfLastSubscription = pickle.load(fh)
dfLastSubscription.head()

In [None]:
# Group the last-subscription rows by tenant and client.
# NOTE(review): this grouping is not referenced again in the visible notebook.
dfLastSubscriptionGrouped = dfLastSubscription.groupby(by=['tenant_id', 'client_id'])

In [None]:
# Preview the note-level frame before it is merged with the subscription data
dfClientNote.head()

In [None]:
# Preview the last-subscription frame before it is merged with the notes
dfLastSubscription.head()

In [None]:
# SIMPLE MERGE
# No join keys are named, so pandas joins on every column name that the two
# frames have in common.
dfMerge = dfClientNote.merge(dfLastSubscription)
print(dfMerge.head())
print(len(dfMerge))

In [None]:
# Keep only the notes dated inside the subscription period. Both bounds are
# combined into one mask so the start_date condition is not lost when the
# end_date condition is applied.
in_period = (dfMerge['date'] >= dfMerge['start_date']) & (dfMerge['date'] <= dfMerge['end_date'])
dfMergePeriod = dfMerge[in_period]
print(dfMergePeriod.head())
print(len(dfMergePeriod))


In [None]:
# Preview the rows kept by the period filter
dfMergePeriod.head()

In [None]:
def InteractionLevelToClientLevel(df, periodID,
                                  interaction_types=((4, 'email'), (3, 'call'), (5, 'meeting'))):
    """Collapse one-row-per-interaction data into one row per client.

    Counts, for every client, how many rows exist for each of the requested
    interaction types. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``client_id`` and ``interaction_type_id``.
    periodID : str
        Suffix for the count columns, e.g. ``'last'`` gives ``email_last``.
    interaction_types : sequence of (interaction_type_id, label) pairs
        Types to count and the column prefix for each. The default maps
        4 -> email, 3 -> call, 5 -> meeting.

    Returns
    -------
    pandas.DataFrame
        Columns ``client_id`` followed by ``<label>_<periodID>`` for every
        requested type, in the given order. Counts are integers; a client
        without interactions of a type gets 0. Every client present in
        ``df`` gets a row, including clients that only have interaction
        types which are not counted.

    Raises
    ------
    KeyError
        If a non-empty ``df`` lacks one of the required columns.
    """
    count_cols = [label + '_' + periodID for _, label in interaction_types]

    if df.empty:
        # Nothing to count: return the expected columns with no rows.
        dfEd = pd.DataFrame(columns=['client_id'] + count_cols)
    else:
        # Rows are clients, columns are interaction type ids; a missing
        # client/type combination shows up as NaN after unstack.
        counts = (df.groupby(['client_id', 'interaction_type_id'])
                    .size()
                    .unstack(level=1))

        dfEd = pd.DataFrame(index=counts.index)
        for (type_id, _), col in zip(interaction_types, count_cols):
            if type_id in counts.columns:
                dfEd[col] = counts[type_id].fillna(0).astype(int)
            else:
                # No interaction of this type in the whole input.
                dfEd[col] = 0
        dfEd = dfEd.reset_index()

    print(dfEd.head())

    return dfEd

In [None]:
# Per-client interaction counts for the last subscription period
dfLast = InteractionLevelToClientLevel(df=dfMergePeriod, periodID='last')

In [None]:
# Persist the last-period counts; the context manager closes the file even
# if pickling fails.
with open("client_note_last.p", "wb") as fh:
    pickle.dump(dfLast, fh)

In [None]:
# Load the first-subscription frame; the context manager closes the file.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
with open("first_subscription.p", "rb") as fh:
    dfFirstSubscription = pickle.load(fh)
dfFirstSubscription.head()

In [None]:
# Join the notes to the first-subscription rows. No join keys are named, so
# pandas joins on every column name that the two frames have in common.
dfMerge = dfClientNote.merge(dfFirstSubscription)
print(dfMerge.head())
print(len(dfMerge))

In [None]:
# Keep only the notes dated inside the subscription period. Both bounds are
# combined into one mask so the start_date condition is not lost when the
# end_date condition is applied.
in_period = (dfMerge['date'] >= dfMerge['start_date']) & (dfMerge['date'] <= dfMerge['end_date'])
dfMergePeriod = dfMerge[in_period]
print(dfMergePeriod.head())
print(len(dfMergePeriod))

In [None]:
# Per-client interaction counts for the first subscription period
dfFirst = InteractionLevelToClientLevel(df=dfMergePeriod, periodID='first')

In [None]:
# Persist the first-period counts; the context manager closes the file even
# if pickling fails.
with open("client_note_first.p", "wb") as fh:
    pickle.dump(dfFirst, fh)