# Features

The purpose of this notebook is to merge several pre-generated Python objects (saved in pickle format) into a single feature matrix for subsequent modeling.

Requirements:
client_note.p (generated using client_note.py)
times.p (generated using client_note_times.py)
client_note_last.p (generated using client_note_times.ipynb)
client_note_first.p (generated using client_note_times.ipynb)
client_subscription.p (generated using client_subscription.py)
last_touch.p (generated using last_touch.ipynb)

Produces:
features.p
features_all.p (features.p plus the duration-normalised interaction columns)


In [None]:
#Import libraries
%matplotlib inline
import logging
import collections
import datetime as dt
import os
import sys
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [None]:
dfClientNoteInteractionTypeCountEd = pickle.load(open( "client_note.p", "rb" ))
dfClientNoteInteractionTypeCountEd['client_id'] = dfClientNoteInteractionTypeCountEd.client_id.astype(int)
dfClientNoteInteractionTypeCountEd.head()

In [None]:
#Number of rows with missing data
dfClientNoteInteractionTypeCountEd.shape[0] - dfClientNoteInteractionTypeCountEd.dropna().shape[0]

In [None]:
dfLast = pickle.load(open( "client_note_last.p", "rb" ))
dfLast['client_id'] = dfLast.client_id.astype(int)
dfLast.head()

In [None]:
#Number of rows with missing data
dfLast.shape[0] - dfLast.dropna().shape[0]

In [None]:
# Load the frame pickled in client_note_first.p. The context manager closes
# the file handle as soon as the pickle has been read.
with open("client_note_first.p", "rb") as fh:
    dfFirst = pickle.load(fh)
# client_id is the join key for the merges further down; store it as int.
dfFirst['client_id'] = dfFirst.client_id.astype(int)
dfFirst.head()

In [None]:
dfTimes = pickle.load( open( "times.p", "rb" ) )
#print dfTimes.head()
dfTimes['client_id']=pd.DataFrame(dfTimes.client_id.astype(int))
dfTimes.head()

In [None]:
# Number of rows with missing data
dfTimes.shape[0] - dfTimes.shape[0]

In [None]:
dfSubscriptionCombined = pickle.load(open('client_subscription.p'))
dfSubscriptionCombined['client_id'] = dfSubscriptionCombined.client_id.astype(int)
dfSubscriptionCombined.head()

In [None]:
#Number of rows with missing data
dfSubscriptionCombined.shape[0] - dfSubscriptionCombined.dropna().shape[0]

# Merge Data

In [None]:
dfFeatures = pd.merge(dfClientNoteInteractionTypeCountEd,dfTimes,how='left')
dfFeatures.head()

In [None]:
#Number of rows with missing data
dfFeatures.shape[0] - dfFeatures.dropna().shape[0]

In [None]:
len(dfFeatures)

In [None]:
# DataFrame.info() prints its own summary and returns None, so call it
# directly; printing the return value only added a stray "None" line.
dfFeatures.info()
dfLast.info()
dfFirst.info()

In [None]:
dfFeatures = pd.merge(dfFeatures,dfLast,left_on='client_id',right_on='client_id',how='left')
dfFeatures.head()

In [None]:
#Number of rows with missing data
dfFeatures.shape[0] - dfFeatures.dropna().shape[0]

In [None]:
print len(dfFeatures)

In [None]:
dfFeatures = pd.merge(dfFeatures,dfFirst,left_on='client_id',right_on='client_id',how='left')
dfFeatures.head()

In [None]:
#Number of rows with missing data
dfFeatures.shape[0] - dfFeatures.dropna().shape[0]

In [None]:
print len(dfFeatures)

In [None]:
dfFeatures = pd.merge(dfFeatures,dfSubscriptionCombined,left_on='client_id',right_on='client_id',how='left') #lose data here
dfFeatures.head()

In [None]:
#Number of rows with missing data
dfFeatures.shape[0] - dfFeatures.dropna().shape[0]

In [None]:
print len(dfFeatures)

In [None]:
dfFeatures[dfFeatures['client_id']==94]

In [None]:
# Load the frame pickled in last_touch.p. The context manager closes the
# file handle as soon as the pickle has been read.
with open("last_touch.p", "rb") as fh:
    dfLastTouch = pickle.load(fh)
# NOTE(review): unlike the other input frames, client_id is not cast to int
# before merging -- confirm last_touch.p already stores it as an integer.
dfFeatures = pd.merge(dfFeatures, dfLastTouch, on='client_id', how='left')
dfFeatures.head()

In [None]:
#Number of rows with missing data
dfFeatures.shape[0] - dfFeatures.dropna().shape[0] #last touch 

In [None]:
print len(dfFeatures)


In [None]:
dfFeatures = dfFeatures.dropna()
print len(dfFeatures)

In [None]:
# Columns kept in the feature matrix, in output order.
# The *_first / *_last interaction columns merged in earlier are not selected
# here; add call_first, email_first, meeting_first, call_last, email_last and
# meeting_last to this list to keep them.
featureColumns = ['tenant_id', 'client_id', 'call', 'email', 'meeting', 'avg_interval',
                  'period_duration_sum', 'period_duration_mean', 'period_count',
                  'days_since_last_touch', 'active_count', 'churned']

# Target dtype for each column. days_since_last_touch is deliberately absent
# and keeps whatever dtype it was loaded with.
# NOTE(review): period_duration_mean is cast to int -- if it holds fractional
# values they are truncated; confirm this is intended.
columnTypes = [
    ('tenant_id', str),
    ('client_id', str),
    ('call', int),
    ('email', int),
    ('meeting', int),
    ('avg_interval', float),
    ('period_duration_sum', int),
    ('period_duration_mean', int),
    ('period_count', int),
    ('active_count', int),
    ('churned', int),
]

# Select (and reorder) the columns first, taking an explicit copy so that the
# column assignments below and in later cells operate on an independent frame
# rather than on a slice of the merged data.
dfFeatures = dfFeatures[featureColumns].copy()
for columnName, columnType in columnTypes:
    dfFeatures[columnName] = dfFeatures[columnName].astype(columnType)

dfFeatures.info()

In [None]:
dfFeatures.head()

In [None]:
dfFeatures.info()

In [None]:
pickle.dump(dfFeatures, open( "features.p", "wb" ))

In [None]:
#dfFeatures[dfFeatures['client_id']==151]

In [None]:
# Summarise the input frames. DataFrame.info() prints its own report and
# returns None, so it is called directly; printing the return value only
# added a stray "None" after each summary.
for frame in (dfClientNoteInteractionTypeCountEd, dfTimes, dfSubscriptionCombined,
              dfLast, dfLastTouch):
    frame.info()

In [None]:
# Divide each interaction count by period_duration_sum, overwriting the raw
# count columns in place.
# NOTE(review): a zero period_duration_sum / period_duration_mean is not
# guarded against -- confirm zeros cannot occur in these columns.
dfFeatures['email'] = dfFeatures.email/dfFeatures.period_duration_sum
dfFeatures['call'] = dfFeatures.call/dfFeatures.period_duration_sum
dfFeatures['meeting'] = dfFeatures.meeting/dfFeatures.period_duration_sum

# NOTE(review): these read email/call/meeting *after* they were overwritten
# above, so each value is (count / period_duration_sum) / period_duration_mean.
# They are not derived from the *_first columns of client_note_first.p, which
# (assuming the cells run top to bottom) were left out of the column selection
# earlier -- confirm this is the intended definition.
dfFeatures['email_first'] = dfFeatures.email/dfFeatures.period_duration_mean
dfFeatures['call_first'] = dfFeatures.call/dfFeatures.period_duration_mean
dfFeatures['meeting_first'] = dfFeatures.meeting/dfFeatures.period_duration_mean

# NOTE(review): same expressions as the *_first columns above, so every
# *_last column is identical to its *_first counterpart -- confirm intended.
dfFeatures['email_last'] = dfFeatures.email/dfFeatures.period_duration_mean
dfFeatures['call_last'] = dfFeatures.call/dfFeatures.period_duration_mean
dfFeatures['meeting_last'] = dfFeatures.meeting/dfFeatures.period_duration_mean



In [None]:
dfFeatures.head()

In [None]:
pickle.dump(dfFeatures, open( "features_all.p", "wb" ))