# Client subscription data

The purpose of this script is to create objects for the 
first and last subscription period
as well as the total duration of the relationship.

Requires client_subscription table

Produces:
client_subscription_expanded.p (one row per subscription period)
client_subscription_features.p (one row per client, data overall)
first_subscription.p (data for first period)
last_subscription.p (data for last period)

Engineered features (in first_subscription.p and last_subscription.p)
period_duration, active

Engineered features (in client_subscription_features.p):
num_periods, mean_duration, total_duration, min_duration, max_duration,
active_count, churned, start_date, end_date, subscription_length

Run this script before client_note_times.ipynb
Run this script before running feature.ipynb

In [None]:
#Import libraries
%matplotlib inline
import datetime as dt
import os
import sys
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import mysql.connector

# Load Data

In [None]:
# Create database engine
# All connection settings are read from the environment so that no credentials
# are stored in the notebook itself.
dbname = os.environ["DBNAME"]
uname = os.environ["UNAME"]
passwd = os.environ["PASSWD"]
portnum = os.environ["PORTNUM"]

# The SQLAlchemy URL has the form user:password@host:port/database and must carry
# the same user name and password as the direct connection below.
# NOTE(review): a password containing URL-reserved characters (@, /, :) has to be
# percent-encoded before it is embedded in the URL.
engine = create_engine(
    'mysql+mysqlconnector://' + uname + ':' + passwd + '@localhost:' + portnum + '/' + dbname,
    echo=False)
# Connect to database
# The port is passed explicitly so both connections target the same server.
conn = mysql.connector.connect(
         user=uname,
         password=passwd,
         host='localhost',
         port=int(portnum),
         database=dbname)

In [None]:
# Load the client_subscription table (one row per subscription period).
# Columns available in the table:
#   id, tenant_id, client_id, client_note_id, is_potential, potential_renewed_by_id,
#   probability, product_id, quantity, start_date, end_date, is_recurring, amount,
#   auto_renew, salesforce_opportunity_id, salesforce_id, termination_date,
#   renewed_date, renewed_by_id, created_date, salesforce_opportunity_line_item_id,
#   ccp_id, ccp_renewed_by_id, ccp_forecast_amount, created_from_id,
#   salesforce_forecast_id
# Not selected at the moment: renewed_date, termination_date
sql_query = """SELECT tenant_id, client_id, created_date, start_date, end_date FROM client_subscription;"""
dfClientSubscription = pd.read_sql_query(sql_query, con=conn)
dfClientSubscription.head()


# Clean data

In [None]:
# Number of distinct client ids in the table as loaded (before any cleaning).
len(pd.unique(dfClientSubscription['client_id']))


In [None]:
# Reference date used below to separate active from ended subscriptions
# (presumably the day the database dump was taken -- confirm).
dbDumpDate = dt.date(year=2016, month=11, day=2)

In [None]:
# Clean subscription events
# The row count is reported after every step so the effect of each filter is visible.
# print() is always called with exactly one argument, which produces the same
# output on Python 2 and Python 3.
print('Starting length')
print(len(dfClientSubscription))
print('')

# Discard rows with a missing value in any of the selected columns.
print('dropping na')
dfClientSubscription = dfClientSubscription.dropna()
print(len(dfClientSubscription))
print('')

# Discard periods whose start and end date are identical.
print('ensuring start and end date not the same')
dfClientSubscription = dfClientSubscription[dfClientSubscription['start_date'] != dfClientSubscription['end_date']]
print(len(dfClientSubscription))
print('')

# Discard periods that start after the reference date dbDumpDate.
print('ensuring start date is before db dump date')
dfClientSubscription = dfClientSubscription[dfClientSubscription.start_date.apply(lambda x: x <= dbDumpDate)]
print(len(dfClientSubscription))
print('')

In [None]:
# Peek at the subscription periods with the earliest end dates.
dfClientSubscription.sort_values(by='end_date').head()

The earliest subscription end date is July 1995.

# Calculate Duration for Each Subscription Period
One row per subscription

In [None]:
# Find period duration for each SUBSCRIPTION PERIOD

# Elapsed time between the start and the end of every period.
periodLength = dfClientSubscription['end_date'] - dfClientSubscription['start_date']
# Only periods that end after they start are kept.
hasPositiveLength = periodLength > dt.timedelta(days=0)
# The new column is attached with assign() on the filtered frame; writing a column
# directly into a frame obtained by boolean filtering raises SettingWithCopyWarning.
# period_duration is stored as a whole number of days.
dfClientSubscription = dfClientSubscription[hasPositiveLength].assign(
    period_duration=periodLength[hasPositiveLength].apply(lambda delta: delta.days))
dfClientSubscription.head()
# Row count noted by the original author after this step: 45,806

In [None]:
# Subscription periods ordered from longest to shortest.
dfClientSubscription.sort_values(by='period_duration', ascending=False)

In [None]:
# Summary statistics of the per-subscription frame (includes period_duration in days).
dfClientSubscription.describe()

In [None]:
# Histogram of period lengths in days; the figure is written to disk and then
# closed, so nothing is rendered inline.
fig, ax = plt.subplots()
ax.hist(dfClientSubscription['period_duration'])
fig.savefig('period_duration_hist.png')
plt.close(fig)

In [None]:
# Bar chart with one bar per subscription period, in the frame's current row order.
# The figure is written to disk and then closed, so nothing is rendered inline.
xvals = list(range(len(dfClientSubscription)))
yvals = dfClientSubscription['period_duration'].tolist()
fig, ax = plt.subplots()
ax.bar(xvals, yvals)
fig.savefig('period_duration_bar.png')
plt.close(fig)

# Determine activity status for each subscription period
One row per subscription

In [None]:
# A SUBSCRIPTION EVENT is active when its end date lies after the db dump date.
dfClientSubscription['active'] = dfClientSubscription['end_date'].map(lambda end: end > dbDumpDate)
dfClientSubscription.head()

In [None]:
# Subscription periods flagged as active.
dfActive = dfClientSubscription.loc[dfClientSubscription['active'] == True]
dfActive.head()

In [None]:
# Subscription periods flagged as not active.
dfInactive = dfClientSubscription.loc[dfClientSubscription['active'] == False]
dfInactive.head()

In [None]:
# Number of subscription periods overall, then split by activity flag.
# Single-argument print() calls behave the same on Python 2 and Python 3.
print('Activity status by client subscription events')
print(len(dfClientSubscription))
print('Active')
print(len(dfActive))
print('Inactive')
print(len(dfInactive))
# More active than inactive subscription

In [None]:
# Number of distinct clients, printed in this order: overall, with at least one
# active period, with at least one inactive period. A client can be in both groups.
# Single-argument print() calls behave the same on Python 2 and Python 3.
print('Activity status by client')
print(len(dfClientSubscription.client_id.unique()))
print(len(dfActive.client_id.unique()))
print(len(dfInactive.client_id.unique()))

In [None]:
# Of the 19335 clients,
#16,672 have an active subscription
#10,376 have an inactive subscription
# Note, there's overlap between the groups

In [None]:
# print 'Relationship Activity'
# print 'Active'
# print len(dfClientSubscriptionStatus[dfClientSubscriptionStatus.active>=0]) #19409 relationships active
# print 'Inactive'
# print len(dfClientSubscriptionStatus[dfClientSubscriptionStatus.active==0]) #2737 relationships inactive

Check for concurrent subscriptions

In [None]:
# Preview of the per-subscription frame; this cell only displays rows and does
# not itself test for overlapping periods.
dfClientSubscription.head()

In [None]:
# Save file for one row per subscription
# The with-block closes the file handle as soon as the pickle has been written.
with open("client_subscription_expanded.p", "wb") as outFile:
    pickle.dump(dfClientSubscription, outFile)

# Calculate Client Subscription Features


One row per client

In [None]:
def CalculateSubscriptionFeatures(df):
    """Summarise the subscription periods in ``df`` as one labelled Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Subscription periods to summarise. The columns read are
        'created_date', 'period_duration', 'start_date', 'end_date'
        and 'active'.

    Returns
    -------
    pandas.Series
        Values labelled, in this order: num_periods, mean_duration,
        total_duration, min_duration, max_duration, active_count, churned,
        start_date, end_date, subscription_length.
    """
    ordered = df.sort_values('created_date')
    durations = ordered['period_duration']

    # Span between the earliest start and the latest end, in days.
    firstStart = min(ordered['start_date'])
    lastEnd = max(ordered['end_date'])
    spanDays = (lastEnd - firstStart).days

    # churned is 1 only when none of the periods is flagged active.
    activePeriods = ordered['active'].astype(int).sum()
    churnFlag = 0 if activePeriods >= 1 else 1

    # A list of (label, value) pairs keeps the order of the labels fixed.
    features = [
        ('num_periods', len(ordered)),
        ('mean_duration', durations.mean()),
        ('total_duration', durations.sum()),
        ('min_duration', min(durations)),
        ('max_duration', max(durations)),
        ('active_count', activePeriods),
        ('churned', churnFlag),
        ('start_date', firstStart),
        ('end_date', lastEnd),
        ('subscription_length', spanDays),
    ]
    labels = [label for label, _ in features]
    values = [value for _, value in features]
    return pd.Series(values, index=labels)

In [None]:
# One group per (tenant_id, client_id) pair.
grouped = dfClientSubscription.groupby(by=['tenant_id', 'client_id'])

In [None]:
# Compute the feature Series for every group, then turn the group keys
# (tenant_id, client_id) back into ordinary columns.
dfClientSubscriptionFeatures = grouped.apply(CalculateSubscriptionFeatures).reset_index()

In [None]:
# Preview of the per-client feature frame.
dfClientSubscriptionFeatures.head()

In [None]:
# Summary statistics of the per-client feature frame.
dfClientSubscriptionFeatures.describe()

In [None]:
# Histogram of the number of active periods per client, with bin edges 0 to 23.
# The figure is written to disk and then closed, so nothing is rendered inline.
fig, ax = plt.subplots()
ax.hist(dfClientSubscriptionFeatures['active_count'], bins=range(24))
fig.savefig('active_count.png')
plt.close(fig)

The minimum number of active subscription periods per client was 0 and the maximum was 23; the mean was not recorded.

In [None]:
# Histogram of total_duration per client (this feature was previously named
# period_duration_sum). The figure is written to disk and then closed.
fig, ax = plt.subplots()
ax.hist(dfClientSubscriptionFeatures['total_duration'])
fig.savefig('total_duration_sum.png')
plt.close(fig)

In [None]:
# Histogram of mean_duration per client (this feature was previously named
# period_duration_mean). The figure is written to disk and then closed.
fig, ax = plt.subplots()
ax.hist(dfClientSubscriptionFeatures['mean_duration'])
fig.savefig('mean_duration.png')
plt.close(fig)

In [None]:
# Histogram of num_periods per client (this feature was previously named
# period_count). The figure is written to disk and then closed.
fig, ax = plt.subplots()
ax.hist(dfClientSubscriptionFeatures['num_periods'])
fig.savefig('num_periods.png')
plt.close(fig)

In [None]:
# Save the per-client feature frame.
# The with-block closes the file handle as soon as the pickle has been written.
with open("client_subscription_features.p", "wb") as outFile:
    pickle.dump(dfClientSubscriptionFeatures, outFile)

# Subset first and last subscription periods
One row per client

In [None]:
# First period per client: among the periods whose end date is on or before
# dbDumpDate, take the one with the earliest end date.
# first() returns the first non-null value of each column within a group, which is
# the group's first row as long as the frame holds no missing values.
endedOnOrBeforeDump = dfClientSubscription['end_date'].map(lambda end: end <= dbDumpDate)
dfClientSubscriptionFirst = dfClientSubscription[endedOnOrBeforeDump]
dfClientSubscriptionFirstSorted = dfClientSubscriptionFirst.sort_values(by='end_date')
dfClientSubscriptionFirstSortedGrouped = dfClientSubscriptionFirstSorted.groupby(by=['tenant_id', 'client_id'])
dfFirstSubscription = dfClientSubscriptionFirstSortedGrouped.first().reset_index()
dfFirstSubscription.head()

In [None]:
# Summary statistics of the first-period frame (one row per client).
dfFirstSubscription.describe()

In [None]:
# Save the first-period frame.
# The with-block closes the file handle as soon as the pickle has been written.
with open("first_subscription.p", "wb") as outFile:
    pickle.dump(dfFirstSubscription, outFile)

In [None]:
# Most recent period per client: among the periods whose end date is on or before
# dbDumpDate, take the one with the latest end date.
# first() returns the first non-null value of each column within a group, which is
# the group's first row as long as the frame holds no missing values.
endedOnOrBeforeDump = dfClientSubscription['end_date'].map(lambda end: end <= dbDumpDate)
dfClientSubscriptionRecent = dfClientSubscription[endedOnOrBeforeDump]
dfClientSubscriptionRecentSorted = dfClientSubscriptionRecent.sort_values(by='end_date', ascending=False)
dfClientSubscriptionRecentSortedGrouped = dfClientSubscriptionRecentSorted.groupby(by=['tenant_id', 'client_id'])
dfLastSubscription = dfClientSubscriptionRecentSortedGrouped.first().reset_index()
dfLastSubscription.head()

In [None]:
# Summary statistics of the last-period frame (one row per client).
dfLastSubscription.describe()

In [None]:
# Save the last-period frame.
# The with-block closes the file handle as soon as the pickle has been written.
with open("last_subscription.p", "wb") as outFile:
    pickle.dump(dfLastSubscription, outFile)