In [1]:
# Set up the notebook to import modules from the directory above it.
import os
import sys

# Current working directory, e.g. '/home/user/example/parent/child'
current_path = os.path.abspath('.')

# Its parent directory, e.g. '/home/user/example/parent'
parent_path = os.path.dirname(current_path)

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if parent_path not in sys.path:
    sys.path.append(parent_path)

In [2]:
from data_pipeline import ETL_Pipeline

# Point the pipeline at the shared campaign directory.
dp = ETL_Pipeline('/workspace/shared-data/email-campaign/')

# The three input file names are passed positionally to process().
source_files = ('sent_emails.csv', 'responded.csv', 'userbase.csv')
transformed_df = dp.process(*source_files)

# Preview the first rows of the result.
transformed_df.head()

Unnamed: 0,SubjectLine_ID,Gender,Type,Email_Domain,Age_Group,Tenure_Group,Response_Received,Sent_Day
0,2,1,0,2,3,2,1,3
1,2,1,1,3,2,4,0,2
2,3,1,1,2,2,2,0,2
3,1,1,1,2,2,3,0,2
4,3,1,1,4,2,1,1,2


In [3]:
import pandas as pd
import numpy as np
import sklearn

from IPython.display import display, HTML

# Display properties for pandas output in this notebook.
pd.set_option('display.max_rows', None)             # never truncate rows
pd.set_option('display.max_columns', None)          # never truncate columns
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
# NOTE(review): precision is set to 2 here while float_format below renders
# floats with 3 decimals -- the two disagree; confirm which one is intended.
pd.set_option('display.precision', 2)
pd.set_option('display.float_format', '{:.3f}'.format)

In [4]:
# Read the campaign CSV from the shared data directory and preview it.
campaign_csv = '/workspace/shared-data/email-campaign/email_campaign_data.csv'
df = pd.read_csv(campaign_csv)
df.head()

Unnamed: 0,SubjectLine_ID,Gender,Type,Email_Domain,Age_Group,Tenure_Group,Response_Received,Sent_Day
0,2,1,0,2,3,2,1,3
1,2,1,1,3,2,4,0,2
2,3,1,1,2,2,2,0,2
3,1,1,1,2,2,3,0,2
4,3,1,1,4,2,1,1,2


In [5]:
from dataset import Email_Dataset

# Build an email campaign dataset from the loaded frame.
# (5 folds -- presumably the Email_Dataset default, since no fold count is
# passed here; confirm in dataset.py.)
target_column = 'Response_Received'
ecd = Email_Dataset(df, target_column)

# Training dataset for fold 2, previewed.
fold_id = 2
training_df = ecd.get_training_dataset(fold_id)
training_df.head()

Unnamed: 0,SubjectLine_ID,Gender,Type,Email_Domain,Age_Group,Tenure_Group,Sent_Day,Response_Received
0,2,1,0,2,3,2,3,1
1,2,1,1,3,2,4,2,0
2,3,1,1,2,2,2,2,0
3,1,1,1,2,2,3,2,0
4,3,1,1,4,2,1,2,1


In [6]:
# Testing dataset for fold 2, previewed.
fold_id = 2
testing_df = ecd.get_testing_dataset(fold_id)
testing_df.head()

Unnamed: 0,SubjectLine_ID,Gender,Type,Email_Domain,Age_Group,Tenure_Group,Sent_Day,Response_Received
0,2,0,1,4,2,2,3,0
1,2,1,0,4,3,3,5,0
2,1,1,1,2,2,5,6,0
3,3,1,1,5,0,2,4,0
4,1,0,1,2,2,3,5,0


In [7]:
# Report how many rows each of the two previewed splits contains.
n_train, n_test = len(training_df), len(testing_df)
print(f"We have {n_train} training responses and {n_test} testing responses")

We have 1981083 training responses and 495271 testing responses


In [9]:
from model import Email_Campaign_Model
from metrics import Metrics

# Report helper; generate_report() is called on it after the fold evaluations.
metrics = Metrics()

# Instantiate the model on the entire dataset, leaving every other constructor
# argument at its default (described by the notebook author as a 30% conversion
# threshold and 10 sent emails -- not visible here, confirm in model.py).
full_dataset = df
model = Email_Campaign_Model(full_dataset)

Here we will build the state table


In [None]:
# Hyperparameters for the training run that produces the Q table.
train_kwargs = dict(
    iterations=10000,
    starting_state=(1, 0, 0, 0, 0, 0, 0),
    epsilon=0.1,
    alpha=0.1,
    gamma=0.6,
)
qtable_df = model.train(**train_kwargs)
qtable_df.head()

In [None]:
# Write the Q table to disk; a later cell passes this file name to the model.
q_table_path = 'q_table.csv'
qtable_df.to_csv(q_table_path)

In [11]:
# Re-create the model, this time handing it the Q table file produced by
# training. The email-sent threshold was only needed while learning the
# Q table values, so it does not have to be considered here.
saved_model = Email_Campaign_Model(df, (1,0,0,0,0,0,0), 1, 0.25, 'q_table.csv', model.get_states())

# Per-fold accumulators that feed the training report.
folds, avg_conv, median_conv, min_conv, max_conv = [], [], [], [], []

# Score the training split of every fold against the saved Q table.
for i in range(5):
    train = ecd.get_training_dataset(i)
    avg, median, min_conversion, max_conversion = saved_model.test(train, 10)
    folds.append(f'Training Fold {i}')
    avg_conv.append(avg)
    median_conv.append(median)
    min_conv.append(min_conversion)
    max_conv.append(max_conversion)
    print(f'Completed fold {i}')

metrics.generate_report(
    folds, avg_conv, median_conv, min_conv, max_conv,
    '../results/training-results.txt',
)

We have 83 responded emails out of 205 sent for subject 1 with conversion rate 0.40487804878048783
We have 24 responded emails out of 55 sent for subject 1 with conversion rate 0.43636363636363634
We have 217 responded emails out of 777 sent for subject 3 with conversion rate 0.27927927927927926
We have 38 responded emails out of 97 sent for subject 1 with conversion rate 0.3917525773195876
We have 65 responded emails out of 139 sent for subject 1 with conversion rate 0.4676258992805755
We have 3 responded emails out of 7 sent for subject 2 with conversion rate 0.42857142857142855
We have 125 responded emails out of 300 sent for subject 1 with conversion rate 0.4166666666666667
We have 264 responded emails out of 820 sent for subject 3 with conversion rate 0.32195121951219513
We have 131 responded emails out of 491 sent for subject 3 with conversion rate 0.2668024439918534
We have 140 responded emails out of 493 sent for subject 1 with conversion rate 0.2839756592292089
Completed fold 

In [12]:
# Run metrics using the saved Q table on the testing split of every fold.
folds, avg_conv, median_conv, min_conv, max_conv = [], [], [], [], []

# Run through all the folds
for i in range(5):
    # Get the testing dataset for the fold and collect its metrics
    test = ecd.get_testing_dataset(i)
    avg, median, min_conversion, max_conversion = saved_model.test(test, 10)
    # These rows go into the testing report, so label them as testing folds
    # (they were previously labelled 'Training Fold', a copy-paste slip).
    folds.append(f'Testing Fold {i}')
    avg_conv.append(avg)
    median_conv.append(median)
    min_conv.append(min_conversion)
    max_conv.append(max_conversion)

metrics.generate_report(
    folds, avg_conv, median_conv, min_conv, max_conv,
    '../results/testing-results.txt',
)

We have 355 responded emails out of 1081 sent for subject 1 with conversion rate 0.3283996299722479
We have 28 responded emails out of 84 sent for subject 3 with conversion rate 0.3333333333333333
We have 41 responded emails out of 180 sent for subject 2 with conversion rate 0.22777777777777777
We have 50 responded emails out of 202 sent for subject 2 with conversion rate 0.24752475247524752
We have 53 responded emails out of 217 sent for subject 2 with conversion rate 0.24423963133640553
We have 57 responded emails out of 234 sent for subject 2 with conversion rate 0.24358974358974358
We have 65 responded emails out of 253 sent for subject 2 with conversion rate 0.25691699604743085
We have 254 responded emails out of 686 sent for subject 2 with conversion rate 0.37026239067055394
We have 24 responded emails out of 54 sent for subject 2 with conversion rate 0.4444444444444444
We have 59 responded emails out of 150 sent for subject 2 with conversion rate 0.3933333333333333
We have 501 r