In [1]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render every float in DataFrame output with exactly three decimal places
pd.set_option("display.float_format", "{:.3f}".format)

%matplotlib inline

In [2]:
# Load data
# data1: read from names_id_age.csv; data2: read from lead_sale_stats.csv.
# Paths are relative to the directory the notebook is run from.
data1 = pd.read_csv("take_home_data/data/names_id_age.csv")
data2 = pd.read_csv("take_home_data/data/lead_sale_stats.csv")

In [3]:
# data1 characteristics: column names, non-null counts and dtypes
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1000 non-null   int64 
 1   name       1000 non-null   object
 2   age        1000 non-null   int64 
 3   lead_id    1000 non-null   int64 
 4   lead_type  1000 non-null   object
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


In [4]:
# data2 characteristics: column names, non-null counts and dtypes
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   lead_id        996 non-null    object
 1   name           1000 non-null   object
 2   bought_policy  1000 non-null   int64 
 3   policy_amount  1000 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 31.4+ KB


In [5]:
# data1 sample observations (first five rows)
data1.head()

Unnamed: 0,id,name,age,lead_id,lead_type
0,0,AN4UFZ08R,40,1197608,B
1,1,M0XPQP,41,1116417,C
2,2,LUH4V4F9,45,1125118,A
3,3,KVC2IK,47,449886,A
4,4,3CIXG65M6W,53,668018,B


In [6]:
# data2 sample observations (first five rows)
data2.head()

Unnamed: 0,lead_id,name,bought_policy,policy_amount
0,1197608_b,AN4UFZ08R,0,0
1,c_1116417,M0XPQP,1,403
2,a_1125118,LUH4V4F9,1,367
3,a_449886,KVC2IK,0,0
4,b_668018,3CIXG65M6W,0,0


In [7]:
# Note: although I think the names are unique and equal in both dataframes, I will still be merging via lead_id,
#       as larger datasets could have duplicate names
# Fix lead type variable in 2nd dataframe.
# In data2, lead_id packs the numeric id and the lead-type letter into one string joined by "_",
# in either order (e.g. "1197608_b" or "c_1116417"). Each part is pulled out by what it looks like
# (digits vs. letters) rather than by comparing the two halves as strings.
# The guard makes the cell safe to re-run: once lead_type exists, lead_id no longer holds the
# packed string, so parsing it again would wipe the lead types.
if "lead_type" not in data2.columns:
    packed_lead_id = data2["lead_id"]
    # Rows whose lead_id is missing come out as NaN in both columns.
    data2["lead_type"] = packed_lead_id.str.extract(r"([A-Za-z]+)", expand=False)
    data2["lead_id"] = packed_lead_id.str.extract(r"(\d+)", expand=False)

In [8]:
# Fix some other variables
# Only rows whose lead id / lead type are missing are unusable for the merge, so restrict the drop
# to those two columns instead of discarding a row for a gap in any column.
# assign() builds a new frame, so no column is written into the result of dropna() in place.
data2 = (
    data2
    .dropna(subset=["lead_id", "lead_type"])
    .assign(
        lead_type=lambda df: df["lead_type"].str.upper(),  # match the upper-case letters used in data1
        lead_id=lambda df: df["lead_id"].astype(int),      # match data1's integer lead_id for the merge
    )
)

In [9]:
# Make sure everything went okay: lead_id should now be numeric and lead_type a separate upper-case column
data2.head()

Unnamed: 0,lead_id,name,bought_policy,policy_amount,lead_type
0,1197608,AN4UFZ08R,0,0,B
1,1116417,M0XPQP,1,403,C
2,1125118,LUH4V4F9,1,367,A
3,449886,KVC2IK,0,0,A
4,668018,3CIXG65M6W,0,0,B


In [10]:
# Merge dataframes
# Inner join on lead_id; columns present in both frames (name, lead_type) come back with
# _x (data1) and _y (data2) suffixes.
# lead_id is expected to identify exactly one row on each side. validate makes pandas raise a
# MergeError if that is not true, instead of silently duplicating rows and inflating the
# counts and revenue computed later.
data = pd.merge(data1, data2, on="lead_id", how="inner", validate="one_to_one")

In [11]:
# Make sure everything went okay: inspect the first merged rows, including the _x / _y duplicate columns
data.head()

Unnamed: 0,id,name_x,age,lead_id,lead_type_x,name_y,bought_policy,policy_amount,lead_type_y
0,0,AN4UFZ08R,40,1197608,B,AN4UFZ08R,0,0,B
1,1,M0XPQP,41,1116417,C,M0XPQP,1,403,C
2,2,LUH4V4F9,45,1125118,A,LUH4V4F9,1,367,A
3,3,KVC2IK,47,449886,A,KVC2IK,0,0,A
4,4,3CIXG65M6W,53,668018,B,3CIXG65M6W,0,0,B


In [12]:
# Count the merged rows whose name is identical in both source tables
data["name_x"].eq(data["name_y"]).sum()

996

In [13]:
# Count the merged rows whose lead type is identical in both source tables
data["lead_type_x"].eq(data["lead_type_y"]).sum()

996

In [14]:
# Keep only the analysis columns: remove the row id, the join key and the duplicated
# name / lead-type columns, then give the remaining lead-type column its plain name back
drops = [
    "id",
    "name_x",
    "lead_id",
    "lead_type_x",
    "name_y",
]
data = (
    data
    .drop(columns=drops)
    .rename(columns={"lead_type_y": "lead_type"})
)

In [15]:
# Make sure everything turned out okay: only age, bought_policy, policy_amount and lead_type should remain
data.head()

Unnamed: 0,age,bought_policy,policy_amount,lead_type
0,40,0,0,B
1,41,1,403,C
2,45,1,367,A
3,47,0,0,A
4,53,0,0,B


Info we have: policy conversions, policy amounts, ages, lead types

Questions to answer: how does lead type correlate with age, conversion rate, and purchase amount?

The histograms I made didn't reveal much, and the scales of the variables differ too much for bar charts, so I decided to present a table as my figure.

In [16]:
# Generate Table
# One row per lead type. Each statistic is computed on just the column it needs, so nothing
# depends on every remaining column being numeric.
by_lead_type = data.groupby("lead_type")
# Average purchase is taken over converted leads only (bought_policy == 1), so rows with
# no purchase do not pull it towards zero.
buyers_by_lead_type = data[data["bought_policy"] == 1].groupby("lead_type")

table = pd.DataFrame({
    "Count": by_lead_type.size(),
    "Avg. Age": by_lead_type["age"].mean(),
    # bought_policy is a 0/1 flag, so its mean is the conversion share; scale to percent
    "Conversion Rate": by_lead_type["bought_policy"].mean() * 100,
    "Avg. Purchase": buyers_by_lead_type["policy_amount"].mean(),
    "Revenue": by_lead_type["policy_amount"].sum(),
})
table.index = table.index.rename("Lead Type")
table

Unnamed: 0_level_0,Count,Avg. Age,Conversion Rate,Avg. Purchase,Revenue
Lead Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,344,49.782,55.814,346.49,66526
B,330,50.755,52.424,374.0,64702
C,322,50.376,59.627,370.693,71173


From this table, we see that each lead type had a similar number of leads and a similar average age, but type C leads had the highest conversion rate, a high average purchase, and the highest total revenue. Therefore, I would recommend that the executive team focus efforts on researching type C leads to better discern why they're so profitable.