In [1]:
# Import pandas 
import pandas as pd

# Read the in-app purchase table and the customer table from their CSV files
app_purchases = pd.read_csv('inapp_purchases.csv')
customer_data = pd.read_csv('customer_data.csv')

# List each table's column labels to spot the fields the two tables share
print('Customer Data:', customer_data.columns)
print('Purchase Data:', app_purchases.columns)

Customer Data: Index(['uid', 'reg_date', 'device', 'gender', 'country', 'age'], dtype='object')
Purchase Data: Index(['date', 'uid', 'sku', 'price'], dtype='object')


In [2]:
# Show each column's dtype and non-null count for the customer table
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   uid       10000 non-null  float64
 1   reg_date  10000 non-null  object 
 2   device    10000 non-null  object 
 3   gender    10000 non-null  object 
 4   country   10000 non-null  object 
 5   age       10000 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 468.9+ KB


In [3]:
# Show each column's dtype and non-null count for the purchase table
app_purchases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9006 entries, 0 to 9005
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    9006 non-null   object
 1   uid     9006 non-null   int64 
 2   sku     9006 non-null   object
 3   price   9006 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 281.6+ KB


In [4]:
# Preview the first rows of the customer table (head() defaults to five rows)
customer_data.head()

Unnamed: 0,uid,reg_date,device,gender,country,age
0,54030035.0,2017-06-29T00:00:00Z,and,M,USA,19
1,72574201.0,2018-03-05T00:00:00Z,iOS,F,TUR,22
2,64187558.0,2016-02-07T00:00:00Z,iOS,M,USA,16
3,92513925.0,2017-05-25T00:00:00Z,and,M,BRA,41
4,99231338.0,2017-03-26T00:00:00Z,iOS,M,FRA,59


In [5]:
# Preview the first rows of the purchase table (head() defaults to five rows)
app_purchases.head()

Unnamed: 0,date,uid,sku,price
0,2017-07-10,41195147,sku_three_499,499
1,2017-07-15,41195147,sku_three_499,499
2,2017-11-12,41195147,sku_four_599,599
3,2017-09-26,91591874,sku_two_299,299
4,2017-12-01,91591874,sku_four_599,599


Both customer_data and app_purchases have a common 'uid' column that we can use to combine them. Each table also has a date column: 'date' (the purchase date) in app_purchases and 'reg_date' (the registration date) in customer_data.

We will merge first on 'uid' alone and then on both 'uid' and the date column, and look at how this changes the final results.



In [6]:
# Give the registration-date column the same label ('date') that app_purchases uses;
# rename returns a new frame, so customer_data itself keeps its 'reg_date' column
customer_data_mod = customer_data.rename({"reg_date": "date"}, axis="columns")
customer_data_mod.columns

Index(['uid', 'date', 'device', 'gender', 'country', 'age'], dtype='object')

In [7]:
# Parse the timestamp strings, then render them back as 'YYYY-MM-DD' strings
# so they take the same form as the date strings in app_purchases
customer_data_mod["date"] = (
    pd.to_datetime(customer_data_mod["date"])
    .dt.strftime('%Y-%m-%d')
)

In [8]:
# Preview the first five rows to check the 'date' column
customer_data_mod.head(5)

Unnamed: 0,uid,date,device,gender,country,age
0,54030035.0,2017-06-29,and,M,USA,19
1,72574201.0,2018-03-05,iOS,F,TUR,22
2,64187558.0,2016-02-07,iOS,M,USA,16
3,92513925.0,2017-05-25,and,M,BRA,41
4,99231338.0,2017-03-26,iOS,M,FRA,59


In [9]:
# Inner-join every purchase to its customer record via the shared 'uid' key.
# Both frames carry a 'date' column that is not a join key here, so pandas
# disambiguates them as 'date_x' (from app_purchases) and 'date_y' (from customer_data_mod).
customer_purchase_data = pd.merge(app_purchases, customer_data_mod, how='inner', on='uid')

# Peek at the first rows of the joined table
print(customer_purchase_data.head())

       date_x       uid            sku  price      date_y device gender  \
0  2017-07-10  41195147  sku_three_499    499  2017-06-26    and      M   
1  2017-07-15  41195147  sku_three_499    499  2017-06-26    and      M   
2  2017-11-12  41195147   sku_four_599    599  2017-06-26    and      M   
3  2017-09-26  91591874    sku_two_299    299  2017-01-05    and      M   
4  2017-12-01  91591874   sku_four_599    599  2017-01-05    and      M   

  country  age  
0     BRA   17  
1     BRA   17  
2     BRA   17  
3     TUR   17  
4     TUR   17  


In [10]:
# Number of rows produced by the 'uid'-only merge
print(customer_purchase_data.shape[0])

9006


To look at purchases that happened on the date of registration, merge customer_data to app_purchases on 'uid' and 'date'.

In [11]:
# Inner-join on both keys: a purchase row is kept only when its 'uid' and its
# 'date' both equal those of a row in customer_data_mod
purchases_on_registration = pd.merge(
    app_purchases,
    customer_data_mod,
    how='inner',
    on=['uid', 'date'],
)

# Peek at the first rows of the joined table
print(purchases_on_registration.head())

         date       uid             sku  price device gender country  age
0  2016-03-30  94055095    sku_four_599    599    iOS      F     BRA   16
1  2015-10-28  69627745     sku_one_199    199    and      F     BRA   18
2  2017-02-02  11604973  sku_seven_1499    499    and      F     USA   16
3  2016-06-05  22495315    sku_four_599    599    and      F     USA   19
4  2018-02-17  51365662     sku_two_299    299    iOS      M     TUR   16


In [12]:
# Number of rows produced by the merge on 'uid' and 'date'
print(purchases_on_registration.shape[0])

35


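Note that the second merge returned far fewer rows than the first: 35 compared to 9006. This is because an inner merge on both 'uid' and 'date' keeps only the purchases made on the customer's registration date, so far fewer rows match.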

Exploring the in-app purchase data

In [13]:
# Average of the 'price' column across all rows of the merged purchase table
purchase_price_mean = customer_purchase_data['price'].mean()

# Display the result
print(purchase_price_mean)

406.77259604707973


In [14]:
# Mean and median of 'price', returned together as a Series indexed by statistic name
purchase_price_summary = customer_purchase_data['price'].aggregate(['mean', 'median'])

# Display the result
print(purchase_price_summary)

mean      406.772596
median    299.000000
Name: price, dtype: float64


In [15]:
# Mean and median for both 'price' and 'age' (rows: statistic, columns: field).
# NOTE(review): the merged table has one row per purchase, so 'age' here is
# averaged over purchase rows rather than over distinct customers — confirm that is intended.
purchase_summary = customer_purchase_data[['price', 'age']].agg(['mean', 'median'])

# Display the result
print(purchase_summary)

             price        age
mean    406.772596  23.922274
median  299.000000  21.000000


Summary Statistics about the purchase data broken out by 'device' (Android or iOS) and 'gender' (Male or Female).

In [16]:
# Split the merged purchases into (device, gender) groups
grouped_purchase_data = customer_purchase_data.groupby(['device', 'gender'])

# Per-group mean, median and standard deviation of 'price'
price_statistics = ['mean', 'median', 'std']
purchase_summary = grouped_purchase_data.aggregate({'price': price_statistics})

# Display the result
print(purchase_summary)

                    price                   
                     mean median         std
device gender                               
and    F       400.747504  299.0  179.984378
       M       416.237308  499.0  195.001520
iOS    F       404.435330  299.0  181.524952
       M       405.272401  299.0  196.843197


These values offer valuable insights into customer behavior for conversion rate optimization.