In [1]:
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd # for dataframes
import matplotlib.pyplot as plt # for plotting graphs
import seaborn as sns # for plotting graphs
import datetime as dt
import numpy as np

In [3]:
from google.oauth2 import service_account #For GCP Account connection
from google.cloud import bigquery
import os

# GCP project that owns the BigQuery dataset queried by this notebook.
project_id = "digitalmarketing-a2-247505"
# Service-account key file used to authenticate. The standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so the key
# can live outside the notebook directory; the original file name is the fallback.
key_file = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    'digitalMarketing-a2-470cdee74823.json',
)
cred = service_account.Credentials.from_service_account_file(key_file)

ModuleNotFoundError: No module named 'google'

In [None]:
# Build the BigQuery client for the configured project, authenticated with the
# service-account credentials loaded earlier.
client = bigquery.Client(
    credentials=cred,
    project=project_id,
)

In [None]:
# Query text selecting every column of the Transactions table (aliased as t).
# NOTE(review): a later cell relabels the resulting columns by position, so the
# outcome depends on the table's column order — confirm against the table schema.
sql = """
SELECT t.*
FROM `digitalmarketing-a2-247505.digitalMarketing_Assignment2.Transactions` t
"""

In [None]:
# Submit the query, then materialise its result set as a pandas DataFrame.
query_job = client.query(sql)
data = query_job.to_dataframe()

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Relabel the columns positionally with the names used by the rest of the notebook.
transaction_columns = [
    'InvoiceNo',
    'ProductCode',
    'InvoiceDate',
    'CustomerID',
    'Promotion_ID',
    'UnitPrice',
    'Quantity',
    'Total_Amount',
]
data.columns = transaction_columns

In [None]:
# Preview the frame again to check the new column labels.
data.head()

In [None]:
# One row per distinct customer ID.
customer_ids = data[['CustomerID']]
filtered_data = customer_ids.drop_duplicates()

In [None]:
# Top ten countries by number of customers
#filtered_data.Country.value_counts()[:10].plot(kind='bar')

In [None]:
#uk_data=data[data.Country=='United Kingdom']
#uk_data.info()

In [None]:
#uk_data.describe()

In [None]:
# Keep only the rows with a positive quantity, then summarise what remains.
has_positive_quantity = data['Quantity'] > 0
data = data[has_positive_quantity]
data.info()

In [None]:
# Keep only the columns needed for the CLV calculations. The explicit .copy() makes
# `data` an independent frame: it is derived from a filtered slice, and the column
# assignments in later cells would otherwise trigger pandas' SettingWithCopyWarning.
data = data[['CustomerID', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'UnitPrice']].copy()

In [None]:
# Line total: units bought multiplied by the price per unit.
data['TotalPurchase'] = data['Quantity'].mul(data['UnitPrice'])

In [None]:
# Parse the invoice dates into pandas datetimes so date arithmetic works later on.
invoice_timestamps = pd.to_datetime(data['InvoiceDate'])
data['InvoiceDate'] = invoice_timestamps

In [None]:
# Per-customer summary, one row per CustomerID:
#   InvoiceDate   -> days between the customer's first and last purchase
#   InvoiceNo     -> number of distinct invoices
#   Quantity      -> total units bought
#   TotalPurchase -> total amount spent
# InvoiceNo uses 'nunique' rather than a row count: when an invoice spans several
# rows (the raw table carries a ProductCode per row), counting rows treats every
# line as its own transaction, inflating the transaction count and the repeat rate
# derived from it.
data_group = data.groupby('CustomerID').agg({
    'InvoiceDate': lambda dates: (dates.max() - dates.min()).days,
    'InvoiceNo': 'nunique',
    'Quantity': 'sum',
    'TotalPurchase': 'sum',
})

In [None]:
# Preview the per-customer aggregates.
data_group.head()

In [None]:
# Give the aggregated columns descriptive names (same order as the aggregation).
summary_names = ['num_days', 'num_transactions', 'num_units', 'spent_money']
data_group.columns = summary_names
data_group.head()

In [None]:
# Average order value: money spent per transaction, for each customer.
data_group['avg_order_value'] = data_group['spent_money'].div(data_group['num_transactions'])

In [None]:
# Preview the aggregates including the new avg_order_value column.
data_group.head()

In [None]:
# Purchase frequency: mean number of transactions per customer.
customer_count = data_group.shape[0]
purchase_frequency = data_group['num_transactions'].sum() / customer_count

In [None]:
# Repeat rate: share of customers with more than one transaction.
repeat_customers = data_group[data_group['num_transactions'] > 1]
repeat_rate = len(repeat_customers) / len(data_group)

In [None]:
# Churn rate: the complement of the repeat rate.
churn_rate = 1.0 - repeat_rate

In [None]:
# Show the three headline ratios together as a tuple.
purchase_frequency,repeat_rate,churn_rate

Let's assume our business earns approximately a 5% profit on total sales.

In [None]:
# Profit per customer, assuming a flat 5% profit on everything the customer spent.
profit_share = 0.05
data_group['profit_margin'] = profit_share * data_group['spent_money']
data_group.head()

In [None]:
# Customer value: average order value scaled by purchase frequency, divided by churn.
order_value_times_frequency = data_group['avg_order_value'] * purchase_frequency
data_group['CLV'] = order_value_times_frequency / churn_rate
# Customer lifetime value: customer value multiplied by the per-customer profit column.
data_group['cust_lifetime_value'] = data_group['CLV'].mul(data_group['profit_margin'])
data_group.head()

In [None]:
# Back to the row-level transaction frame: preview it before the monthly breakdown.
data.head()

In [None]:
# Label every row with its invoice month formatted as 'Mon-YYYY' (e.g. 'Dec-2018').
# The vectorised .dt accessor replaces a per-row Python lambda and yields a missing
# value for a missing date instead of raising.
data['month_yr'] = data['InvoiceDate'].dt.strftime('%b-%Y')

data.head()

In [None]:
# Spend matrix: one row per customer, one column per invoice month, holding the
# summed TotalPurchase; customer/month combinations with no purchases become 0.
monthly_spend = data.pivot_table(
    values='TotalPurchase',
    index=['CustomerID'],
    columns=['month_yr'],
    aggfunc='sum',
    fill_value=0,
)
sale = monthly_spend.reset_index()

sale.head()

In [None]:
# CLV here is the customer's total spend across every month column.
# The month columns are selected by name rather than by position: after
# reset_index() column 0 is CustomerID and the months start at position 1, so a
# slice beginning at position 2 leaves the first month out of the total. Dropping
# an existing 'CLV' column as well keeps the cell safe to re-run (the previous
# total is never added into the new one).
month_totals = sale.drop(columns=['CustomerID', 'CLV'], errors='ignore')
sale['CLV'] = month_totals.sum(axis=1)

sale.head()

In [None]:
# Features: spend in six individual months. Target: the CLV column (kept as a
# one-column DataFrame).
# NOTE(review): CLV is built by summing month columns, so it appears to include
# these feature months — confirm that overlap is intended.
feature_months = ['Dec-2018', 'Nov-2018', 'Oct-2018', 'Sep-2018', 'Aug-2018', 'Jul-2018']
X = sale[feature_months]
y = sale[['CLV']]

In [None]:
# Hold out part of the data for evaluation; random_state=0 fixes the shuffle seed.
split_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Linear regression of the CLV target on the monthly spend features.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
linreg = LinearRegression().fit(X_train, y_train)

# Predicted target values for the held-out rows.
y_pred = linreg.predict(X_test)

In [None]:
# Show the fitted intercept first, then the coefficients.
for fitted_parameter in (linreg.intercept_, linreg.coef_):
    print(fitted_parameter)

In [None]:
from sklearn import metrics

# R-squared of the predictions against the held-out targets.
r_squared = metrics.r2_score(y_test, y_pred)
print("R-Square:", r_squared)

In [None]:
# Error metrics on the held-out set, computed once and then reported.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print("MAE:", mae)
print("MSE", mse)
# RMSE is the square root of the MSE computed above.
print("RMSE:", np.sqrt(mse))