In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**TODO 1 EDA**

**Get the Data**

We work with the Ecommerce Customers dataset, which has 8 columns.

Three columns hold customer information as string values: Email, Address, and Avatar.

It also has numeric columns
- Avg. Session Length: Average session length in-store.
- Time on App: Average time spent on App in minutes
- Time on Website: Average time spent on Website in minutes
- Length of Membership: How many years the customer has been a member
- Yearly Amount Spent: The amount of money the customer spent by year 

In [None]:
# Load the Ecommerce Customers CSV into a DataFrame.
# NOTE(review): the relative path presumably follows a Kaggle-style "../input" layout — adjust when running elsewhere.
ecommerce_data = pd.read_csv(
    filepath_or_buffer='../input/ecommerce-customers/Ecommerce Customers.csv'
)

In [None]:
# Preview the first five rows to check that the columns loaded as expected.
ecommerce_data.head(n=5)

In [None]:
# Print each column's dtype and non-null count to check for missing data.
ecommerce_data.info()

None of the columns in the dataset has missing values.

In [None]:
# Share of rows taken by each distinct 'Avatar' value (fractions sum to 1).
ecommerce_data.loc[:, 'Avatar'].value_counts(normalize=True)

'Avatar' has 138 unique values; each of the other columns has 500 unique values.

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.
ecommerce_data.describe()

**Exploratory Data Analysis**

**Univariate Analysis:**

In [None]:
# One 20-bin histogram per numeric column, drawn on a shared 20x20 figure.
ecommerce_data.hist(
    figsize=(20, 20),
    bins=20,
    color='#86bf91',
    grid=False,
)

**Multivariate Analysis:**

Yearly Amount Spent is the most important variable, so we focus on it. We will plot some charts to examine the relationship between Yearly Amount Spent and each of the other variables.

Hypothesis
1. Avg. Session Length has correlation with Yearly Amount Spent 
2. Time on App has correlation with Yearly Amount Spent
3. Time on Website has correlation with Yearly Amount Spent
4. Length of Membership has correlation with Yearly Amount Spent



In [None]:
from numpy.random import randn
from numpy.random import seed
from scipy.stats import pearsonr

In [None]:
# Scatter of average session length against yearly spend, with marginal distributions.
sns.jointplot(
    data=ecommerce_data,
    x='Avg. Session Length',
    y='Yearly Amount Spent',
    kind="scatter",
)


In [None]:
# Pearson correlation between average session length and yearly spend.
session_length = ecommerce_data['Avg. Session Length']
yearly_spent = ecommerce_data['Yearly Amount Spent']

# pearsonr returns (coefficient, p-value); only the coefficient is reported.
corr, _p_value = pearsonr(session_length, yearly_spent)
print('Pearsons correlation: %.2f' % corr)

The Pearson correlation is 0.36 -> This suggests a weak-to-moderate positive correlation between Avg. Session Length and Yearly Amount Spent

In [None]:
# Scatter of time on website against yearly spend, with marginal distributions.
sns.jointplot(
    data=ecommerce_data,
    x='Time on Website',
    y='Yearly Amount Spent',
    kind="scatter",
)

In [None]:
# Pearson correlation between time on website and yearly spend.
website_time = ecommerce_data['Time on Website']
yearly_spent = ecommerce_data['Yearly Amount Spent']

# Three decimals are printed here because the coefficient is very close to zero.
corr, _p_value = pearsonr(website_time, yearly_spent)
print('Pearsons correlation: %.3f' % corr)

The Pearson correlation is -0.003 -> This suggests there is essentially no linear correlation between Time on Website and Yearly Amount Spent

In [None]:
# Scatter of time on app against yearly spend, with marginal distributions.
sns.jointplot(
    data=ecommerce_data,
    x='Time on App',
    y='Yearly Amount Spent',
    kind="scatter",
)

In [None]:
# Pearson correlation between time on app and yearly spend.
app_time = ecommerce_data['Time on App']
yearly_spent = ecommerce_data['Yearly Amount Spent']

# pearsonr returns (coefficient, p-value); only the coefficient is reported.
corr, _p_value = pearsonr(app_time, yearly_spent)
print('Pearsons correlation: %.2f' % corr)

The Pearson correlation is 0.5 -> This suggests a moderate positive correlation between Time on App and Yearly Amount Spent

In [None]:
# Scatter of membership length against yearly spend, with marginal distributions.
sns.jointplot(
    data=ecommerce_data,
    x='Length of Membership',
    y='Yearly Amount Spent',
    kind="scatter",
)

In [None]:
# Pearson correlation between membership length and yearly spend.
membership_years = ecommerce_data['Length of Membership']
yearly_spent = ecommerce_data['Yearly Amount Spent']

# pearsonr returns (coefficient, p-value); only the coefficient is reported.
corr, _p_value = pearsonr(membership_years, yearly_spent)
print('Pearsons correlation: %.2f' % corr)

The Pearson correlation is 0.81 -> This suggests a strong positive correlation between Length of Membership and Yearly Amount Spent

***Summary of the correlations between variables as a heatmap***

In [None]:
# Correlation heatmap over the numeric columns only.
# The frame also contains string columns (Email, Address, Avatar). pandas >= 2.0
# no longer drops non-numeric columns silently in DataFrame.corr() and raises
# instead, so the numeric columns are selected explicitly first.
numeric_columns = ecommerce_data.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True, cmap='Blues')

**Interaction between Length of Membership and Yearly Amount Spent**

In [None]:
# Scatter plus fitted regression line for membership length against yearly spend.
sns.lmplot(
    data=ecommerce_data,
    x='Length of Membership',
    y='Yearly Amount Spent',
)

**TODO 2 - LINEAR REGRESSION**

In [None]:
# Data processing: separate the four numeric predictors (x) from the target (y).
x = ecommerce_data[
    [
        'Avg. Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ]
]
y = ecommerce_data['Yearly Amount Spent']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Modelling: fit an ordinary least-squares linear regression on the training split.
reg = LinearRegression()
reg.fit(X=x_train, y=y_train)

In [None]:
# Pair each predictor name with its fitted coefficient: build a 2-row frame
# (names, coefficients), transpose it to one row per predictor, then label the columns.
coef = (
    pd.DataFrame([x_train.columns, reg.coef_])
    .transpose()
    .rename(columns={0: 'Attributes', 1: 'Coefficients'})
)
coef

**Predictions: y_pred = f(X_test)**

In [None]:
# Predict yearly spend for the held-out test rows.
ytest_pred = reg.predict(X=x_test)

In [None]:
# Evaluation on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order. MAE, MSE and max
# error are symmetric in their arguments, but R^2 is not: it normalises by the
# variance of y_true, so passing the predictions first reports a different value.
from sklearn import metrics

lin_acc = metrics.r2_score(y_test, ytest_pred)
test_mse = metrics.mean_squared_error(y_test, ytest_pred)  # computed once, reused for RMSE

print("R^2: ", lin_acc)
print("MAE: ", metrics.mean_absolute_error(y_test, ytest_pred))
print("MSE: ", test_mse)
print("RMSE: ", np.sqrt(test_mse))
print("Max Error: ", metrics.max_error(y_test, ytest_pred))

In [None]:
# R^2 = 0.98 -> the model explains about 98% of the variance in Yearly Amount Spent on the test split, i.e. it fits the observations well.

In [None]:
# Evaluation visualize: overlay actual and predicted yearly spend, indexed by
# position within the test split.
positions = range(len(x_test))
plt.plot(positions, y_test, color="red", lw=1, label="original")
plt.plot(
    positions,
    ytest_pred,
    color="blue",
    lw=0.8,
    marker="o",
    markersize=4,
    label="predicted",
)
plt.legend()
plt.show()

In [None]:
# Side-by-side table of predicted and actual yearly spend for the test rows.
ecommerce_plot = pd.DataFrame(
    data={'pred_spent': ytest_pred, 'actual_spent': y_test}
)
ecommerce_plot.head(n=5)

In [None]:
# Predicted against actual spend with a fitted regression line and marginal distributions.
sns.jointplot(
    data=ecommerce_plot,
    x='pred_spent',
    y='actual_spent',
    kind='reg',
)

**TODO 3 - CUSTOMER CLUSTERING**

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so "seaborn-whitegrid" raises there.
# Use whichever name this installation provides; fall back to the default style.
whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",
)
plt.style.use(whitegrid_style)

# Shared figure/axes defaults for the clustering plots below.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [None]:
# Cluster customers into six groups using the five numeric columns.
cluster_features = [
    'Time on App',
    'Time on Website',
    'Avg. Session Length',
    'Length of Membership',
    'Yearly Amount Spent',
]

# random_state pins the centroid initialisation so cluster labels are the same
# on every run (the interpretation below refers to clusters by number).
# n_init is set explicitly because its default differs between scikit-learn versions.
# NOTE(review): the features are clustered unscaled, so columns with a larger
# numeric range weigh more in the distance — consider standardising first.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(ecommerce_data[cluster_features])

# Stored as a categorical so the plots treat the label as discrete groups.
ecommerce_data["Cluster"] = pd.Series(
    cluster_labels, index=ecommerce_data.index
).astype("category")

ecommerce_data.head()

In [None]:
# Statistic: print the describe() summary of each cluster's rows, in order of
# first appearance of the cluster label.
cluster_ids = ecommerce_data['Cluster'].unique()
for cluster_id in cluster_ids:
    members = ecommerce_data.loc[ecommerce_data['Cluster'] == cluster_id]
    print('Cluster', cluster_id, members.describe())

In [None]:
# Distribution: per-cluster density of each behavioural feature on a 2x2 grid.
kde_common = dict(data=ecommerce_data, hue='Cluster')
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

# Left column: app and website time; right column: membership and session length.
sns.kdeplot(x='Time on App', ax=axs[0, 0], **kde_common)
sns.kdeplot(x='Time on Website', ax=axs[1, 0], **kde_common)
sns.kdeplot(x='Length of Membership', ax=axs[0, 1], **kde_common)
sns.kdeplot(x='Avg. Session Length', ax=axs[1, 1], **kde_common)

In [None]:
# Visualize Plot: scatter plots coloured by cluster, one figure per variable pair.
rel_common = dict(data=ecommerce_data, hue='Cluster')

# Each feature against yearly spend.
sns.relplot(x='Length of Membership', y='Yearly Amount Spent', **rel_common)
sns.relplot(x='Time on App', y='Yearly Amount Spent', **rel_common)
sns.relplot(x='Time on Website', y='Yearly Amount Spent', **rel_common)
sns.relplot(x='Avg. Session Length', y='Yearly Amount Spent', **rel_common)

# App time against membership length.
sns.relplot(x='Time on App', y='Length of Membership', **rel_common)

In [None]:
# Mean of every numeric column per cluster, shown in ascending order of spend.
# numeric_only=True is required because the frame still holds string columns
# (Email, Address, Avatar); pandas >= 2.0 raises when asked to average them.
# observed=True makes the handling of the categorical group key explicit.
Average = (
    ecommerce_data
    .groupby('Cluster', observed=True)
    .mean(numeric_only=True)
    .reset_index()
)
Average.sort_values(by='Yearly Amount Spent')

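From the table above, we can describe the clusters as follows (KMeans assigns label numbers arbitrarily, so they can change if the clustering is re-run with a different initialization):
- Cluster 2: New customers (< 2 years)
- Clusters 4, 1, 3, 5: Long-term customers (2-5 years)
- Cluster 0: Loyal customers (> 5 years)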

From the charts, we can see that
- The longer customers have been with the brand (longer length of membership), the more money they spend -> Focus more on the long-term and loyal customers to retain the relationship and push sales
- The long-term and loyal customers spend more time on the app -> The brand can consider developing more campaigns on the app