In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy.stats.mstats import winsorize # To trim extreme values

from sklearn.model_selection import train_test_split # To create train / test sets from available data
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.cluster import KMeans

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [None]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any deprecation or chained-assignment warnings
# that later cells may raise, so problems there will not be visible.
warnings.filterwarnings('ignore')

# Load Data

In [None]:
# Read the e-commerce customers CSV (path is relative to the working directory)
df = pd.read_csv('../input/ecommerce-customers/Ecommerce Customers.csv')
# Preview the first rows
df.head()

# TO-DO 1: EDA

In [None]:
# Summary statistics (count, mean, std, quartiles) as reported by DataFrame.describe
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Replace the verbose CSV headers with short aliases that are easier to type
short_names = {
    'Email': 'EMAIL',
    'Address': 'ADD',
    'Avatar': 'AVA',
    'Avg. Session Length': 'ASL',
    'Time on App': 'ToA',
    'Time on Website': 'ToW',
    'Length of Membership': 'MEM',
    'Yearly Amount Spent': 'SPENT',
}
df = df.rename(columns=short_names)
df.head()

In [None]:
# Count the missing entries in each column
df.isna().sum()

## Tổng Quan
* Có tất cả 500 quan sát và 8 biến
* Trong số 8 biến, có 3 biến categorical (Email, Address, Avatar) và 5 biến numeric
* Không có giá trị missing hay bị duplicated

## Univariate Analysis: 
* Các biến Email, Address, Avatar có thể không liên quan như các biến numeric, nhưng vẫn có thể được nghiên cứu để hiểu thêm về customers. Ví dụ, customers chủ yếu dùng email có domain là gì, customers chủ yếu sinh sống ở đâu, màu sắc thường được sử dụng trong avatar của customers, v.v.

In [None]:
# Histogram of ASL (average session length)
df['ASL'].hist()
plt.title('Distribution of Average Session Length')

In [None]:
# Histogram of ToA (time on app)
df['ToA'].hist()
plt.title('Distribution of Time on App')

In [None]:
# Histogram of ToW (time on website)
df['ToW'].hist()
plt.title('Distribution of Time on Website')

In [None]:
# Histogram of MEM (length of membership)
df['MEM'].hist()
plt.title('Distribution of Length of Membership')

In [None]:
# Histogram of SPENT (yearly amount spent)
df['SPENT'].hist()
plt.title('Distribution of Yearly Amount Spent')

## Multivariate Analysis: 
* Time on App có xu hướng tỷ lệ thuận với Time on Website nhưng không quá nhiều
* Trong khi đó, Length of Membership tỷ lệ thuận với Yearly Amount Spent và mức độ tương quan khá rõ, nghĩa là thành viên càng lâu thì càng có xu hướng tiêu dùng nhiều hơn

In [None]:
# Time on App vs Time on Website
# The plot is given a label so that plt.legend() has an entry to show; calling
# legend() when no artist carries a label produces an empty legend.
sns.regplot(data=df, x='ToA', y='ToW', label='Customers')
plt.title('Time on App vs Time on Website')
plt.legend()

In [None]:
# Length of Membership vs Yearly Amount Spent
# The plot is given a label so that plt.legend() has an entry to show; calling
# legend() when no artist carries a label produces an empty legend.
sns.regplot(data=df, x='MEM', y='SPENT', label='Customers')
plt.title('Length of Membership vs Yearly Amount Spent')
plt.legend()

# TO-DO 2: LINEAR REGRESSIONS

# Data Processing: Outliers

In [None]:
# Winsorize (clip, not drop) the extreme values of every numeric column
def treat_outliers(dataframe, limits=(0.05, 0.05)):
    """Winsorize each numeric column of ``dataframe`` in place.

    Values in the lowest and highest tails of each numeric column are replaced
    by the nearest value that is kept, so the number of rows never changes.
    Non-numeric columns are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to process. It is modified in place; pass a copy to keep the
        original values.
    limits : tuple of float, default (0.05, 0.05)
        Fractions to clip at the lower and upper end of each column, forwarded
        to ``scipy.stats.mstats.winsorize``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    # Resolve the numeric columns once instead of on every loop iteration.
    numeric_cols = dataframe.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        clipped = winsorize(dataframe[col].to_numpy(), limits=limits, inclusive=(True, True))
        # winsorize returns a masked array; store a plain ndarray in the frame.
        dataframe[col] = np.asarray(clipped)
    return dataframe

## Remove Outliers - ASL

In [None]:
# Work on a copy so the raw frame `df` keeps its original values,
# then treat the outliers of every numeric column of that copy.
df1 = treat_outliers(df.copy())

# Histogram of Average Session Length after outlier removal
df1['ASL'].hist()
plt.title('Histogram of Average Session Length after outlier removal')

## Remove Outliers - ToA

In [None]:
# Histogram of ToA (time on app) in the outlier-treated frame
df1['ToA'].hist()
plt.title('Histogram of Time on App after outlier removal')

## Remove Outliers - ToW

In [None]:
# Histogram of ToW (time on website) in the outlier-treated frame
df1['ToW'].hist()
plt.title('Histogram of Time on Website after outlier removal')

## Remove Outliers - MEM

In [None]:
# Histogram of MEM (length of membership) in the outlier-treated frame
df1['MEM'].hist()
plt.title('Histogram of Length of Membership after outlier removal')

## Remove Outliers - SPENT

In [None]:
# Histogram of SPENT (yearly amount spent) in the outlier-treated frame
df1['SPENT'].hist()
plt.title('Histogram of Yearly Amount Spent after outlier removal')

# Data Processing: y, X

In [None]:
# Predictors: all numeric columns except SPENT.
# They are selected by name rather than by position: a positional slice changes
# meaning once extra columns (such as HighVal_Cust) are appended to df1, which
# would put SPENT among the features if this cell were re-run afterwards.
x = df1[['ASL', 'ToA', 'ToW', 'MEM']]

# Target: SPENT column only
y = df1['SPENT']

In [None]:
# Create train sets & test sets
# 30% of the rows are held out for testing; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Linear Regression: y = f(X)

In [None]:
# Fit a linear regression of the target on the training features
reg = LinearRegression().fit(X_train, y_train)

In [None]:
# Get coefficients: one row per feature with its fitted weight.
# The frame is built column-wise so 'Coefficients' keeps a float dtype;
# transposing a list of rows would leave both columns as generic objects.
coef = pd.DataFrame({'Attributes': X_train.columns, 'Coefficients': reg.coef_})
coef

# Predictions: y_pred = f(X_test)

In [None]:
# Predict the target for the held-out test features
y_test_pred = reg.predict(X_test)

# Evaluation: y_test vs. y_pred

In [None]:
# scikit-learn metrics take their arguments as (y_true, y_pred).
# The order matters for R^2, which is not symmetric: swapping the arguments
# measures how well the true values "explain" the predictions instead.
lin_acc = metrics.r2_score(y_test, y_test_pred)
mse = metrics.mean_squared_error(y_test, y_test_pred)
print("R^2: ", lin_acc)
print("MAE: ", metrics.mean_absolute_error(y_test, y_test_pred))
print("MSE: ", mse)
print("RMSE: ", np.sqrt(mse))
print("Max Error: ", metrics.max_error(y_test, y_test_pred))

# Evaluation: Visualize

In [None]:
# Draw actual (blue) and predicted (red, with markers) values against the
# position of each sample in the test set.
x_ax = range(len(X_test))
plt.plot(x_ax, y_test, lw=1, color='blue', label='original')
plt.plot(x_ax, y_test_pred, lw=0.8, color='red', label='predicted', marker='o', markersize=4)
plt.legend()
plt.show()

In [None]:
# Put the predictions next to the actual values for inspection and plotting
df_plot = pd.DataFrame({'pred_SPENT': y_test_pred, 'actual_SPENT': y_test})
df_plot.head()

In [None]:
# Joint plot of predicted vs. actual spend with a regression fit (kind='reg')
sns.jointplot(x='pred_SPENT', y='actual_SPENT', data=df_plot, kind='reg')

# TO-DO 3: CUSTOMER CLUSTERING

In [None]:
# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use whichever spelling this
# installation provides and fall back to seaborn's own whitegrid style.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    sns.set_style('whitegrid')
plt.rc('figure', autolayout=True)
plt.rc(
    'axes',
    labelweight='bold',
    labelsize='large',
    titleweight='bold',
    titlesize=14,
    titlepad=10,
)

In [None]:
# Create a dataframe with only numeric columns
# Columns are chosen by name (a positional slice would also pick up any column
# appended to df1 later), and .copy() makes df_num an independent frame so that
# adding the 'Cluster' column to it does not write into a slice of df1.
df_num = df1[['ASL', 'ToA', 'ToW', 'MEM', 'SPENT']].copy()
df_num.head()

In [None]:
# Create cluster feature
# random_state and n_init are fixed so the cluster ids are the same on every
# run; without a seed the numbering changes between runs and any text that
# refers to a specific cluster number goes stale.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Fit on the feature columns only, so re-running this cell does not feed a
# previously assigned 'Cluster' column back into the model.
cluster_features = df_num.drop(columns='Cluster', errors='ignore')

# Compute cluster centers & predict cluster index for each sample
df_num['Cluster'] = kmeans.fit_predict(cluster_features)

# Treat the cluster id as a label, not a number, so plots use discrete colours
df_num['Cluster'] = df_num['Cluster'].astype('category')

df_num.head()

In [None]:
# Scatter of ASL against MEM, coloured by cluster
sns.relplot(data=df_num, x='ASL', y='MEM', hue='Cluster', height=4)

In [None]:
# Scatter of ToA against SPENT, coloured by cluster
sns.relplot(data=df_num, x='ToA', y='SPENT', hue='Cluster', height=4)

In [None]:
# Scatter of ToW against ToA, coloured by cluster
sns.relplot(data=df_num, x='ToW', y='ToA', hue='Cluster', height=4)

In [None]:
# Scatter of MEM against SPENT, coloured by cluster
sns.relplot(data=df_num, x='MEM', y='SPENT', hue='Cluster', height=4)

In [None]:
# Scatter of ToA against MEM, coloured by cluster
sns.relplot(data=df_num, x='ToA', y='MEM', hue='Cluster', height=4)

In [None]:
# Scatter of ToW against MEM, coloured by cluster
sns.relplot(data=df_num, x='ToW', y='MEM', hue='Cluster', height=4)

In [None]:
# Scatter of ASL against ToW, coloured by cluster
sns.relplot(data=df_num, x='ASL', y='ToW', hue='Cluster', height=4)

In [None]:
# Scatter of ASL against ToA, coloured by cluster
sns.relplot(data=df_num, x='ASL', y='ToA', hue='Cluster', height=4)

In [None]:
# Boxen plot of SPENT for each cluster
sns.catplot(data=df_num, x='SPENT', y='Cluster', kind='boxen', height=4)

## Takeaways:
* Cluster 1 có Yearly Amount Spent lớn nhất, cũng chính là nhóm có Length of Membership nhiều nhất. Vậy việc kéo dài thời gian giữ chân khách hàng trong các chương trình Membership có thể giúp gia tăng mức chi tiêu hàng năm và tăng doanh thu
* Ngoài ra có thể thấy, các thành viên lâu năm trong Cluster 1 cũng có thời gian dùng app tương đối dài hơn so với những Cluster khác. Do vậy, có thể tập trung phát triển thêm việc tăng tương tác và kéo dài thời gian dùng app của khách hàng.

* Mặt khác, những nhóm có mức chi tiêu thấp như Cluster 3 thường có thời gian dùng app ngắn và thời gian trong các session cũng không dài. Vì vậy có thể tìm cách để tăng thời gian dùng app lên và có nhiều ưu đãi lâu dài để khuyến khích khách hàng gắn bó.

# TO-DO 4: DECISION TREE

## Feature Engineering

In [None]:
# Create a HighVal_Cust flag: True when Yearly Amount Spent is at or above the
# 80th percentile.
# The quantile is taken on the SPENT column only; DataFrame.quantile over the
# whole frame fails on the text columns (EMAIL, ADD, AVA) in pandas >= 2.0.
spent_p80 = df1['SPENT'].quantile(.8)
df1['HighVal_Cust'] = df1['SPENT'] >= spent_p80
df1.head()

In [None]:
# Predictors: the numeric columns except SPENT, selected by name so the choice
# does not depend on how many derived columns have been appended to df1.
x1 = df1[['ASL', 'ToA', 'ToW', 'MEM']]

# Target: HighVal_Cust column only
y1 = df1['HighVal_Cust']

In [None]:
# Create train sets & test sets
# 30% of the rows are held out for testing; random_state fixes the shuffle so
# the split is the same on every run.
X1_train, X1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.3, random_state=42)

In [None]:
# Inspect the held-out labels
y1_test

In [None]:
# Import label encoders
from sklearn import preprocessing
from sklearn import utils
# from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import KBinsDiscretizer
# from sklearn.compose import ColumnTransformer

In [None]:
# Encode the boolean target as integer class labels
encoder = preprocessing.LabelEncoder()

y1_train = encoder.fit_transform(y1_train)

# Reuse the encoder fitted on the training labels so train and test share one
# mapping; y1_test stays a Series so it can still be inspected with .head().
y1_test = pd.Series(encoder.transform(y1_test), index=y1_test.index, name=y1_test.name)

# The features (ASL, ToA, ToW, MEM) are continuous and are deliberately left as
# they are. Label-encoding them replaces each value by its rank among the
# distinct values, and fitting a separate encoder on X1_test yields ranks that
# are not comparable with the training ranks, so thresholds learned on the
# training set would be applied to differently scaled test data.

In [None]:
# Peek at the first few test labels
y1_test.head()

## Decision Tree Classifier with criterion gini index

In [None]:
# import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree split on Gini impurity, capped at depth 3, with a fixed seed
clf_gini = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=0,
)

# Train on the training split (the fitted estimator is the cell's output)
clf_gini.fit(X1_train, y1_train)

In [None]:
# Predict test set results with the gini-criterion tree
y1_pred_gini = clf_gini.predict(X1_test)

In [None]:
# Test-set accuracy of the gini-criterion tree
from sklearn.metrics import accuracy_score

gini_test_acc = accuracy_score(y1_test, y1_pred_gini)
print('Model accuracy score with criterion gini index: {0:0.4f}'.format(gini_test_acc))

In [None]:
# Predict on the training split so train and test accuracy can be compared
y1_pred_train_gini = clf_gini.predict(X1_train)

In [None]:
# Training-set accuracy of the gini-criterion tree
gini_train_acc = accuracy_score(y1_train, y1_pred_train_gini)
print('Training-set accuracy score: {0:0.4f}'.format(gini_train_acc))

In [None]:
# Compare mean accuracy on both splits to look for over- or underfitting
gini_train_score = clf_gini.score(X1_train, y1_train)
gini_test_score = clf_gini.score(X1_test, y1_test)

print('Training set score: {:.4f}'.format(gini_train_score))

print('Test set score: {:.4f}'.format(gini_test_score))

In the recorded run, the training-set accuracy is 0.9114 and the test-set accuracy is 0.8533. The gap between the two is small, so there is no clear sign of overfitting.

In [None]:
# Visualize decision trees
plt.figure(figsize=(12,8))

from sklearn import tree

# Plot the classifier that an earlier cell already fitted; refitting inside the
# plotting call repeats the training and hides it in a visualisation cell.
# Feature names are passed so the split nodes show column names.
tree.plot_tree(clf_gini, feature_names=list(X1_train.columns))

In [None]:
import graphviz
# class_names must be listed in ascending order of the class labels. The target
# was encoded from the boolean HighVal_Cust flag, so class 0 is the
# non-high-value group and class 1 the high-value group.
dot_data = tree.export_graphviz(clf_gini, out_file=None, 
                              feature_names=list(X1_train.columns),  
                              class_names=['Other_Cust', 'HighVal_Cust'],  
                              filled=True, rounded=True,  
                              special_characters=True)

# Wrap the DOT source so the notebook renders it as a graph
graph = graphviz.Source(dot_data)

graph

## Decision Tree Classifier with criterion entropy

In [None]:
# Decision tree split on entropy, capped at depth 3, with a fixed seed
clf_en = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
)

# Train on the training split (the fitted estimator is the cell's output)
clf_en.fit(X1_train, y1_train)

In [None]:
# Predict the test set results with the entropy-criterion tree
y1_pred_en = clf_en.predict(X1_test)

In [None]:
# Test-set accuracy of the entropy-criterion tree
en_test_acc = accuracy_score(y1_test, y1_pred_en)
print('Model accuracy score with criterion entropy: {0:0.4f}'.format(en_test_acc))

In [None]:
# Predict on the training split so train and test accuracy can be compared
y1_pred_train_en = clf_en.predict(X1_train)

# Show the predicted training labels
y1_pred_train_en

In [None]:
# Training-set accuracy of the entropy-criterion tree
en_train_acc = accuracy_score(y1_train, y1_pred_train_en)
print('Training-set accuracy score: {0:0.4f}'.format(en_train_acc))

In [None]:
# Compare mean accuracy on both splits to look for over- or underfitting
en_train_score = clf_en.score(X1_train, y1_train)
en_test_score = clf_en.score(X1_test, y1_test)

print('Training set score: {:.4f}'.format(en_train_score))

print('Test set score: {:.4f}'.format(en_test_score))

No sign of overfitting

In [None]:
# Visualize decision tree
plt.figure(figsize=(12,8))

# Plot the classifier that an earlier cell already fitted; refitting inside the
# plotting call repeats the training and hides it in a visualisation cell.
# Feature names are passed so the split nodes show column names.
tree.plot_tree(clf_en, feature_names=list(X1_train.columns))

In [None]:
# Visualize decision-trees with graphviz
import graphviz 
# Feature names come from X1_train, the frame this classifier was trained on
# (not X_train, which belongs to the regression section).
# class_names must be listed in ascending order of the class labels. The target
# was encoded from the boolean HighVal_Cust flag, so class 0 is the
# non-high-value group and class 1 the high-value group.
dot_data = tree.export_graphviz(clf_en, out_file=None, 
                              feature_names=list(X1_train.columns),  
                              class_names=['Other_Cust', 'HighVal_Cust'],  
                              filled=True, rounded=True,  
                              special_characters=True)

# Wrap the DOT source so the notebook renders it as a graph
graph = graphviz.Source(dot_data) 

graph